hs-src/Language/Scheme/Parser.hs

{-# LANGUAGE CPP #-}
{-# LANGUAGE RankNTypes #-}
{- |
Module      : Language.Scheme.Parser
Copyright   : Justin Ethier
Licence     : MIT (see LICENSE in the distribution)

Maintainer  : github.com/justinethier
Stability   : experimental
Portability : portable

This module implements parsing of Scheme code.
-}

module Language.Scheme.Parser 
    (
      lispDef
    -- *Higher level parsing
    , mainParser
    , readOrThrow
    , readExpr
    , readExprList 
    -- *Low level parsing
    , parseExpr 
    , parseAtom
    , parseBool
    , parseChar
    , parseOctalNumber 
    , parseBinaryNumber
    , parseHexNumber
    , parseDecimalNumber
    , parseNumber 
    , parseRealNumber
    , parseRationalNumber 
    , parseComplexNumber 
    , parseEscapedChar 
    , parseString 
    , parseVector
    , parseByteVector
    , parseHashTable
    , parseList
    , parseDottedList
    , parseQuoted
    , parseQuasiQuoted 
    , parseUnquoted 
    , parseUnquoteSpliced 
    ) where
import Language.Scheme.Types
import Control.Monad.Error
import Data.Array
import qualified Data.ByteString as BS
import qualified Data.Char as DC
import Data.Complex
import qualified Data.Map
import Data.Ratio
import Data.Word
import Numeric
import Text.ParserCombinators.Parsec hiding (spaces)
import Text.Parsec.Language
import qualified Text.Parsec.Token as P
#if __GLASGOW_HASKELL__ >= 702
import Data.Functor.Identity (Identity)
import Text.Parsec.Prim (ParsecT)
#endif

-- This was added by pull request #63 as part of a series of fixes
-- to get husk to build on ghc 7.2.2
--
-- For now this has been removed to allow husk to support the older
-- GHC 6.x.x series.
--
--import Data.Functor.Identity (Identity)

-- |Language definition for Scheme
lispDef :: LanguageDef ()
lispDef 
  = emptyDef    
  { P.commentStart   = "#|"
  , P.commentEnd     = "|#"
  , P.commentLine    = ";"
  , P.nestedComments = True
  , P.identStart     = letter <|> symbol
  , P.identLetter    = letter <|> digit <|> symbol
  , P.reservedNames  = []
  , P.caseSensitive  = True
  } 

#if __GLASGOW_HASKELL__ >= 702
lexer :: P.GenTokenParser String () Data.Functor.Identity.Identity
#endif
lexer = P.makeTokenParser lispDef

#if __GLASGOW_HASKELL__ >= 702
dot :: ParsecT String () Identity String
#endif
dot = P.dot lexer

#if __GLASGOW_HASKELL__ >= 702
parens :: ParsecT String () Identity a -> ParsecT String () Identity a
#endif
parens = P.parens lexer

#if __GLASGOW_HASKELL__ >= 702
brackets :: ParsecT String () Identity a -> ParsecT String () Identity a
#endif
brackets = P.brackets lexer

#if __GLASGOW_HASKELL__ >= 702
identifier :: ParsecT String () Identity String
#endif
identifier = P.identifier lexer

#if __GLASGOW_HASKELL__ >= 702
whiteSpace :: ParsecT String () Identity ()
#endif
whiteSpace = P.whiteSpace lexer

#if __GLASGOW_HASKELL__ >= 702
lexeme :: ParsecT String () Identity a -> ParsecT String () Identity a
#endif
lexeme = P.lexeme lexer

-- |Match a special character
symbol :: Parser Char
symbol = oneOf "!$%&|*+-/:<=>?@^_~."

-- |Parse an atom (scheme symbol)
parseAtom :: Parser LispVal
parseAtom = do
  atom <- identifier
  if atom == "."
     then pzero -- Do not match this form
     else return $ Atom atom

-- |Parse a boolean
parseBool :: Parser LispVal
parseBool = do _ <- string "#"
               x <- oneOf "tf"
               return $ case x of
                          't' -> Bool True
                          'f' -> Bool False
                          _ -> Bool False

-- |Parse a character
parseChar :: Parser LispVal
parseChar = do
  _ <- try (string "#\\")
  c <- anyChar
  r <- many (letter <|> digit)
  let pchr = c : r
  case pchr of
    "space"     -> return $ Char ' '
    "newline"   -> return $ Char '\n'
    "alarm"     -> return $ Char '\a' 
    "backspace" -> return $ Char '\b' 
    "delete"    -> return $ Char '\DEL'
    "escape"    -> return $ Char '\ESC' 
    "null"      -> return $ Char '\0' 
    "return"    -> return $ Char '\n' 
    "tab"       -> return $ Char '\t'
    _ -> case (c : r) of
        [ch] -> return $ Char ch
        ('x' : hexs) -> do
            rv <- parseHexScalar hexs
            return $ Char rv
        _ -> pzero

-- |Parse an integer in octal notation, base 8
parseOctalNumber :: Parser LispVal
parseOctalNumber = do
  _ <- try (string "#o")
  sign <- many (oneOf "-")
  num <- many1 (oneOf "01234567")
  case (length sign) of
     0 -> return $ Number $ fst $ head (Numeric.readOct num)
     1 -> return $ Number $ fromInteger $ (*) (-1) $ fst $ head (Numeric.readOct num)
     _ -> pzero

-- |Parse an integer in binary notation, base 2
parseBinaryNumber :: Parser LispVal
parseBinaryNumber = do
  _ <- try (string "#b")
  sign <- many (oneOf "-")
  num <- many1 (oneOf "01")
  case (length sign) of
     0 -> return $ Number $ fst $ head (Numeric.readInt 2 (`elem` "01") DC.digitToInt num)
     1 -> return $ Number $ fromInteger $ (*) (-1) $ fst $ head (Numeric.readInt 2 (`elem` "01") DC.digitToInt num)
     _ -> pzero

-- |Parse an integer in hexadecimal notation, base 16
parseHexNumber :: Parser LispVal
parseHexNumber = do
  _ <- try (string "#x")
  sign <- many (oneOf "-")
  num <- many1 (digit <|> oneOf "abcdefABCDEF")
  case (length sign) of
     0 -> return $ Number $ fst $ head (Numeric.readHex num)
     1 -> return $ Number $ fromInteger $ (*) (-1) $ fst $ head (Numeric.readHex num)
     _ -> pzero

-- |Parser for Integer, base 10
parseDecimalNumber :: Parser LispVal
parseDecimalNumber = do
  _ <- try (many (string "#d"))
  sign <- many (oneOf "-")
  num <- many1 digit
  if (length sign) > 1
     then pzero
     else return $ (Number . read) $ sign ++ num

-- |Parser for a base 10 Integer that will also
--  check to see if the number is followed by
--  an exponent (scientific notation). If so,
--  the integer is converted to a float of the
--  given magnitude.
parseDecimalNumberMaybeExponent :: Parser LispVal
parseDecimalNumberMaybeExponent = do
  num <- parseDecimalNumber
  parseNumberExponent num

-- |Parse an integer in any base
parseNumber :: Parser LispVal
parseNumber = parseDecimalNumberMaybeExponent <|>
              parseHexNumber <|>
              parseBinaryNumber <|>
              parseOctalNumber <?>
              "Unable to parse number"

-- |Parse a floating point number
parseRealNumber :: Parser LispVal
parseRealNumber = do
  sign <- many (oneOf "-+")
  num <- many digit
  _ <- char '.'
  frac <- many1 digit
  let dec = if not (null num)
               then num ++ "." ++ frac
               else "0." ++ frac
  f <- case (length sign) of
     0 -> return $ Float $ fst $ head (Numeric.readFloat dec)
          -- Bit of a hack, but need to support the + sign as well as the minus.
     1 -> if sign == "-" 
             then return $ Float $ (*) (-1.0) $ fst $ head (Numeric.readFloat dec)
             else return $ Float $ fst $ head (Numeric.readFloat dec)
     _ -> pzero
  parseNumberExponent f

-- | Parse the exponent section of a floating point number
--   in scientific notation. Eg "e10" from "1.0e10"
parseNumberExponent :: LispVal -> Parser LispVal
parseNumberExponent n = do 
  expnt <- many $ oneOf "Ee"
  case (length expnt) of
    0 -> return n
    1 -> do
      num <- try parseDecimalNumber
      case num of
        Number nexp -> buildResult n nexp
        _ -> pzero
    _ -> pzero
 where 
  buildResult (Number num) nexp = return $ Float $ (fromIntegral num) * (10 ** (fromIntegral nexp))
  buildResult (Float num) nexp = return $ Float $ num * (10 ** (fromIntegral nexp))
  buildResult _ _ = pzero

-- |Parse a rational number
parseRationalNumber :: Parser LispVal
parseRationalNumber = do
  pnumerator <- parseDecimalNumber
  case pnumerator of
    Number n -> do
      _ <- char '/'
      sign <- many (oneOf "-")
      num <- many1 digit
      if (length sign) > 1
         then pzero
         else do
             let pdenominator = read $ sign ++ num
             if pdenominator == 0
                then return $ Number 0 -- TODO: Prevents a div-by-zero error, but not really correct either
                else return $ Rational $ n % pdenominator
    _ -> pzero

-- |Parse a complex number
parseComplexNumber :: Parser LispVal
parseComplexNumber = do
  lispreal <- (try parseRealNumber <|> try parseRationalNumber <|> parseDecimalNumber)
  let real = case lispreal of
                  Number n -> fromInteger n
                  Rational r -> fromRational r
                  Float f -> f
                  _ -> 0
  _ <- char '+'
  lispimag <- (try parseRealNumber <|> try parseRationalNumber <|> parseDecimalNumber)
  let imag = case lispimag of
                  Number n -> fromInteger n
                  Rational r -> fromRational r
                  Float f -> f
                  _ -> 0 -- Case should never be reached
  _ <- char 'i'
  return $ Complex $ real :+ imag

-- |Parse an escaped character
parseEscapedChar :: forall st .
                    GenParser Char st Char
parseEscapedChar = do
  _ <- char '\\'
  c <- anyChar
  case c of
    'a' -> return '\a'
    'b' -> return '\b'
    'n' -> return '\n'
    't' -> return '\t'
    'r' -> return '\r'
    'x' -> do
        num <- many $ letter <|> digit
        _ <- char ';'
        parseHexScalar num
    _ -> return c

-- |Parse a hexidecimal scalar
parseHexScalar :: Monad m => String -> m Char
parseHexScalar num = do
    let ns = Numeric.readHex num
    case ns of
        [] -> fail $ "Unable to parse hex value " ++ show num
        _ -> return $ DC.chr $ fst $ head ns

-- |Parse a string
parseString :: Parser LispVal
parseString = do
  _ <- char '"'
  x <- many (parseEscapedChar <|> noneOf "\"")
  _ <- char '"'
  return $ String x

-- |Parse a vector
parseVector :: Parser LispVal
parseVector = do
  vals <- sepBy parseExpr whiteSpace
  return $ Vector (listArray (0, (length vals - 1)) vals)

-- |Parse a bytevector
parseByteVector :: Parser LispVal
parseByteVector = do
  ns <- sepBy parseNumber whiteSpace
  return $ ByteVector $ BS.pack $ map conv ns
 where 
   conv (Number n) = fromInteger n :: Word8
   conv _ = 0 :: Word8

-- |Parse a hash table. The table is either empty or is made up of
--  an alist (associative list)
parseHashTable :: Parser LispVal
parseHashTable = do
  -- This function uses explicit recursion to loop over the parsed list:
  -- As long as it is an alist, the members are appended to an accumulator
  -- so they can be added to the hash table. However, if the input list is
  -- determined not to be an alist, Nothing is returned, letting the parser
  -- know that a valid hashtable was not read.
  let f :: [(LispVal, LispVal)] -> [LispVal] -> Maybe [(LispVal, LispVal)]
      f acc [] = Just acc
      f acc (List [a, b] :ls) = f (acc ++ [(a, b)]) ls
      f acc (DottedList [a] b :ls) = f (acc ++ [(a, b)]) ls
      f _ (_:_) = Nothing
  vals <- sepBy parseExpr whiteSpace
  let mvals = f [] vals
  case mvals of
    Just m -> return $ HashTable $ Data.Map.fromList m
    Nothing -> pzero

-- |Parse a list
parseList :: Parser LispVal
parseList = liftM List $ sepBy parseExpr whiteSpace
-- TODO: wanted to use endBy (or a variant) above, but it causes an error such that dotted lists are not parsed

-- |Parse a dotted list (scheme pair)
parseDottedList :: Parser LispVal
parseDottedList = do
  phead <- endBy parseExpr whiteSpace
  case phead of
    [] -> pzero -- car is required; no match   
    _ -> do
      ptail <- dot >> parseExpr
      case ptail of
        DottedList ls l -> return $ DottedList (phead ++ ls) l 
        -- Issue #41
        -- Improper lists are tricky because if an improper list ends in a 
        -- proper list, then it becomes proper as well. The following cases 
        -- handle that, as well as preserving necessary functionality when 
        -- appropriate, such as for unquoting.
        --
        -- FUTURE: I am not sure if this is complete, in fact the "unquote" 
        -- seems like it could either be incorrect or one special case among 
        -- others. Anyway, for the 3.3 release this is good enough to pass all
        -- test cases. It will be revisited later if necessary.
        --
        List (Atom "unquote" : _) -> return $ DottedList phead ptail 
        List ls -> return $ List $ phead ++ ls
        {- Regarding above, see 
           http://community.schemewiki.org/?scheme-faq-language#dottedapp
         
           Note, however, that most Schemes expand literal lists occurring in 
           function applications, e.g. (foo bar . (1 2 3)) is expanded into 
           (foo bar 1 2 3) by the reader. It is not entirely clear whether this 
           is a consequence of the standard - the notation is not part of the 
           R5RS grammar but there is strong evidence to suggest a Scheme 
           implementation cannot comply with all of R5RS without performing this
           transformation. -}
        _ -> return $ DottedList phead ptail

-- |Parse a quoted expression
parseQuoted :: Parser LispVal
parseQuoted = do
  _ <- lexeme $ char '\''
  x <- parseExpr
  return $ List [Atom "quote", x]

-- |Parse a quasi-quoted expression
parseQuasiQuoted :: Parser LispVal
parseQuasiQuoted = do
  _ <- lexeme $ char '`'
  x <- parseExpr
  return $ List [Atom "quasiquote", x]

-- |Parse an unquoted expression (a quasiquotated expression preceded
--  by a comma)
parseUnquoted :: Parser LispVal
parseUnquoted = do
  _ <- try (lexeme $ char ',')
  x <- parseExpr
  return $ List [Atom "unquote", x]

-- |Parse an unquote-spliced expression
parseUnquoteSpliced :: Parser LispVal
parseUnquoteSpliced = do
  _ <- try (lexeme $ string ",@")
  x <- parseExpr
  return $ List [Atom "unquote-splicing", x]

-- FUTURE: should be able to use the grammar from R5RS
-- to make parsing more efficient (mostly by minimizing
-- or eliminating the number of try's below)

-- |Parse an expression
parseExpr :: Parser LispVal
parseExpr =
      try (lexeme parseComplexNumber)
  <|> try (lexeme parseRationalNumber)
  <|> try (lexeme parseRealNumber)
  <|> try (lexeme parseNumber)
  <|> lexeme parseChar
  <|> parseUnquoteSpliced
  <|> do _ <- try (lexeme $ string "#(")
         x <- parseVector
         _ <- lexeme $ char ')'
         return x
  <|> do _ <- try (lexeme $ string "#u8(")
         x <- parseByteVector
         _ <- lexeme $ char ')'
         return x
--  <|> do _ <- try (lexeme $ string "#hash(")
--         x <- parseHashTable
--         _ <- lexeme $ char ')'
--         return x
  <|> try parseAtom
  <|> lexeme parseString
  <|> lexeme parseBool
  <|> parseQuoted
  <|> parseQuasiQuoted
  <|> parseUnquoted
  <|> try (parens parseList)
  <|> parens parseDottedList
  <|> try (brackets parseList)
  <|> brackets parseDottedList
  <?> "Expression"

-- |Initial parser used by the high-level parse functions
mainParser :: Parser LispVal
mainParser = do
    _ <- whiteSpace
    parseExpr

-- |Use a parser to parse the given text, throwing an error
--  if there is a problem parsing the text.
readOrThrow :: Parser a -> String -> ThrowsError a
readOrThrow parser input = case parse parser "lisp" input of
  Left err -> throwError $ Parser err
  Right val -> return val

-- |Parse an expression from a string of text
readExpr :: String -> ThrowsError LispVal
readExpr = readOrThrow mainParser

-- |Parse many expressions from a string of text
readExprList :: String -> ThrowsError [LispVal]
readExprList = readOrThrow (endBy mainParser whiteSpace)