-- | -- Module : Data.SExpresso.Language.SchemeR5RS -- Copyright : © 2019 Vincent Archambault -- License : 0BSD -- -- Maintainer : Vincent Archambault -- Stability : experimental -- -- Module for parsing the Scheme R5RS language. -- -- Scheme R5RS s-expressions are parsed as @'SExpr' 'SExprType' -- 'SchemeToken'@. Such s-expressions can be converted into a Scheme -- R5RS datum (see 'Datum') by the function 'sexpr2Datum'. {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE ScopedTypeVariables #-} -- Parsing library for some parts of the Scheme R5RS language -- as defined in section 7 of the report -- The library does parse tab and \r\n and whitespace module Data.SExpresso.Language.SchemeR5RS ( -- * SchemeToken and Datum related data types and functions SExprType(..), SchemeToken(..), tokenParser, sexpr, Datum(..), sexpr2Datum, -- * Scheme R5RS whitespace parsers whitespace, comment, interTokenSpace, interTokenSpace1, -- * Individual parser for each of the constructors of SchemeToken identifier, boolean, character, stringParser, quote, quasiquote, comma, commaAt, dot, -- ** Scheme Number -- -- | Scheme R5RS numbers are quite exotic. They can have exactness -- prefix, radix prefix and the pound sign (#) can replace a -- digit. On top of that, you can define integer, rational, decimal -- and complex numbers of arbitrary precision. Decimal numbers can -- also have a suffix indicating the machine precision. -- -- Since Haskell does not have native types to express this -- complexity, this module defines the 'SchemeNumber' data type to -- encode the parsed number. User of this module can then convert a -- 'SchemeNumber' object to a more appropriate data type according -- to their needs. SchemeNumber(..), Exactness(..), Complex(..), SReal(..), Sign(..), UInteger(..), Pounds, Precision(..), Suffix(..), number, ) where import Control.Monad (mzero) import Data.Maybe import Data.Proxy import Data.List import qualified Data.Char as C import qualified Data.Text as T import qualified Data.Text.Lazy as L import qualified Data.Text.Lazy.Builder as B import Data.Foldable import Text.Megaparsec import Text.Megaparsec.Char import qualified Text.Megaparsec.Char.Lexer as ML import Data.SExpresso.SExpr import Data.SExpresso.Parse -- | The 'SchemeToken' data type defines the atoms of an Scheme R5RS -- s-expression. An @'SExpr' 'SExprType' 'SchemeToken'@ object -- containning the atoms 'TQuote', 'TQuasiquote', 'TComma', 'TCommaAt' -- and 'TDot' need futher processing in order to get what the R5RS -- report calls a datum. See also 'Datum'. data SchemeToken = -- | A boolean. TBoolean Bool -- | A number. See 'SchemeNumber'. | TNumber SchemeNumber -- | A unicode character. | TChar Char -- | A string. | TString T.Text -- | A valid R5RS identifier. | TIdentifier T.Text -- | The quote (') symbol. | TQuote -- | The quasiquote (`) symbol. | TQuasiquote -- | The comma (,) symbol. | TComma -- | The comma at (,\@) symbol. | TCommaAt -- | The dot (.) symbol. | TDot deriving (Eq, Show) -- | The 'tokenParser' parses a 'SchemeToken' tokenParser :: (MonadParsec e s m, Token s ~ Char) => m SchemeToken tokenParser = (boolean >>= return . TBoolean) <|> -- character must come before number (character >>= return . TChar) <|> (stringParser >>= return . TString) <|> -- We must try number because it can conflict with -- the dot ex : .2 and (a . b) -- and identifier ex : - and -1 (try number >>= return . TNumber) <|> (identifier >>= return . TIdentifier) <|> (quote >> return TQuote) <|> (quasiquote >> return TQuasiquote) <|> -- commaAt must come before comma (commaAt >> return TCommaAt) <|> (comma >> return TComma) <|> (dot >> return TDot) spacingRule :: SchemeToken -> SpacingRule spacingRule (TString _) = SOptional spacingRule TQuote = SOptional spacingRule TQuasiquote = SOptional spacingRule TComma = SOptional spacingRule TCommaAt = SOptional spacingRule _ = SMandatory -- | Scheme R5RS defines two types of s-expressions. Standard list -- beginning with '(' and vector beginning with '#('. The 'SExprType' -- data type indicates which one was parsed. data SExprType = -- | A standard list STList -- | A vector | STVector deriving (Eq, Show) -- | The 'sexpr' defines a 'SExprParser' to parse a Scheme R5RS -- s-expression as an @'SExpr' 'SExprType' 'SchemeToken'@. If you also -- want source position see the 'withLocation' function. -- -- Space is optional before and after the following tokens: -- -- * 'TString' -- * 'TQuote' -- * 'TQuasiquote' -- * 'TComma' -- * 'TCommaAt' sexpr :: forall e s m . (MonadParsec e s m, Token s ~ Char) => SExprParser m SExprType SchemeToken sexpr = let sTag = (single '(' >> return STList) <|> (chunk (tokensToChunk (Proxy :: Proxy s) "#(") >> return STVector) eTag = \t -> single ')' >> return t in SExprParser sTag eTag tokenParser interTokenSpace1 (mkSpacingRule spacingRule) -- | The 'Datum' data type implements the Scheme R5RS definition of a Datum. See also 'sexpr2Datum'. data Datum = DBoolean Bool | DNumber SchemeNumber | DChar Char | DString T.Text | DIdentifier T.Text | DList [Datum] | DDotList [Datum] Datum | DQuote Datum | DQuasiquote Datum | DComma Datum | DCommaAt Datum | DVector [Datum] deriving (Eq, Show) -- | The 'sexpr2Datum' function takes a list of 'SchemeToken' and -- returns a list of 'Datum'. In case of failure it will report an -- error, hence the 'Either' data type in the signature. -- -- As defined in the Scheme R5RS report, the 'TQuote', 'TQuasiquote', -- 'TComma', 'TCommaAt' and 'TDot' tokens must be followed by another -- token. sexpr2Datum :: [SExpr SExprType SchemeToken] -> Either String [Datum] sexpr2Datum = foldrM vectorFold [] where vectorFold :: SExpr SExprType SchemeToken -> [Datum] -> Either String [Datum] vectorFold (SAtom TQuote) [] = Left $ "Expecting a datum after a quote" vectorFold (SAtom TQuote) (x : xs) = pure $ DQuote x : xs vectorFold (SAtom TQuasiquote) [] = Left $ "Expecting a datum after a quasiquote" vectorFold (SAtom TQuasiquote) (x : xs) = pure $ DQuasiquote x : xs vectorFold (SAtom TComma) [] = Left $ "Expecting a datum after a comma" vectorFold (SAtom TComma) (x : xs) = pure $ DComma x : xs vectorFold (SAtom TCommaAt) [] = Left $ "Expecting a datum after a commaAt" vectorFold (SAtom TCommaAt) (x : xs) = pure $ DCommaAt x : xs vectorFold (SAtom TDot) _ = Left "Unexpected dot" vectorFold (SList STVector xs) acc = ((:) . DVector) <$> sexpr2Datum xs <*> pure acc vectorFold (SList STList xs) acc = let chooseConstructor (isDotList, ls) = (:) (if isDotList then DDotList (init ls) (last ls) else DList ls) in chooseConstructor <$> (foldrM listFold (False, []) xs) <*> pure acc vectorFold (SAtom x) acc = pure $ simpleToken x : acc simpleToken :: SchemeToken -> Datum simpleToken (TBoolean x) = DBoolean x simpleToken (TNumber x) = DNumber x simpleToken (TChar x) = DChar x simpleToken (TString x) = DString x simpleToken (TIdentifier x) = DIdentifier x simpleToken _ = error "simpleToken only handles a subset of SchemeToken constructors" listFold :: SExpr SExprType SchemeToken -> (Bool, [Datum]) -> Either String (Bool, [Datum]) listFold (SAtom TDot) (_, [x]) = pure (True, [x]) listFold x (d, acc) = (,) d <$> vectorFold x acc ------------------------- Whitespace and comments ------------------------- -- | The 'whitespace' parser parses one space, tab or end of line (\\n and \\r\\n). whitespace :: (MonadParsec e s m, Token s ~ Char) => m () whitespace = (char ' ' >> return ()) <|> (char '\t' >> return ()) <|> (eol >> return ()) -- | The 'comment' parser parses a semi-colon (;) character and -- everything until the end of line included. comment :: (MonadParsec e s m, Token s ~ Char) => m () comment = char ';' >> takeWhileP Nothing (\c -> c /= '\n' && c /= '\r') >> ((eol >> return ()) <|> eof) atmosphere :: (MonadParsec e s m, Token s ~ Char) => m () atmosphere = whitespace <|> comment -- | The 'interTokenSpace' parser parses zero or more whitespace or comment. interTokenSpace :: (MonadParsec e s m, Token s ~ Char) => m () interTokenSpace = many atmosphere >> return () -- | The 'interTokenSpace1' parser parses one or more whitespace or comment. interTokenSpace1 :: (MonadParsec e s m, Token s ~ Char) => m () interTokenSpace1 = some atmosphere >> return () ------------------------- Identifier ------------------------- -- | The 'identifier' parser parses a Scheme R5RS identifier. identifier :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m T.Text identifier = standardIdentifier <|> peculiarIdentifier where standardIdentifier = do i <- oneOf initialList is <- takeWhileP Nothing (\c -> c `elem` subsequentList) return $ T.pack $ (i : chunkToTokens (Proxy :: Proxy s) is) initialList :: String initialList = ['a'..'z'] ++ ['A'..'Z'] ++ "!$%&*/:<=>?^_~" subsequentList :: String subsequentList = initialList ++ ['0'..'9'] ++ "+-.@" peculiarIdentifier :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m T.Text peculiarIdentifier = (single '+' >> return "+") <|> (single '-' >> return "-") <|> (chunk (tokensToChunk (Proxy :: Proxy s) "...") >> return "...") ------------------------- Booleans ------------------------- -- | The 'boolean' parser parses a Scheme R5RS boolean (\#t or \#f). boolean :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m Bool boolean = (chunk (tokensToChunk (Proxy :: Proxy s) "#t") >> return True) <|> (chunk (tokensToChunk (Proxy :: Proxy s) "#f") >> return False) ------------------------- Character ------------------------- -- | The 'character' parser parses a Scheme R5RS character. character :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m Char character = do _ <- chunk (tokensToChunk (Proxy :: Proxy s) "#\\") (chunk (tokensToChunk (Proxy :: Proxy s) "newline") >> return '\n') <|> (chunk (tokensToChunk (Proxy :: Proxy s) "space") >> return ' ') <|> anySingle ------------------------- String ------------------------- -- | The 'stringParser' parser parses a Scheme R5RS character. stringParser :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m T.Text stringParser = do _ <- char '"' xs <- consume return $ L.toStrict $ B.toLazyText xs where consume :: (MonadParsec e s m, Token s ~ Char) => m B.Builder consume = do x <- takeWhileP Nothing (\c -> c /= '\\' && c /= '"') c <- char '\\' <|> char '"' let xB = B.fromString $ chunkToTokens (Proxy :: Proxy s) x case c of '"' -> return $ xB _ -> do c1 <- char '\\' <|> char '"' x2 <- consume return $ xB <> B.fromString [c1] <> x2 ------------------------- Numbers ------------------------- data Radix = R2 | R8 | R10 | R16 deriving (Eq, Show) -- | A Scheme R5RS number is either exact or inexact. The paragraph -- 6.4.2 from the R5RS report should clarify the meaning of exact and -- inexact : -- -- \"\"\"A numerical constant may be specified to be either -- exact or inexact by a prefix. The prefixes are \#e for exact, and \#i -- for inexact. An exactness prefix may appear before or after any -- radix prefix that is used. If the written representation of a -- number has no exactness prefix, the constant may be either inexact -- or exact. It is inexact if it contains a decimal point, an -- exponent, or a \“#\” character in the place of a digit, otherwise it -- is exact.\"\"\" data Exactness = Exact | Inexact deriving (Eq, Show) -- | The 'Sign' datatype indicates if a number is positive ('Plus') or negative ('Minus') data Sign = Plus | Minus deriving (Eq, Show) -- | A Scheme R5RS number can have many # signs at the end. This type alias -- indicates the number of # signs parsed. type Pounds = Integer -- | A Scheme R5RS unsigned integer can be written in three ways. -- -- * With digits only -- * With digits and # signs -- * With only # signs in some special context. data UInteger = -- | Integer made only of digits UInteger Integer -- | Integer made of digits and #. The first argument is the number -- that was parsed and the second the number of # signs. For -- example, 123## is represented as @UIntPounds 123 2@. Do not take -- the first argument as a good approximation of the number. It -- needs to be shifted by the number of pounds. | UIntPounds Integer Pounds -- | Integer made only of #. It can only appear as the third argument in numbers of the form @'SDecimal' _ _ _ _@. | UPounds Pounds deriving (Eq, Show) hasPounds :: UInteger -> Bool hasPounds (UInteger _) = False hasPounds _ = True isInexactI :: UInteger -> Bool isInexactI = hasPounds -- | Scheme R5RS defines 5 types of machine precision for a decimal -- number. The machine precision is specified in the suffix (see -- 'Suffix'). data Precision = -- | Suffix starting with e. PDefault | -- | Suffix starting with s. PShort | -- | Suffix starting with f. PSingle | -- | Suffix starting with d. PDouble | -- | Suffix starting with l. PLong deriving (Eq, Show) -- | The 'Suffix' data type represents the suffix for a Scheme R5RS -- decimal number. It is a based 10 exponent. data Suffix = Suffix Precision Sign Integer deriving (Eq, Show) -- | The 'SReal' data type represents a Scheme R5RS real number. data SReal = -- | A signed integer. SInteger Sign UInteger -- | A signed rational. The first number is the numerator and the -- second one the denominator. | SRational Sign UInteger UInteger -- | A signed decimal number. The first number appears before the -- dot, the second one after the dot. | SDecimal Sign UInteger UInteger (Maybe Suffix) deriving (Eq, Show) isInexactR :: SReal -> Bool isInexactR (SInteger _ i) = isInexactI i isInexactR (SRational _ i1 i2) = isInexactI i1 || isInexactI i2 isInexactR (SDecimal _ _ _ _) = True -- | The 'Complex' data type represents a Scheme R5RS complex number. data Complex = -- | A real number. CReal SReal -- | A complex number in angular notation. | CAngle SReal SReal -- | A complex number in absolute notation. | CAbsolute SReal SReal deriving (Eq, Show) isInexact :: Complex -> Bool isInexact (CReal s) = isInexactR s isInexact (CAngle s1 s2) = isInexactR s1 || isInexactR s2 isInexact (CAbsolute s1 s2) = isInexactR s1 || isInexactR s2 -- | A Scheme R5RS number is an exact or inexact complex number. data SchemeNumber = SchemeNumber Exactness Complex deriving (Eq, Show) -- | The 'number' parser parses a Scheme R5RS number. number :: (MonadParsec e s m, Token s ~ Char) => m SchemeNumber number = do (r, e) <- prefix c <- complex (fromMaybe R10 r) let e' = fromMaybe (if isInexact c then Inexact else Exact) e return $ SchemeNumber e' c complex :: forall e s m . (MonadParsec e s m, Token s ~ Char) => Radix -> m Complex complex r = do ms <- optional sign case ms of Nothing -> complex' Plus Just s -> i s <|> complex' s where -- Parser for +i and -i i s = char 'i' >> (return $ CAbsolute (SInteger Plus (UInteger 0)) (SInteger s (UInteger 1))) -- Parser for complex except +i and -i complex' sr = do -- First parse a number n1 <- ureal r sr -- Check if the number is followed by any of these characters c <- optional (char '@' <|> char '+' <|> char '-' <|> char 'i') case c of -- Plain real number Nothing -> return $ CReal n1 -- Complex angular number Just '@' -> do n2 <- real r return $ CAngle n1 n2 -- Pure imaginary number Just 'i' -> return $ CAbsolute (SInteger Plus (UInteger 0)) n1 -- Real +/- Imaginary number Just '+' -> imaginaryPart n1 Plus Just _ -> imaginaryPart n1 Minus imaginaryPart realN si = do u <- optional (ureal r si) _ <- char 'i' case u of Nothing -> return $ CAbsolute realN (SInteger si (UInteger 1)) Just n2 -> return $ CAbsolute realN n2 real :: (MonadParsec e s m, Token s ~ Char) => Radix -> m SReal real r = do s <- option Plus sign ureal r s ureal :: forall e s m . (MonadParsec e s m, Token s ~ Char) => Radix -> Sign -> m SReal ureal r s = dotN <|> ureal' where dotN = do _ <- char '.' if r /= R10 then label "Numbers containing decimal point must be in decimal radix" mzero else do n <- uinteger R10 sf <- optional suffix return $ SDecimal s (UInteger 0) n sf ureal' = do -- First parse an integer u1 <- uinteger r -- Check if the integer is followed by these characters mc <- optional (char '/' <|> char '.') case mc of -- Integer with or without suffix Nothing -> plainInteger u1 -- Rational Just '/' -> rational u1 -- Decimal Just _ -> decimal u1 plainInteger u1 = do sf <- optional suffix case sf of Just _ -> return $ SDecimal s u1 (UInteger 0) sf Nothing -> return $ SInteger s u1 rational u1 = do u2 <- uinteger r return $ SRational s u1 u2 decimal u1 = do if r /= R10 then label "Numbers containing decimal point must be in decimal radix" mzero else do -- If u1 has # character, only other # are -- allowed. Otherwise a number may be present n <- if hasPounds u1 then return Nothing else optional (udigit R10) :: m (Maybe Integer) pounds <- takeWhileP Nothing (== '#') sf <- optional suffix let nbPounds = toInteger $ chunkLength (Proxy :: Proxy s) pounds let u2 = case (hasPounds u1, nbPounds, n) of (True, p, _) -> UPounds p (False, 0, Nothing) -> UInteger 0 (False, 0, (Just x)) -> UInteger x (False, p, Nothing) -> UPounds p (False, p, (Just x)) -> UIntPounds x p return $ SDecimal s u1 u2 sf uinteger :: forall e s m . (MonadParsec e s m, Token s ~ Char) => Radix -> m UInteger uinteger r = do n <- udigit r pounds <- takeWhileP Nothing (== '#') let nbPounds = toInteger $ chunkLength (Proxy :: Proxy s) pounds if nbPounds <= 0 then return $ UInteger n else return $ UIntPounds n nbPounds prefix :: (MonadParsec e s m, Token s ~ Char) => m (Maybe Radix, Maybe Exactness) prefix = do x <- optional $ char '#' case x of Nothing -> return (Nothing, Nothing) _ -> do c <- char 'i' <|> char 'e' <|> char 'b' <|> char 'o' <|> char 'd' <|> char 'x' case c of 'i' -> optional radix >>= \r -> return (r, Just Inexact) 'e' -> optional radix >>= \r -> return (r, Just Exact) 'b' -> optional exactness >>= \e -> return (Just R2, e) 'o' -> optional exactness >>= \e -> return (Just R8, e) 'd' -> optional exactness >>= \e -> return (Just R10, e) _ -> optional exactness >>= \e -> return (Just R16, e) exactness :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m Exactness exactness = (chunk (tokensToChunk (Proxy :: Proxy s) "#e") >> return Exact) <|> (chunk (tokensToChunk (Proxy :: Proxy s) "#i") >> return Inexact) radix :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m Radix radix = (chunk (tokensToChunk (Proxy :: Proxy s) "#b") >> return R2) <|> (chunk (tokensToChunk (Proxy :: Proxy s) "#o") >> return R8) <|> (chunk (tokensToChunk (Proxy :: Proxy s) "#d") >> return R10) <|> (chunk (tokensToChunk (Proxy :: Proxy s) "#x") >> return R16) udigit :: forall e s m a . (MonadParsec e s m, Token s ~ Char, Integral a) => Radix -> m a udigit r = do case r of R2 -> ML.binary R8 -> ML.octal R10 -> ML.decimal R16 -> hexadecimal -- ML.hexadecimal also parses uppercase "ABCDEF" where hexadecimal = mkNum <$> takeWhile1P Nothing (\c -> c `elem` ("0123456789abcdef" :: String)) "hexadecimal integer" mkNum = foldl' step 0 . chunkToTokens (Proxy :: Proxy s) step a c = a * 16 + fromIntegral (C.digitToInt c) sign :: (MonadParsec e s m, Token s ~ Char) => m Sign sign = (char '-' >> return Minus) <|> (char '+' >> return Plus) suffix :: (MonadParsec e s m, Token s ~ Char) => m Suffix suffix = do p <- (char 'e' >> return PDefault) <|> (char 's' >> return PShort) <|> (char 'f' >> return PSingle) <|> (char 'd' >> return PDouble) <|> (char 'l' >> return PLong) s <- option Plus sign n <- udigit R10 return $ Suffix p s n ------------------------- Other tokens ------------------------- -- | The 'quote' parser parses a quote character ('). quote :: (MonadParsec e s m, Token s ~ Char) => m Char quote = char '\'' -- | The 'quasiquote' parser parses a quasiquote character (`). quasiquote :: (MonadParsec e s m, Token s ~ Char) => m Char quasiquote = char '`' -- | The 'comma' parser parses a comma (,). comma :: (MonadParsec e s m, Token s ~ Char) => m Char comma = char ',' -- | The 'commaAt' parser parses a comma followed by \@ (,\@). commaAt :: forall e s m . (MonadParsec e s m, Token s ~ Char) => m T.Text commaAt = chunk (tokensToChunk (Proxy :: Proxy s) ",@") >> return ",@" -- | The 'dot' parser parses a single dot character (.). dot :: (MonadParsec e s m, Token s ~ Char) => m Char dot = char '.'