{ {-# LANGUAGE BangPatterns #-} {-# LANGUAGE NamedFieldPuns #-} {-# OPTIONS_GHC -fno-warn-missing-signatures #-} {-# OPTIONS_GHC -fno-warn-name-shadowing #-} {-# OPTIONS_GHC -fno-warn-tabs #-} {-# OPTIONS_GHC -fno-warn-unused-binds #-} {-# OPTIONS_GHC -fno-warn-unused-imports #-} {-# OPTIONS_GHC -fno-warn-unused-matches #-} module Language.Sexp.Lexer ( lexSexp ) where import qualified Data.Text as T import Data.Text.Read import qualified Data.Text.Lazy as TL import Data.Text.Lazy.Encoding (decodeUtf8) import qualified Data.ByteString.Lazy.Char8 as B8 import Language.Sexp.LexerInterface import Language.Sexp.Token import Language.Sexp.Types (Position (..)) } $whitechar = [\ \t\n\r\f\v] $unispace = \x01 $whitespace = [$whitechar $unispace] $uninonspace = \x02 $uniany = [$unispace $uninonspace] @any = (. | $uniany) $digit = 0-9 $hex = [0-9 A-F a-f] $alpha = [a-z A-Z] $graphic = [$alpha $digit \!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~ \(\)\,\;\[\]\`\{\} \:\"\'\_ $uninonspace] @intnum = [\-\+]? $digit+ @scinum = [\-\+]? $digit+ ([\.]$digit+)? ([eE] [\-\+]? $digit+)? $charesc = [abfnrtv\\\"] @escape = \\ ($charesc | $digit+ | x $hex+) @string = $graphic # [\"\\] | " " | @escape $idinitial = [$alpha \!\$\%\&\*\/\<\=\>\?\~\_\^\.\+\- $uninonspace] $idsubseq = [$idinitial $digit \: $uninonspace] @identifier = $idinitial $idsubseq* @keyword = ":" $idsubseq+ :- $whitespace+ ; ";" @any* ; "(" { just TokLParen } ")" { just TokRParen } "[" { just TokLBracket } "]" { just TokRBracket } "'" / $graphic { just TokQuote } "#t" { just (TokBool True) } "#f" { just (TokBool False) } "#" / $graphic { just TokHash } @intnum { TokInt `via` readInteger } @scinum { TokReal `via` (read . T.unpack) } @identifier { TokSymbol `via` id } @keyword { TokKeyword `via` id } \" @string* \" { TokStr `via` readString } . { TokUnknown `via` T.head } { type AlexAction = LineCol -> TL.Text -> LocatedBy LineCol Token readInteger :: T.Text -> Integer readInteger str = case signed decimal str of Left err -> error $ "Lexer is broken: " ++ err Right (a, rest) | T.null (T.strip rest) -> a | otherwise -> error $ "Lexer is broken, leftover: " ++ show rest readString :: T.Text -> T.Text readString = T.pack . read . T.unpack just :: Token -> AlexAction just tok pos _ = L pos tok via :: (a -> Token) -> (T.Text -> a) -> AlexAction via ftok f pos str = L pos . ftok . f . TL.toStrict $str alexScanTokens :: AlexInput -> [LocatedBy LineCol Token] alexScanTokens input = case alexScan input defaultCode of AlexEOF -> [] AlexError (AlexInput {aiInput, aiLineCol = LineCol line col}) -> error $ "Lexical error at line " ++ show line ++ " column " ++ show col ++ ". Remaining input: " ++ TL.unpack (TL.take 1000 aiInput) AlexSkip input _ -> alexScanTokens input AlexToken input' tokLen action -> action (aiLineCol input) inputText : alexScanTokens input' where -- It is safe to take token length from input because every byte Alex -- sees corresponds to exactly one character, even if character is a -- Unicode one that occupies several bytes. We do character translation -- in LexerInterface.alexGetByte function so that all unicode characters -- occupy single byte. -- -- On the other hand, taking N characters from Text will take N valid -- characters, not N bytes. -- -- Thus, we're good. inputText = TL.take (fromIntegral tokLen) $ aiInput input where defaultCode :: Int defaultCode = 0 lexSexp :: Position -> TL.Text -> [LocatedBy Position Token] lexSexp (Position fn line1 col1) = map (mapPosition fixPos) . alexScanTokens . mkAlexInput where fixPos (LineCol l c) | l == 1 = Position fn line1 (col1 + c) | otherwise = Position fn (pred l + line1) c }