module GLL.Combinators.Lexer ( default_lexer, lexer, lexerEither, LexerSettings(..), emptyLanguage, oneOf, manyOf, someOf, baseToDec, ) where import GLL.Types.Grammar (Token(..), SubsumesToken(..)) import Data.List (isPrefixOf) import Data.Char (isSpace, isDigit, isAlpha, isUpper, isLower) import Text.Regex.Applicative import Text.Regex.Applicative.Common (signed) -- | Settings for changing the behaviour of the builtin lexer 'lexer'. -- Lexers are built using "Text.Regex.Applicative". data LexerSettings = LexerSettings { -- | Which keychars to recognise? Default: none. keychars :: [Char] -- | Which keywords to recognise? Default: none. , keywords :: [String] -- | What is considered a whitespace character? Default: 'Data.Char.isSpace'. , whitespace :: Char -> Bool -- | How does a line comment start? Default: '"'//'"'. , lineComment :: String -- | How does a block comment open? Default: '"'{-'"'. , blockCommentOpen :: String -- | How does a block comment close? Default: '"'-}'"'. , blockCommentClose :: String -- | How to recognise identifiers? Default alphanumerical with lowercase alpha start. , identifiers :: RE Char String -- | How to recognise alternative identifiers? Default alphanumerical with uppercase alpha start. , altIdentifiers :: RE Char String -- | Arbitrary tokens /(a,b)/. /a/ is the token name, /b/ is a regular expression. , tokens :: [(String, RE Char String)] -- | Whether integer literals may be signed positive or negative. Default: 'False' , signed_int_lits :: Bool } -- | The default 'LexerSettings'. emptyLanguage :: LexerSettings emptyLanguage = LexerSettings [] [] isSpace "//" "{-" "-}" ((:) <$> psym isLower <*> lowercase_id) ((:) <$> psym isUpper <*> lowercase_id) [] False where lowercase_id = many (psym (\c -> isAlpha c || c == '_' || isDigit c)) -- | A lexer using the default 'LexerSettings'. default_lexer :: SubsumesToken t => String -> [t] default_lexer = lexer emptyLanguage -- | Variant of 'lexerEither' that throws an error or returns the result otherwise lexer :: SubsumesToken t => LexerSettings -> String -> [t] lexer set inp = case lexerEither set inp of Left err -> error err Right ts -> ts -- | A lexer parameterised by 'LexerSettings'. lexerEither :: SubsumesToken t => LexerSettings -> String -> Either String [t] lexerEither _ [] = Right [] lexerEither lexsets s | start /= "" && end /= "" && start `isPrefixOf` s = blockState 1 (drop lS s) | lComm /= "" && lComm `isPrefixOf` s = case dropWhile ((/=) '\n') s of [] -> Right [] (c:cs) -> lexerEither lexsets cs | isWS (head s) = lexerEither lexsets (dropWhile isWS s) | otherwise = case findLongestPrefix (lTokens lexsets) s of Just (tok, rest) -> fmap (tok :) $ lexerEither lexsets rest Nothing -> Left ("lexical error at: " ++ show (take 10 s)) where start = blockCommentOpen lexsets end = blockCommentClose lexsets isWS = whitespace lexsets lComm = lineComment lexsets lS = length start lE = length end blockState :: SubsumesToken t => Int -> String -> Either String [t] blockState n [] = Right [] blockState 0 rest = lexerEither lexsets rest blockState n cs | start `isPrefixOf` cs = blockState (n+1) (drop lS cs) | end `isPrefixOf` cs = blockState (n-1) (drop lE cs) | otherwise = blockState n (tail cs) lTokens :: SubsumesToken t => LexerSettings -> RE Char t lTokens lexsets = lCharacters <|> lKeywords <|> upcast . IntLit . Just <$> lIntegers (signed_int_lits lexsets) <|> upcast . FloatLit . Just <$> lFloats <|> upcast . IDLit . Just <$> identifiers lexsets <|> upcast . AltIDLit . Just <$> altIdentifiers lexsets <|> upcast . CharLit . Just <$> lCharLit <|> upcast . StringLit . Just <$> lStringLit <|> lMore where lMore = foldr ((<|>) . uncurry lToken) empty (tokens lexsets) lChar c = upcast (Char c) <$ sym c lCharacters = foldr ((<|>) . lChar) empty (keychars lexsets) lKeyword k = upcast (Keyword k) <$ string k lKeywords = foldr ((<|>) . lKeyword) empty (keywords lexsets) lToken t re = upcast . Token t . Just <$> re lStringLit = toString <$ sym '\"' <*> many strChar <* sym '\"' where strChar = sym '\\' *> sym '\"' <|> psym ((/=) '\"') toString inner = read ("\"" ++ inner ++ "\"") lCharLit = id <$ sym '\'' <*> charChar <* sym '\'' where charChar = sym '\\' *> sym '\'' <|> psym ((/=) '\'') lFloats :: RE Char Double lFloats = signed ( read <$> ( mkDP <$> decimal <*> sym '.' <*> decimal <*> optional exponent <|> mkEP <$> decimal <*> exponent )) where mkDP pre _ post mexp = pre ++ "." ++ post ++ maybe "" id mexp mkEP pre exp = pre ++ exp exponent = mk <$> (sym 'e' <|> sym 'E') <*> optional (sym '+' <|> sym '-') <*> decimal where mk pre sign dec = pre : maybe "" (:[]) sign ++ dec lIntegers :: Bool -> RE Char Int lIntegers True = signed lNaturals lIntegers False = lNaturals lNaturals :: RE Char Int lNaturals = (read <$> decimal) <|> (baseToDec 16 <$ hexPrefix <*> someOf (['0'..'9']++['A'..'F']++['a'..'f'])) <|> (baseToDec 8 <$ octPrefix <*> someOf ['0'..'7']) <|> (baseToDec 2 <$ binPrefix <*> someOf ['0','1']) where hexPrefix = string "0x" <|> string "0X" octPrefix = string "0o" <|> string "0O" binPrefix = string "0b" <|> string "0B" decimal :: RE Char String decimal = someOf ['0'..'9'] -- | Convert numerical representation in a given base -- (max base = 16, written as string) -- into decimal representation (returned as Int) baseToDec :: Int -> String -> Int baseToDec base = baseToDec' 0 base . map toInt where baseToDec' acc base [] = acc baseToDec' acc base (d:ds) = baseToDec' (acc * base + d) base ds toInt c | c == 'A' || c == 'a' = 10 | c == 'B' || c == 'b' = 11 | c == 'C' || c == 'c' = 12 | c == 'D' || c == 'd' = 13 | c == 'E' || c == 'e' = 14 | c == 'F' || c == 'f' = 15 | otherwise = read [c] oneOf :: Eq t => [t] -> RE t t oneOf ts = psym (\t -> t `elem` ts) manyOf :: Eq t => [t] -> RE t [t] manyOf ts = many (oneOf ts) someOf :: Eq t => [t] -> RE t [t] someOf ts = some (oneOf ts) {- manyOf :: Eq t => [t] -> RE t [t] manyOf ts = empty <|> someOf ts someOf :: Eq t => [t] -> RE t [t] someOf ts = (:) <$> oneOf ts <*> manyOf ts -}