module GLL.Combinators.Lexer (
default_lexer, lexer, LexerSettings(..), emptyLanguage,
) where
import GLL.Types.Grammar (Token(..), SubsumesToken(..))
import Data.List (isPrefixOf)
import Data.Char (isSpace, isDigit, isAlpha, isUpper, isLower)
import Text.Regex.Applicative
data LexerSettings = LexerSettings {
keychars :: [Char]
, keywords :: [String]
, whitespace :: Char -> Bool
, lineComment :: String
, blockCommentOpen :: String
, blockCommentClose :: String
, identifiers :: RE Char String
, altIdentifiers :: RE Char String
, tokens :: [(String, RE Char String)]
}
emptyLanguage :: LexerSettings
emptyLanguage = LexerSettings [] [] isSpace "//" "{-" "-}"
((:) <$> psym isLower <*> lowercase_id)
((:) <$> psym isUpper <*> lowercase_id)
[]
where lowercase_id = many (psym (\c -> isAlpha c || c == '_' || isDigit c))
default_lexer :: SubsumesToken t => String -> [t]
default_lexer = lexer emptyLanguage
lexer :: SubsumesToken t => LexerSettings -> String -> [t]
lexer _ [] = []
lexer lexsets s
| start /= "" && end /= "" && start `isPrefixOf` s = blockState 1 (drop lS s)
| lComm /= "" && lComm `isPrefixOf` s = case dropWhile ((/=) '\n') s of
[] -> []
(c:cs) -> lexer lexsets cs
| isWS (head s) = lexer lexsets (dropWhile isWS s)
| otherwise = case findLongestPrefix (lTokens lexsets) s of
Just (tok, rest) -> tok : lexer lexsets rest
Nothing -> error ("lexical error at: " ++ show (take 10 s))
where start = blockCommentOpen lexsets
end = blockCommentClose lexsets
isWS = whitespace lexsets
lComm = lineComment lexsets
lS = length start
lE = length end
blockState :: SubsumesToken t => Int -> String -> [t]
blockState n [] = []
blockState 0 rest = lexer lexsets rest
blockState n cs | start `isPrefixOf` cs = blockState (n+1) (drop lS cs)
| end `isPrefixOf` cs = blockState (n1) (drop lE cs)
| otherwise = blockState n (tail cs)
lTokens :: SubsumesToken t => LexerSettings -> RE Char t
lTokens lexsets =
lCharacters
<|> lKeywords
<|> charsToInt <$> optional (sym '-') <*> some (psym isDigit)
<|> upcast . IDLit . Just <$> identifiers lexsets
<|> upcast . AltIDLit . Just <$> altIdentifiers lexsets
<|> upcast . CharLit . Just <$> lCharLit
<|> upcast . StringLit . Just <$> lStringLit
<|> lMore
where
charsToInt Nothing n = upcast (IntLit (Just (read n)))
charsToInt (Just _) n = upcast (IntLit (Just ((read n))))
lChar c = upcast (Char c) <$ sym c
lCharacters = foldr ((<|>) . lChar) empty (keychars lexsets)
lKeyword k = upcast (Keyword k) <$ string k
lKeywords = foldr ((<|>) . lKeyword) empty (keywords lexsets)
lMore = foldr ((<|>) . uncurry lToken) empty (tokens lexsets)
lToken t re = upcast . Token t . Just <$> re
lStringLit = toString <$ sym '\"' <*> many strChar <* sym '\"'
where strChar = sym '\\' *> sym '\"'
<|> psym ((/=) '\"')
toString inner = read ("\"" ++ inner ++ "\"")
lCharLit = id <$ sym '\'' <*> charChar <* sym '\''
where charChar = sym '\\' *> sym '\''
<|> psym ((/=) '\'')