src/GLL/Combinators/Lexer.hs


module GLL.Combinators.Lexer (
    default_lexer, lexer, LexerSettings(..), emptyLanguage,
    ) where

import GLL.Types.Grammar (Token(..), SubsumesToken(..))
import Data.List (isPrefixOf)
import Data.Char (isSpace, isDigit, isAlpha, isUpper, isLower)
import Text.Regex.Applicative

-- | Settings for changing the behaviour of the builtin lexer 'lexer'.
-- Lexers are built using "Text.Regex.Applicative".
data LexerSettings = LexerSettings {
        -- | Which keychars to recognise? Default: none.
        keychars        :: [Char]
        -- | Which keywords to recognise? Default: none.
    ,   keywords        :: [String]
        -- | What is considered a whitespace character? Default: 'Data.Char.isSpace'.
    ,   whitespace      :: Char -> Bool
        -- | How does a line comment start? Default: '"'//'"'.
    ,   lineComment     :: String
        -- | How does a block comment open? Default: '"'{-'"'. 
    ,   blockCommentOpen :: String
        -- | How does a block comment close? Default: '"'-}'"'.
    ,   blockCommentClose :: String
        -- | How to recognise identifiers? Default alphanumerical with lowercase alpha start.
    ,   identifiers     :: RE Char String
        -- | How to recognise alternative identifiers? Default alphanumerical with uppercase alpha start.
    ,   altIdentifiers  :: RE Char String
        -- | Arbitrary tokens /(a,b)/. /a/ is the token name, /b/ is a regular expression.
    ,   tokens          :: [(String, RE Char String)]
    }

-- | The default 'LexerSettings'.
emptyLanguage :: LexerSettings
emptyLanguage = LexerSettings [] [] isSpace "//" "{-" "-}"
    ((:) <$> psym isLower <*> lowercase_id)
    ((:) <$> psym isUpper <*> lowercase_id)
    []
 where lowercase_id = many (psym (\c -> isAlpha c || c == '_' || isDigit c))

-- | A lexer using the default 'LexerSettings'.
default_lexer :: SubsumesToken t => String -> [t]
default_lexer = lexer emptyLanguage 

-- | A lexer parameterised by 'LexerSettings'.
lexer :: SubsumesToken t => LexerSettings -> String -> [t]
lexer _ [] = []
lexer lexsets s
  | start /= "" && end /= "" && start `isPrefixOf` s = blockState 1 (drop lS s)
  | lComm /= "" && lComm `isPrefixOf` s = case dropWhile ((/=) '\n') s of
      []      -> []
      (c:cs)  -> lexer lexsets cs
  | isWS (head s) = lexer lexsets (dropWhile isWS s)
  | otherwise = case findLongestPrefix (lTokens lexsets) s of
        Just (tok, rest)   -> tok : lexer lexsets rest
        Nothing            -> error ("lexical error at: " ++ show (take 10 s))
  where start = blockCommentOpen lexsets
        end   = blockCommentClose lexsets
        isWS  = whitespace lexsets
        lComm = lineComment lexsets
        lS    = length start
        lE    = length end

        blockState :: SubsumesToken t => Int -> String -> [t] 
        blockState n [] = [] 
        blockState 0 rest = lexer lexsets rest
        blockState n cs | start `isPrefixOf` cs = blockState (n+1) (drop lS cs)
                    | end `isPrefixOf` cs   = blockState (n-1) (drop lE cs) 
                    | otherwise             = blockState n (tail cs)

lTokens :: SubsumesToken t => LexerSettings -> RE Char t 
lTokens lexsets =
        lCharacters
    <|> lKeywords
    <|> charsToInt  <$> optional (sym '-') <*> some (psym isDigit)
    <|> upcast . IDLit . Just <$> identifiers lexsets 
    <|> upcast . AltIDLit . Just <$> altIdentifiers lexsets
    <|> upcast . CharLit . Just <$> lCharLit
    <|> upcast . StringLit . Just <$> lStringLit
    <|> lMore
    where   
            charsToInt Nothing n = upcast (IntLit (Just (read n)))
            charsToInt (Just _) n = upcast (IntLit (Just (-(read n))))

            lChar c = upcast (Char c) <$ sym c
            lCharacters = foldr ((<|>) . lChar) empty (keychars lexsets) 

            lKeyword k  = upcast (Keyword k) <$ string k
            lKeywords = foldr ((<|>) . lKeyword) empty (keywords lexsets)

            lMore = foldr ((<|>) . uncurry lToken) empty (tokens lexsets)
            lToken t re = upcast . Token t . Just <$> re

            lStringLit = toString <$ sym '\"' <*> many strChar <* sym '\"'
             where strChar =  sym '\\' *> sym '\"'
                              <|> psym ((/=) '\"')
                   toString inner = read ("\"" ++ inner ++ "\"")

            lCharLit = id <$ sym '\'' <*> charChar <* sym '\''
              where charChar = sym '\\' *> sym '\''
                                <|> psym ((/=) '\'')