module Language.Haskell.Exts.Lexer (Token(..), lexer) where
import Language.Haskell.Exts.ParseMonad
import Data.Char
import Data.Ratio
data Token
= VarId String
| QVarId (String,String)
| IDupVarId (String)
| ILinVarId (String)
| ConId String
| QConId (String,String)
| DVarId [String]
| VarSym String
| ConSym String
| QVarSym (String,String)
| QConSym (String,String)
| IntTok Integer
| FloatTok Rational
| Character Char
| StringTok String
| LeftParen
| RightParen
| LeftHashParen
| RightHashParen
| SemiColon
| LeftCurly
| RightCurly
| VRightCurly
| LeftSquare
| RightSquare
| Comma
| Underscore
| BackQuote
| Dot
| DotDot
| Colon
| DoubleColon
| Equals
| Backslash
| Bar
| LeftArrow
| RightArrow
| At
| Tilde
| DoubleArrow
| Minus
| Exclamation
| Star
| THExpQuote
| THPatQuote
| THDecQuote
| THTypQuote
| THCloseQuote
| THIdEscape (String)
| THParenEscape
| THVarQuote
| THTyQuote
| RPGuardOpen
| RPGuardClose
| RPCAt
| XCodeTagOpen
| XCodeTagClose
| XStdTagOpen
| XStdTagClose
| XCloseTagOpen
| XEmptyTagClose
| XPCDATA String
| XRPatOpen
| XRPatClose
| PragmaEnd
| PragmaUnknown (String,String)
| RULES
| INLINE Bool
| SPECIALISE
| SPECIALISE_INLINE Bool
| SOURCE
| DEPRECATED
| WARNING
| SCC
| GENERATED
| CORE
| UNPACK
| OPTIONS (Maybe String,String)
| CFILES String
| LANGUAGE
| INCLUDE String
| KW_As
| KW_Case
| KW_Class
| KW_Data
| KW_Default
| KW_Deriving
| KW_Do
| KW_MDo
| KW_Else
| KW_Family
| KW_Forall
| KW_Hiding
| KW_If
| KW_Import
| KW_In
| KW_Infix
| KW_InfixL
| KW_InfixR
| KW_Instance
| KW_Let
| KW_Module
| KW_NewType
| KW_Of
| KW_Then
| KW_Type
| KW_Where
| KW_Qualified
| KW_Foreign
| KW_Export
| KW_Safe
| KW_Unsafe
| KW_Threadsafe
| KW_StdCall
| KW_CCall
| EOF
deriving (Eq,Show)
reserved_ops :: [(String,Token)]
reserved_ops = [
( ".", Dot ),
( "..", DotDot ),
( ":", Colon ),
( "::", DoubleColon ),
( "=", Equals ),
( "\\", Backslash ),
( "|", Bar ),
( "<-", LeftArrow ),
( "->", RightArrow ),
( "@", At ),
( "~", Tilde ),
( "=>", DoubleArrow ),
( "*", Star )
]
special_varops :: [(String,Token)]
special_varops = [
( "-", Minus ),
( "!", Exclamation ) --ditto
]
reserved_ids :: [(String,Token)]
reserved_ids = [
( "_", Underscore ),
( "case", KW_Case ),
( "class", KW_Class ),
( "data", KW_Data ),
( "default", KW_Default ),
( "deriving", KW_Deriving ),
( "do", KW_Do ),
( "else", KW_Else ),
( "family", KW_Family ),
( "forall", KW_Forall ),
( "if", KW_If ),
( "import", KW_Import ),
( "in", KW_In ),
( "infix", KW_Infix ),
( "infixl", KW_InfixL ),
( "infixr", KW_InfixR ),
( "instance", KW_Instance ),
( "let", KW_Let ),
( "mdo", KW_MDo ),
( "module", KW_Module ),
( "newtype", KW_NewType ),
( "of", KW_Of ),
( "then", KW_Then ),
( "type", KW_Type ),
( "where", KW_Where ),
( "foreign", KW_Foreign )
]
special_varids :: [(String,Token)]
special_varids = [
( "as", KW_As ),
( "qualified", KW_Qualified ),
( "hiding", KW_Hiding ),
( "export", KW_Export),
( "safe", KW_Safe),
( "unsafe", KW_Unsafe),
( "threadsafe", KW_Threadsafe),
( "stdcall", KW_StdCall),
( "ccall", KW_CCall)
]
pragmas :: [(String,Token)]
pragmas = [
( "rules", RULES ),
( "inline", INLINE True ),
( "noinline", INLINE False ),
( "notinline", INLINE False ),
( "specialise", SPECIALISE ),
( "specialize", SPECIALISE ),
( "source", SOURCE ),
( "deprecated", DEPRECATED ),
( "warning", WARNING ),
( "scc", SCC ),
( "generated", GENERATED ),
( "core", CORE ),
( "unpack", UNPACK ),
( "language", LANGUAGE ),
( "options", OPTIONS undefined ),
( "cfiles", CFILES undefined ),
( "include", INCLUDE undefined )
]
isIdent, isHSymbol :: Char -> Bool
isIdent c = isAlpha c || isDigit c || c == '\'' || c == '_'
isHSymbol c = c `elem` ":!#%&*./?@\\-" || ((isSymbol c || isPunctuation c) && not (c `elem` "(),;[]`{}_\"'"))
matchChar :: Char -> String -> Lex a ()
matchChar c msg = do
s <- getInput
if null s || head s /= c then fail msg else discard 1
lexer :: (Token -> P a) -> P a
lexer = runL $ do
bol <- checkBOL
(bol, ws) <- lexWhiteSpace bol
ec <- getExtContext
case ec of
Just ChildCtxt | not bol && ws -> return $ XPCDATA " "
_ -> do startToken
if bol then lexBOL else lexToken
lexWhiteSpace :: Bool -> Lex a (Bool, Bool)
lexWhiteSpace bol = do
s <- getInput
case s of
'{':'-':'#':_ -> do
return (bol, False)
'{':'-':_ -> do
discard 2
bol <- lexNestedComment bol
(bol, _) <- lexWhiteSpace bol
return (bol, True)
'-':'-':s | all (== '-') (takeWhile isHSymbol s) -> do
lexWhile (== '-')
lexWhile (/= '\n')
s' <- getInput
case s' of
[] -> fail "Unterminated end-of-line comment"
_ -> do
lexNewline
lexWhiteSpace True
return (True, True)
'\n':_ -> do
lexNewline
lexWhiteSpace True
return (True, True)
'\t':_ -> do
lexTab
(bol, _) <- lexWhiteSpace bol
return (bol, True)
c:_ | isSpace c -> do
discard 1
(bol, _) <- lexWhiteSpace bol
return (bol, True)
_ -> return (bol, False)
lexNestedComment :: Bool -> Lex a Bool
lexNestedComment bol = do
s <- getInput
case s of
'-':'}':_ -> discard 2 >> return bol
'{':'-':_ -> do
discard 2
bol <- lexNestedComment bol
lexNestedComment bol
'\t':_ -> lexTab >> lexNestedComment bol
'\n':_ -> lexNewline >> lexNestedComment True
_:_ -> discard 1 >> lexNestedComment bol
[] -> fail "Unterminated nested comment"
lexBOL :: Lex a Token
lexBOL = do
pos <- getOffside
case pos of
LT -> do
setBOL
popContextL "lexBOL"
return VRightCurly
EQ ->
return SemiColon
GT -> lexToken
lexToken :: Lex a Token
lexToken = do
ec <- getExtContext
case ec of
Just HarpCtxt -> lexHarpToken
Just TagCtxt -> lexTagCtxt
Just CloseTagCtxt -> lexCloseTagCtxt
Just ChildCtxt -> lexChildCtxt
Just CodeTagCtxt -> lexCodeTagCtxt
_ -> lexStdToken
lexChildCtxt :: Lex a Token
lexChildCtxt = do
s <- getInput
case s of
'<':'%':_ -> do discard 2
pushExtContextL CodeTagCtxt
return XCodeTagOpen
'<':'/':_ -> do discard 2
popExtContextL "lexChildCtxt"
pushExtContextL CloseTagCtxt
return XCloseTagOpen
'<':'[':_ -> do discard 2
pushExtContextL HarpCtxt
return XRPatOpen
'<':_ -> do discard 1
pushExtContextL TagCtxt
return XStdTagOpen
_ -> lexPCDATA
lexPCDATA :: Lex a Token
lexPCDATA = do
s <- getInput
case s of
[] -> return EOF
_ -> case s of
'\n':_ -> do
x <- lexNewline >> lexPCDATA
case x of
XPCDATA p -> return $ XPCDATA $ '\n':p
EOF -> return EOF
'<':_ -> return $ XPCDATA ""
_ -> do let pcd = takeWhile (\c -> not $ elem c "<\n") s
l = length pcd
discard l
x <- lexPCDATA
case x of
XPCDATA pcd' -> return $ XPCDATA $ pcd ++ pcd'
EOF -> return EOF
lexCodeTagCtxt :: Lex a Token
lexCodeTagCtxt = do
s <- getInput
case s of
'%':'>':_ -> do discard 2
popExtContextL "lexCodeTagContext"
return XCodeTagClose
_ -> lexStdToken
lexCloseTagCtxt :: Lex a Token
lexCloseTagCtxt = do
s <- getInput
case s of
'>':_ -> do discard 1
popExtContextL "lexCloseTagCtxt"
return XStdTagClose
_ -> lexStdToken
lexTagCtxt :: Lex a Token
lexTagCtxt = do
s <- getInput
case s of
'/':'>':_ -> do discard 2
popExtContextL "lexTagCtxt: Empty tag"
return XEmptyTagClose
'>':_ -> do discard 1
popExtContextL "lexTagCtxt: Standard tag"
pushExtContextL ChildCtxt
return XStdTagClose
_ -> lexStdToken
lexHarpToken :: Lex a Token
lexHarpToken = do
s <- getInput
case s of
']':'>':_ -> do discard 2
popExtContextL "lexHarpToken"
return XRPatClose
_ -> lexStdToken
lexStdToken :: Lex a Token
lexStdToken = do
s <- getInput
case s of
[] -> return EOF
'0':c:d:_ | toLower c == 'o' && isOctDigit d -> do
discard 2
n <- lexOctal
return (IntTok n)
| toLower c == 'x' && isHexDigit d -> do
discard 2
n <- lexHexadecimal
return (IntTok n)
'?':c:_ | isLower c -> do
discard 1
id <- lexWhile isIdent
return $ IDupVarId id
'%':c:_ | isLower c -> do
discard 1
id <- lexWhile isIdent
return $ ILinVarId id
'(':'|':c:_ | isHSymbol c -> discard 1 >> return LeftParen
'(':'|':_ -> do discard 2
return RPGuardOpen
'|':')':_ -> do discard 2
return RPGuardClose
'@':':':_ -> do discard 2
return RPCAt
'[':'|':_ -> do
discard 2
return $ THExpQuote
'[':c:'|':_ | c == 'e' -> do
discard 3
return $ THExpQuote
| c == 'p' -> do
discard 3
return THPatQuote
| c == 'd' -> do
discard 3
return THDecQuote
| c == 't' -> do
discard 3
return THTypQuote
'|':']':_ -> do discard 2
return THCloseQuote
'$':c:_ | isLower c -> do
discard 1
id <- lexWhile isIdent
return $ THIdEscape id
| c == '(' -> do
discard 2
return THParenEscape
'<':'%':_ -> do discard 2
pushExtContextL CodeTagCtxt
return XCodeTagOpen
'<':c:_ | isAlpha c -> do discard 1
pushExtContextL TagCtxt
return XStdTagOpen
'(':'#':_ -> do discard 2 >> return LeftHashParen
'#':')':_ -> do discard 2 >> return RightHashParen
'{':'-':'#':_ -> do discard 3 >> lexPragmaStart
'#':'-':'}':_ -> do discard 3 >> return PragmaEnd
c:_ | isDigit c -> lexDecimalOrFloat
| isUpper c -> lexConIdOrQual ""
| isLower c || c == '_' -> do
idents <- lexIdents
case idents of
[ident] -> return $ case lookup ident (reserved_ids ++ special_varids) of
Just keyword -> keyword
Nothing -> VarId ident
_ -> return $ DVarId idents
| isHSymbol c -> do
sym <- lexWhile isHSymbol
return $ case lookup sym (reserved_ops ++ special_varops) of
Just t -> t
Nothing -> case c of
':' -> ConSym sym
_ -> VarSym sym
| otherwise -> do
discard 1
case c of
'(' -> return LeftParen
')' -> return RightParen
',' -> return Comma
';' -> return SemiColon
'[' -> return LeftSquare
']' -> return RightSquare
'`' -> return BackQuote
'{' -> do
pushContextL NoLayout
return LeftCurly
'}' -> do
popContextL "lexStdToken"
return RightCurly
'\'' -> lexCharacter
'"' -> lexString
_ -> fail ("Illegal character \'" ++ show c ++ "\'\n")
where lexIdents :: Lex a [String]
lexIdents = do
ident <- lexWhile isIdent
s <- getInput
case s of
'-':c:_ | isAlpha c -> do
discard 1
idents <- lexIdents
return $ ident : idents
'#':_ -> do
discard 1
return [ident ++ "#"]
_ -> return [ident]
lexPragmaStart :: Lex a Token
lexPragmaStart = do
lexWhile isSpace
pr <- lexWhile isAlphaNum
case lookup (map toLower pr) pragmas of
Just SPECIALISE -> do
s <- getInput
case dropWhile isSpace $ map toLower s of
'i':'n':'l':'i':'n':'e':_ -> do
lexWhile isSpace
discard 6
return $ SPECIALISE_INLINE True
'n':'o':'i':'n':'l':'i':'n':'e':_ -> do
lexWhile isSpace
discard 8
return $ SPECIALISE_INLINE False
'n':'o':'t':'i':'n':'l':'i':'n':'e':_ -> do
lexWhile isSpace
discard 9
return $ SPECIALISE_INLINE False
_ -> return SPECIALISE
Just (OPTIONS _) -> do
s <- getInput
case s of
'_':_ -> do
discard 1
com <- lexWhile isIdent
rest <- lexRawPragma
return $ OPTIONS (Just com, rest)
x:_ | isSpace x -> do
rest <- lexRawPragma
return $ OPTIONS (Nothing, rest)
_ -> fail "Malformed Options pragma"
Just (CFILES _) -> do
rest <- lexRawPragma
return $ CFILES rest
Just (INCLUDE _) -> do
rest <- lexRawPragma
return $ INCLUDE rest
Just p -> return p
_ -> do rawStr <- lexRawPragma
return $ PragmaUnknown (pr, rawStr)
lexRawPragma :: Lex a String
lexRawPragma = do
rpr <- lexRawPragmaAux
return $ dropWhile isSpace rpr
where lexRawPragmaAux = do
rpr <- lexWhile (/='#')
s <- getInput
case s of
'#':'-':'}':_ -> return rpr
_ -> do
discard 1
rpr' <- lexRawPragma
return $ rpr ++ '#':rpr'
lexDecimalOrFloat :: Lex a Token
lexDecimalOrFloat = do
ds <- lexWhile isDigit
rest <- getInput
case rest of
('.':d:_) | isDigit d -> do
discard 1
frac <- lexWhile isDigit
let num = parseInteger 10 (ds ++ frac)
decimals = toInteger (length frac)
exponent <- do
rest2 <- getInput
case rest2 of
'e':_ -> lexExponent
'E':_ -> lexExponent
_ -> return 0
return (FloatTok ((num%1) * 10^^(exponent decimals)))
e:_ | toLower e == 'e' -> do
exponent <- lexExponent
return (FloatTok ((parseInteger 10 ds%1) * 10^^exponent))
_ -> return (IntTok (parseInteger 10 ds))
where
lexExponent :: Lex a Integer
lexExponent = do
discard 1
r <- getInput
case r of
'+':d:_ | isDigit d -> do
discard 1
lexDecimal
'-':d:_ | isDigit d -> do
discard 1
n <- lexDecimal
return (negate n)
d:_ | isDigit d -> lexDecimal
_ -> fail "Float with missing exponent"
lexConIdOrQual :: String -> Lex a Token
lexConIdOrQual qual = do
con <- lexWhile isIdent
let conid | null qual = ConId con
| otherwise = QConId (qual,con)
qual' | null qual = con
| otherwise = qual ++ '.':con
just_a_conid <- alternative (return conid)
rest <- getInput
case rest of
'.':c:_
| isLower c || c == '_' -> do
discard 1
ident <- lexWhile isIdent
s <- getInput
ident' <- case s of
'#':_ -> discard 1 >> return (ident ++ "#")
_ -> return ident
case lookup ident' reserved_ids of
Just _ -> just_a_conid
Nothing -> return (QVarId (qual', ident'))
| isUpper c -> do
discard 1
lexConIdOrQual qual'
| isHSymbol c -> do
discard 1
sym <- lexWhile isHSymbol
case lookup sym reserved_ops of
Just _ -> just_a_conid
Nothing -> return $ case c of
':' -> QConSym (qual', sym)
_ -> QVarSym (qual', sym)
'#':c:_
| isSpace c -> do
discard 1
case conid of
ConId con -> return $ ConId $ con ++ "#"
QConId (q,con) -> return $ QConId (q,con ++ "#")
_ -> return conid
lexCharacter :: Lex a Token
lexCharacter = do
s <- getInput
case s of
'\'':_ -> discard 1 >> return THTyQuote
'\\':_ -> do
c <- lexEscape
matchQuote
return (Character c)
c:'\'':_ -> discard 2 >> return (Character c)
_ -> return THVarQuote
where matchQuote = matchChar '\'' "Improperly terminated character constant"
lexString :: Lex a Token
lexString = loop ""
where
loop s = do
r <- getInput
case r of
'\\':'&':_ -> do
discard 2
loop s
'\\':c:_ | isSpace c -> do
discard 1
lexWhiteChars
matchChar '\\' "Illegal character in string gap"
loop s
| otherwise -> do
ce <- lexEscape
loop (ce:s)
'"':_ -> do
discard 1
return (StringTok (reverse s))
c:_ -> do
discard 1
loop (c:s)
[] -> fail "Improperly terminated string"
lexWhiteChars :: Lex a ()
lexWhiteChars = do
s <- getInput
case s of
'\n':_ -> do
lexNewline
lexWhiteChars
'\t':_ -> do
lexTab
lexWhiteChars
c:_ | isSpace c -> do
discard 1
lexWhiteChars
_ -> return ()
lexEscape :: Lex a Char
lexEscape = do
discard 1
r <- getInput
case r of
'a':_ -> discard 1 >> return '\a'
'b':_ -> discard 1 >> return '\b'
'f':_ -> discard 1 >> return '\f'
'n':_ -> discard 1 >> return '\n'
'r':_ -> discard 1 >> return '\r'
't':_ -> discard 1 >> return '\t'
'v':_ -> discard 1 >> return '\v'
'\\':_ -> discard 1 >> return '\\'
'"':_ -> discard 1 >> return '\"'
'\'':_ -> discard 1 >> return '\''
'^':c:_ -> discard 2 >> cntrl c
'N':'U':'L':_ -> discard 3 >> return '\NUL'
'S':'O':'H':_ -> discard 3 >> return '\SOH'
'S':'T':'X':_ -> discard 3 >> return '\STX'
'E':'T':'X':_ -> discard 3 >> return '\ETX'
'E':'O':'T':_ -> discard 3 >> return '\EOT'
'E':'N':'Q':_ -> discard 3 >> return '\ENQ'
'A':'C':'K':_ -> discard 3 >> return '\ACK'
'B':'E':'L':_ -> discard 3 >> return '\BEL'
'B':'S':_ -> discard 2 >> return '\BS'
'H':'T':_ -> discard 2 >> return '\HT'
'L':'F':_ -> discard 2 >> return '\LF'
'V':'T':_ -> discard 2 >> return '\VT'
'F':'F':_ -> discard 2 >> return '\FF'
'C':'R':_ -> discard 2 >> return '\CR'
'S':'O':_ -> discard 2 >> return '\SO'
'S':'I':_ -> discard 2 >> return '\SI'
'D':'L':'E':_ -> discard 3 >> return '\DLE'
'D':'C':'1':_ -> discard 3 >> return '\DC1'
'D':'C':'2':_ -> discard 3 >> return '\DC2'
'D':'C':'3':_ -> discard 3 >> return '\DC3'
'D':'C':'4':_ -> discard 3 >> return '\DC4'
'N':'A':'K':_ -> discard 3 >> return '\NAK'
'S':'Y':'N':_ -> discard 3 >> return '\SYN'
'E':'T':'B':_ -> discard 3 >> return '\ETB'
'C':'A':'N':_ -> discard 3 >> return '\CAN'
'E':'M':_ -> discard 2 >> return '\EM'
'S':'U':'B':_ -> discard 3 >> return '\SUB'
'E':'S':'C':_ -> discard 3 >> return '\ESC'
'F':'S':_ -> discard 2 >> return '\FS'
'G':'S':_ -> discard 2 >> return '\GS'
'R':'S':_ -> discard 2 >> return '\RS'
'U':'S':_ -> discard 2 >> return '\US'
'S':'P':_ -> discard 2 >> return '\SP'
'D':'E':'L':_ -> discard 3 >> return '\DEL'
'o':c:_ | isOctDigit c -> do
discard 1
n <- lexOctal
checkChar n
'x':c:_ | isHexDigit c -> do
discard 1
n <- lexHexadecimal
checkChar n
c:_ | isDigit c -> do
n <- lexDecimal
checkChar n
_ -> fail "Illegal escape sequence"
where
checkChar n | n <= 0x01FFFF = return (chr (fromInteger n))
checkChar _ = fail "Character constant out of range"
cntrl :: Char -> Lex a Char
cntrl c | c >= '@' && c <= '_' = return (chr (ord c ord '@'))
cntrl _ = fail "Illegal control character"
lexOctal :: Lex a Integer
lexOctal = do
ds <- lexWhile isOctDigit
return (parseInteger 8 ds)
lexHexadecimal :: Lex a Integer
lexHexadecimal = do
ds <- lexWhile isHexDigit
return (parseInteger 16 ds)
lexDecimal :: Lex a Integer
lexDecimal = do
ds <- lexWhile isDigit
return (parseInteger 10 ds)
parseInteger :: Integer -> String -> Integer
parseInteger radix ds =
foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds)