module Language.Haskell.Preprocessor.Lexer (
scan, dscan, module T
) where
import Char
import Language.Haskell.Preprocessor.Token as T
import Language.Haskell.Preprocessor.Loc
import qualified Language.Haskell.Preprocessor.SynSpec as SS
scan :: SS.SynSpec -> String -> String -> [Token]
scan spec file = pass3 spec . pass2 spec . pass1 spec file
dscan :: String -> String -> [Token]
dscan = scan SS.defaultSpec
lexerr :: Loc -> String -> [Token]
lexerr loc msg = [newToken { tag = Error, T.loc = loc, val = msg }]
pass1 :: SS.SynSpec -> String -> String -> [Token]
pass1 spec file str = bol (initial file) str where
unboxed = SS.unboxed spec
pragmas = SS.pragmas spec
varinit c = isLower c || c == '_'
coninit c = isUpper c
opinit c = c `elem` ":!$%&*+./<=>?@\\^|-~"
varcont c = isAlphaNum c || c `elem` "'_"
concont = varcont
opcont c = opinit c || c == '#'
rawnext tag loc inp val = loc' `seq` new : token loc' inp where
new = newToken { T.tag = tag, T.loc = loc, T.val = val }
loc' = advance loc val
next tag loc inp val = rawnext tag loc inp val
rnext tag loc inp val = next tag loc inp (reverse val)
bol loc inp = case inp of
'#':r -> let (acc, inp) = eatWhile (/= '\n') r "#"
val = reverse acc in
case fromDirective val of
Nothing -> next CPragma loc inp val
Just loc ->
case inp of
'\n':r -> bol loc r
_ -> []
c:inp | isSpace c -> bol (advance loc c) inp
_ -> token loc inp
token loc inp = case inp of
'(':'#':r | unboxed -> next Other loc r "(#"
'#':')':r | unboxed -> next Other loc r "#)"
'{':'-':'#':r | pragmas -> next Other loc r "{-#"
'#':'-':'}':r | pragmas -> next Other loc r "#-}"
'{':'-':r -> lexComment loc r "-{" 1
'(':r -> next Other loc r "("
')':r -> next Other loc r ")"
'[':r -> next Other loc r "["
']':r -> next Other loc r "]"
'{':r -> next Other loc r "{"
'}':r -> next Other loc r "}"
';':r -> next Other loc r ";"
',':r -> next Other loc r ","
'`':r -> next Other loc r "`"
'\'':r -> lexChar loc r "'"
'"':r -> lexString loc r "\""
'-':'-':r -> lexHyphen loc r "--"
'\n':r -> bol (advance loc '\n') r
c:r | isSpace c -> token (advance loc c) r
| coninit c -> lexCon loc r [c]
| varinit c -> lexVar loc r [c]
| opinit c -> lexOp loc r [c]
| isDigit c -> lexNumber loc r [c]
c:_ -> lexerr loc ("unexpected character " ++ show c)
[] -> []
lexChar loc inp acc = case inp of
'\'':'#':r
| unboxed -> rnext CharLit loc r ('#':'\'':acc)
'\'':r -> rnext CharLit loc r ('\'':acc)
'\\':'\'':r -> lexChar loc r ('\'':'\\':acc)
'\\':'\\':r -> lexChar loc r ('\\':'\\':acc)
c:r -> lexChar loc r (c:acc)
[] -> lexerr loc "eof in character literal"
lexString loc inp acc = case inp of
'"':'#':r
| unboxed -> rnext StringLit loc r ('#':'"':acc)
'"':r -> rnext StringLit loc r ('"':acc)
'\\':'"':r -> lexString loc r ('"':'\\':acc)
'\\':'\\':r -> lexString loc r ('\\':'\\':acc)
c:r -> lexString loc r (c:acc)
[] -> lexerr loc "eof in string literal"
lexNumber loc inp acc = case inp of
c:r | isDigit c -> lexNumber loc r (c:acc)
| c == '.' && not (null r) && isDigit (head r)
-> lexFloat loc r (c:acc)
| c `elem` "eE" -> lexMaybeExp IntLit loc inp acc
'#':r | unboxed -> rnext IntLit loc r ('#':acc)
_ -> rnext IntLit loc inp acc
lexFloat loc inp acc = case eatWhile isDigit inp acc of
(acc, r@(c:_)) | c `elem` "eE" -> lexMaybeExp FloatLit loc r acc
(acc, '#':r) | unboxed -> rnext FloatLit loc r ('#':acc)
(acc, r) -> rnext FloatLit loc r acc
lexMaybeExp tag loc inp acc = case inp of
e:'-':d:r | isDigit d -> lexExp loc r (d:'-':e:acc)
e:'+':d:r | isDigit d -> lexExp loc r (d:'+':e:acc)
e:d:r | isDigit d -> lexExp loc r (d:e:acc)
_ -> rnext tag loc inp acc
lexExp loc inp acc = case eatWhile isDigit inp acc of
(acc, '#':r) -> rnext FloatLit loc r ('#':acc)
(acc, r) -> rnext FloatLit loc r acc
lexVar loc inp acc = case inp of
c:r | varcont c -> lexVar loc r (c:acc)
'#':r | unboxed -> rnext Variable loc r ('#':acc)
_ -> rnext Variable loc inp acc
lexOp loc inp acc = case inp of
c:r | opcont c -> lexOp loc r (c:acc)
_ -> rnext Operator loc inp acc
lexCon loc inp acc = case inp of
c:r | concont c -> lexVar loc r (c:acc)
'#':r | unboxed -> rnext Constructor loc r ('#':acc)
'.':r -> case r of
c:s | coninit c -> lexCon loc s (c:'.':acc)
| varinit c -> lexVar loc s (c:'.':acc)
| opcont c -> lexOp loc s (c:'.':acc)
_ -> rnext Constructor loc inp acc
_ -> rnext Constructor loc inp acc
lexHyphen loc inp acc = case inp of
'-':r -> lexHyphen loc r ('-':acc)
c:r | opcont c -> lexOp loc r (c:acc)
_ -> case eatWhile (/= '\n') inp acc of
(acc, r) -> rnext Comment loc r acc
lexComment :: Loc -> String -> String -> Int -> [Token]
lexComment loc inp acc 0 = rnext Comment loc inp acc
lexComment loc inp acc n = case inp of
'-':'}':r -> lexComment loc r ('}':'-':acc) (n 1)
'{':'-':r -> lexComment loc r ('-':'{':acc) (n + 1)
c:r -> lexComment loc r (c:acc) n
[] -> lexerr loc "eof in comment"
eatWhile pred inp acc = case inp of
c:r | pred c -> eatWhile pred r (c:acc)
_ -> (acc, inp)
pass2 :: SS.SynSpec -> [Token] -> [Token]
pass2 _ = loop [] where
loop acc toks = case toks of
t@Token { tag = Comment } : rest
-> loop (t:acc) rest
t : rest
-> t { com = reverse acc } : loop [] rest
[] -> case acc of
[] -> []
t:_ -> [newToken { tag = Other,
com = reverse acc,
loc = loc t `advance` '\n'} ]
pass3 :: SS.SynSpec -> [Token] -> [Token]
pass3 spec = loop [] where
levelnest = SS.levelnest spec
indents = [ s | block <- SS.blocks spec, SS.I s <- block ]
indent loc = Token VIndent [] loc ""
semi loc = Token VSemi [] loc ""
dedent loc = Token VDedent [] loc ""
loop stk toks = case toks of
[] -> []
t:ts
| val t `elem` indents ->
case ts of
[] -> t : indent _loc : dedent _loc : check _loc stk []
where _loc = loc t
t':_
| val t' == "{" ->
t : check (loc t') stk ts
| otherwise ->
t : indent (loc t') : begin (loc t') stk ts
| otherwise -> t : check (loc t) stk ts
begin _loc stk toks = case stk of
[] -> loop [n] toks
m:_ -> case n `compare` m of
GT -> loop (n:stk) toks
EQ | levelnest
-> loop (n:stk) toks
_ -> dedent _loc : check _loc stk toks
where n = col _loc
check _loc stk toks = case (stk, toks) of
(ms, []) -> map (const (dedent _loc)) ms
(m:ms, t:ts) ->
case col (loc t) `compare` m of
LT -> dedent (loc t) : check (loc t) ms (t:ts)
EQ -> semi (loc t) : loop (m:ms) (t:ts)
_ -> loop (m:ms) (t:ts)
([], ts) -> loop [] ts