module CHSLexer (CHSToken(..), lexCHS)
where
import Data.List ((\\))
import Data.Char (isDigit)
import Control.Monad (liftM)
import Numeric (readDec, readOct, readHex)
import Position (Position(..), Pos(posOf), incPos, retPos, tabPos)
import Errors (ErrorLvl(..), Error, makeError)
import UNames (NameSupply, Name, names)
import Idents (Ident, lexemeToIdent, identToLexeme)
import Lexers (Regexp, Lexer, Action, epsilon, char, (+>), lexaction,
lexactionErr, lexmeta, (>|<), (>||<), ctrlLexer, star, plus,
quest, alt, string, LexerState, execLexer)
import C2HSState (CST, raise, raiseError, nop, getNameSupply)
data CHSToken = CHSTokArrow Position
| CHSTokDArrow Position
| CHSTokDot Position
| CHSTokComma Position
| CHSTokEqual Position
| CHSTokMinus Position
| CHSTokStar Position
| CHSTokAmp Position
| CHSTokHat Position
| CHSTokLBrace Position
| CHSTokRBrace Position
| CHSTokLParen Position
| CHSTokRParen Position
| CHSTokEndHook Position
| CHSTokAs Position
| CHSTokCall Position
| CHSTokClass Position
| CHSTokContext Position
| CHSTokDerive Position
| CHSTokEnum Position
| CHSTokForeign Position
| CHSTokFun Position
| CHSTokGet Position
| CHSTokImport Position
| CHSTokLib Position
| CHSTokNewtype Position
| CHSTokPointer Position
| CHSTokPrefix Position
| CHSTokPure Position
| CHSTokQualif Position
| CHSTokSet Position
| CHSTokSizeof Position
| CHSTokStable Position
| CHSTokType Position
| CHSTok_2Case Position
| CHSTokUnsafe Position
| CHSTokWith Position
| CHSTokLock Position
| CHSTokNolock Position
| CHSTokString Position String
| CHSTokHSVerb Position String
| CHSTokIdent Position Ident
| CHSTokHaskell Position String
| CHSTokCPP Position String
| CHSTokLine Position
| CHSTokC Position String
| CHSTokCtrl Position Char
| CHSTokPragma Position
| CHSTokPragEnd Position
instance Pos CHSToken where
posOf (CHSTokArrow pos ) = pos
posOf (CHSTokDArrow pos ) = pos
posOf (CHSTokDot pos ) = pos
posOf (CHSTokComma pos ) = pos
posOf (CHSTokEqual pos ) = pos
posOf (CHSTokMinus pos ) = pos
posOf (CHSTokStar pos ) = pos
posOf (CHSTokAmp pos ) = pos
posOf (CHSTokHat pos ) = pos
posOf (CHSTokLBrace pos ) = pos
posOf (CHSTokRBrace pos ) = pos
posOf (CHSTokLParen pos ) = pos
posOf (CHSTokRParen pos ) = pos
posOf (CHSTokEndHook pos ) = pos
posOf (CHSTokAs pos ) = pos
posOf (CHSTokCall pos ) = pos
posOf (CHSTokClass pos ) = pos
posOf (CHSTokContext pos ) = pos
posOf (CHSTokDerive pos ) = pos
posOf (CHSTokEnum pos ) = pos
posOf (CHSTokForeign pos ) = pos
posOf (CHSTokFun pos ) = pos
posOf (CHSTokGet pos ) = pos
posOf (CHSTokImport pos ) = pos
posOf (CHSTokLib pos ) = pos
posOf (CHSTokNewtype pos ) = pos
posOf (CHSTokPointer pos ) = pos
posOf (CHSTokPrefix pos ) = pos
posOf (CHSTokPure pos ) = pos
posOf (CHSTokQualif pos ) = pos
posOf (CHSTokSet pos ) = pos
posOf (CHSTokSizeof pos ) = pos
posOf (CHSTokStable pos ) = pos
posOf (CHSTokType pos ) = pos
posOf (CHSTok_2Case pos ) = pos
posOf (CHSTokUnsafe pos ) = pos
posOf (CHSTokWith pos ) = pos
posOf (CHSTokLock pos ) = pos
posOf (CHSTokNolock pos ) = pos
posOf (CHSTokString pos _) = pos
posOf (CHSTokHSVerb pos _) = pos
posOf (CHSTokIdent pos _) = pos
posOf (CHSTokHaskell pos _) = pos
posOf (CHSTokCPP pos _) = pos
posOf (CHSTokC pos _) = pos
posOf (CHSTokCtrl pos _) = pos
posOf (CHSTokPragma pos ) = pos
posOf (CHSTokPragEnd pos ) = pos
instance Eq CHSToken where
(CHSTokArrow _ ) == (CHSTokArrow _ ) = True
(CHSTokDArrow _ ) == (CHSTokDArrow _ ) = True
(CHSTokDot _ ) == (CHSTokDot _ ) = True
(CHSTokComma _ ) == (CHSTokComma _ ) = True
(CHSTokEqual _ ) == (CHSTokEqual _ ) = True
(CHSTokMinus _ ) == (CHSTokMinus _ ) = True
(CHSTokStar _ ) == (CHSTokStar _ ) = True
(CHSTokAmp _ ) == (CHSTokAmp _ ) = True
(CHSTokHat _ ) == (CHSTokHat _ ) = True
(CHSTokLBrace _ ) == (CHSTokLBrace _ ) = True
(CHSTokRBrace _ ) == (CHSTokRBrace _ ) = True
(CHSTokLParen _ ) == (CHSTokLParen _ ) = True
(CHSTokRParen _ ) == (CHSTokRParen _ ) = True
(CHSTokEndHook _ ) == (CHSTokEndHook _ ) = True
(CHSTokAs _ ) == (CHSTokAs _ ) = True
(CHSTokCall _ ) == (CHSTokCall _ ) = True
(CHSTokClass _ ) == (CHSTokClass _ ) = True
(CHSTokContext _ ) == (CHSTokContext _ ) = True
(CHSTokDerive _ ) == (CHSTokDerive _ ) = True
(CHSTokEnum _ ) == (CHSTokEnum _ ) = True
(CHSTokForeign _ ) == (CHSTokForeign _ ) = True
(CHSTokFun _ ) == (CHSTokFun _ ) = True
(CHSTokGet _ ) == (CHSTokGet _ ) = True
(CHSTokImport _ ) == (CHSTokImport _ ) = True
(CHSTokLib _ ) == (CHSTokLib _ ) = True
(CHSTokNewtype _ ) == (CHSTokNewtype _ ) = True
(CHSTokPointer _ ) == (CHSTokPointer _ ) = True
(CHSTokPrefix _ ) == (CHSTokPrefix _ ) = True
(CHSTokPure _ ) == (CHSTokPure _ ) = True
(CHSTokQualif _ ) == (CHSTokQualif _ ) = True
(CHSTokSet _ ) == (CHSTokSet _ ) = True
(CHSTokSizeof _ ) == (CHSTokSizeof _ ) = True
(CHSTokStable _ ) == (CHSTokStable _ ) = True
(CHSTokType _ ) == (CHSTokType _ ) = True
(CHSTok_2Case _ ) == (CHSTok_2Case _ ) = True
(CHSTokUnsafe _ ) == (CHSTokUnsafe _ ) = True
(CHSTokWith _ ) == (CHSTokWith _ ) = True
(CHSTokLock _ ) == (CHSTokLock _ ) = True
(CHSTokNolock _ ) == (CHSTokNolock _ ) = True
(CHSTokString _ _) == (CHSTokString _ _) = True
(CHSTokHSVerb _ _) == (CHSTokHSVerb _ _) = True
(CHSTokIdent _ _) == (CHSTokIdent _ _) = True
(CHSTokHaskell _ _) == (CHSTokHaskell _ _) = True
(CHSTokCPP _ _) == (CHSTokCPP _ _) = True
(CHSTokC _ _) == (CHSTokC _ _) = True
(CHSTokCtrl _ _) == (CHSTokCtrl _ _) = True
(CHSTokPragma _ ) == (CHSTokPragma _ ) = True
(CHSTokPragEnd _ ) == (CHSTokPragEnd _ ) = True
_ == _ = False
instance Show CHSToken where
showsPrec _ (CHSTokArrow _ ) = showString "->"
showsPrec _ (CHSTokDArrow _ ) = showString "=>"
showsPrec _ (CHSTokDot _ ) = showString "."
showsPrec _ (CHSTokComma _ ) = showString ","
showsPrec _ (CHSTokEqual _ ) = showString "="
showsPrec _ (CHSTokMinus _ ) = showString "-"
showsPrec _ (CHSTokStar _ ) = showString "*"
showsPrec _ (CHSTokAmp _ ) = showString "&"
showsPrec _ (CHSTokHat _ ) = showString "^"
showsPrec _ (CHSTokLBrace _ ) = showString "{"
showsPrec _ (CHSTokRBrace _ ) = showString "}"
showsPrec _ (CHSTokLParen _ ) = showString "("
showsPrec _ (CHSTokRParen _ ) = showString ")"
showsPrec _ (CHSTokEndHook _ ) = showString "#}"
showsPrec _ (CHSTokAs _ ) = showString "as"
showsPrec _ (CHSTokCall _ ) = showString "call"
showsPrec _ (CHSTokClass _ ) = showString "class"
showsPrec _ (CHSTokContext _ ) = showString "context"
showsPrec _ (CHSTokDerive _ ) = showString "deriving"
showsPrec _ (CHSTokEnum _ ) = showString "enum"
showsPrec _ (CHSTokForeign _ ) = showString "foreign"
showsPrec _ (CHSTokFun _ ) = showString "fun"
showsPrec _ (CHSTokGet _ ) = showString "get"
showsPrec _ (CHSTokImport _ ) = showString "import"
showsPrec _ (CHSTokLib _ ) = showString "lib"
showsPrec _ (CHSTokNewtype _ ) = showString "newtype"
showsPrec _ (CHSTokPointer _ ) = showString "pointer"
showsPrec _ (CHSTokPrefix _ ) = showString "prefix"
showsPrec _ (CHSTokPure _ ) = showString "pure"
showsPrec _ (CHSTokQualif _ ) = showString "qualified"
showsPrec _ (CHSTokSet _ ) = showString "set"
showsPrec _ (CHSTokSizeof _ ) = showString "sizeof"
showsPrec _ (CHSTokStable _ ) = showString "stable"
showsPrec _ (CHSTokType _ ) = showString "type"
showsPrec _ (CHSTok_2Case _ ) = showString "underscoreToCase"
showsPrec _ (CHSTokUnsafe _ ) = showString "unsafe"
showsPrec _ (CHSTokWith _ ) = showString "with"
showsPrec _ (CHSTokLock _ ) = showString "lock"
showsPrec _ (CHSTokNolock _ ) = showString "nolock"
showsPrec _ (CHSTokString _ s) = showString ("\"" ++ s ++ "\"")
showsPrec _ (CHSTokHSVerb _ s) = showString ("`" ++ s ++ "'")
showsPrec _ (CHSTokIdent _ i) = (showString . identToLexeme) i
showsPrec _ (CHSTokHaskell _ s) = showString s
showsPrec _ (CHSTokCPP _ s) = showString s
showsPrec _ (CHSTokC _ s) = showString s
showsPrec _ (CHSTokCtrl _ c) = showChar c
showsPrec _ (CHSTokPragma _ ) = showString "{-# LANGUAGE"
showsPrec _ (CHSTokPragEnd _ ) = showString "#-}"
data CHSLexerState = CHSLS {
nestLvl :: Int,
inHook :: Bool,
namesup :: [Name]
}
initialState :: CST s CHSLexerState
initialState = do
namesup <- liftM names getNameSupply
return $ CHSLS {
nestLvl = 0,
inHook = False,
namesup = namesup
}
assertFinalState :: Position -> CHSLexerState -> CST s ()
assertFinalState pos CHSLS {nestLvl = nestLvl, inHook = inHook}
| nestLvl > 0 = raiseError pos ["Unexpected end of file!",
"Unclosed nested comment."]
| inHook = raiseError pos ["Unexpected end of file!",
"Unclosed binding hook."]
| otherwise = nop
type CHSLexer = Lexer CHSLexerState CHSToken
type CHSAction = Action CHSToken
type CHSRegexp = Regexp CHSLexerState CHSToken
infixl 3 `lexactionName`
lexactionName :: CHSRegexp
-> (String -> Position -> Name -> CHSToken)
-> CHSLexer
re `lexactionName` action = re `lexmeta` action'
where
action' str pos state = let name:ns = namesup state
in
(Just $ Right (action str pos name),
incPos pos (length str),
state {namesup = ns},
Nothing)
chslexer :: CHSLexer
chslexer = pragma
>||< haskell
>||< nested
>||< ctrl
>||< hook
>||< cpp
pragma :: CHSLexer
pragma = string "{-# LANGUAGE" `lexmeta` \_ pos s ->
(Just $ Right (CHSTokPragma pos), incPos pos 12, s, Just langLexer)
langLexer :: CHSLexer
langLexer = whitespace >||< identOrKW >||< symbol >||<
(string "#-}" `lexmeta` \_ pos s ->
(Just $ Right (CHSTokPragEnd pos), incPos pos 3, s, Just chslexer))
haskell :: CHSLexer
haskell = ( anyButSpecial`star` epsilon
>|< specialButQuotes
>|< char '"' +> inhstr`star` char '"'
>|< string "'\"'"
>|< string "--" +> anyButNL`star` epsilon
)
`lexaction` copyVerbatim
>||< char '"'
`lexactionErr`
\_ pos -> (Left $ makeError ErrorErr pos
["Lexical error!",
"Unclosed string."])
where
anyButSpecial = alt (inlineSet \\ specialSet)
specialButQuotes = alt (specialSet \\ ['"'])
anyButNL = alt (anySet \\ ['\n'])
inhstr = instr >|< char '\\' >|< string "\\\"" >|< gap
gap = char '\\' +> alt (' ':ctrlSet)`plus` char '\\'
copyVerbatim :: CHSAction
copyVerbatim cs pos = Just $ CHSTokHaskell pos cs
nested :: CHSLexer
nested =
string "{-"
`lexmeta` enterComment
>||<
string "-}"
`lexmeta` leaveComment
where
enterComment cs pos s =
(copyVerbatim' cs pos,
incPos pos 2,
s {nestLvl = nestLvl s + 1},
Just $ inNestedComment)
leaveComment cs pos s =
case nestLvl s of
0 -> (commentCloseErr pos,
incPos pos 2,
s,
Nothing)
1 -> (copyVerbatim' cs pos,
incPos pos 2,
s {nestLvl = nestLvl s - 1},
Just chslexer)
_ -> (copyVerbatim' cs pos,
incPos pos 2,
s {nestLvl = nestLvl s - 1},
Nothing)
copyVerbatim' cs pos = Just $ Right (CHSTokHaskell pos cs)
commentCloseErr pos =
Just $ Left (makeError ErrorErr pos
["Lexical error!",
"`-}' not preceded by a matching `{-'."])
inNestedComment :: CHSLexer
inNestedComment = commentInterior
>||< nested
>||< ctrl
commentInterior :: CHSLexer
commentInterior = ( anyButSpecial`star` epsilon
>|< special
)
`lexaction` copyVerbatim
where
anyButSpecial = alt (inlineSet \\ commentSpecialSet)
special = alt commentSpecialSet
ctrl :: CHSLexer
ctrl =
char '\n' `lexmeta` newline
>||< char '\r' `lexmeta` newline
>||< char '\v' `lexmeta` newline
>||< char '\f' `lexmeta` formfeed
>||< char '\t' `lexmeta` tab
where
newline [c] pos = ctrlResult pos c (retPos pos)
formfeed [c] pos = ctrlResult pos c (incPos pos 1)
tab [c] pos = ctrlResult pos c (tabPos pos)
ctrlResult pos c pos' s =
(Just $ Right (CHSTokCtrl pos c), pos', s, Nothing)
hook :: CHSLexer
hook = string "{#"
`lexmeta` \_ pos s -> (Nothing, incPos pos 2, s, Just bhLexer)
cpp :: CHSLexer
cpp = directive
where
directive =
string "\n#" +> alt ('\t':inlineSet)`star` epsilon
`lexmeta`
\(_:_:dir) pos s ->
case dir of
['c'] ->
(Nothing, retPos pos, s, Just cLexer)
'c':sp:_ | sp `elem` " \t" ->
(Nothing, retPos pos, s, Just cLexer)
' ':line@(n:_) | isDigit n ->
let pos' = adjustPosByCLinePragma line pos
in (Just $ Right (CHSTokLine pos'), pos', s, Nothing)
_ ->
(Just $ Right (CHSTokCPP pos dir),
retPos pos, s, Nothing)
adjustPosByCLinePragma :: String -> Position -> Position
adjustPosByCLinePragma str (Position fname _ _) =
(Position fname' row' 0)
where
str' = dropWhite str
(rowStr, str'') = span isDigit str'
row' = read rowStr
str''' = dropWhite str''
fnameStr = takeWhile (/= '"') . drop 1 $ str'''
fname' | null str''' || head str''' /= '"' = fname
| fnameStr == fname = fname
| otherwise = fnameStr
dropWhite = dropWhile (\c -> c == ' ' || c == '\t')
bhLexer :: CHSLexer
bhLexer = identOrKW
>||< symbol
>||< strlit
>||< hsverb
>||< whitespace
>||< endOfHook
>||< string "--" +> anyButNL`star` char '\n'
`lexmeta` \_ pos s -> (Nothing, retPos pos, s, Nothing)
where
anyButNL = alt (anySet \\ ['\n'])
endOfHook = string "#}"
`lexmeta`
\_ pos s -> (Just $ Right (CHSTokEndHook pos),
incPos pos 2, s, Just chslexer)
cLexer :: CHSLexer
cLexer = inlineC
>||< ctrl
>||< string "\n#endc"
`lexmeta`
\_ pos s -> (Just $ Right (CHSTokCtrl pos '\n'), retPos pos, s,
Just chslexer)
where
inlineC = alt inlineSet `lexaction` copyVerbatimC
copyVerbatimC :: CHSAction
copyVerbatimC cs pos = Just $ CHSTokC pos cs
whitespace :: CHSLexer
whitespace = (char ' ' `lexaction` \_ _ -> Nothing)
>||< ctrlLexer
identOrKW :: CHSLexer
identOrKW =
(letter +> (letter >|< digit >|< char '\'')`star` epsilon
`lexactionName` \cs pos name -> (idkwtok $!pos) cs name)
>||<
(char '\'' +> letter +> (letter >|< digit)`star` char '\''
`lexactionName` \cs pos name -> (mkid $!pos) cs name)
where
idkwtok pos "as" _ = CHSTokAs pos
idkwtok pos "call" _ = CHSTokCall pos
idkwtok pos "class" _ = CHSTokClass pos
idkwtok pos "context" _ = CHSTokContext pos
idkwtok pos "deriving" _ = CHSTokDerive pos
idkwtok pos "enum" _ = CHSTokEnum pos
idkwtok pos "foreign" _ = CHSTokForeign pos
idkwtok pos "fun" _ = CHSTokFun pos
idkwtok pos "get" _ = CHSTokGet pos
idkwtok pos "import" _ = CHSTokImport pos
idkwtok pos "lib" _ = CHSTokLib pos
idkwtok pos "newtype" _ = CHSTokNewtype pos
idkwtok pos "pointer" _ = CHSTokPointer pos
idkwtok pos "prefix" _ = CHSTokPrefix pos
idkwtok pos "pure" _ = CHSTokPure pos
idkwtok pos "qualified" _ = CHSTokQualif pos
idkwtok pos "set" _ = CHSTokSet pos
idkwtok pos "sizeof" _ = CHSTokSizeof pos
idkwtok pos "stable" _ = CHSTokStable pos
idkwtok pos "type" _ = CHSTokType pos
idkwtok pos "underscoreToCase" _ = CHSTok_2Case pos
idkwtok pos "unsafe" _ = CHSTokUnsafe pos
idkwtok pos "with" _ = CHSTokWith pos
idkwtok pos "lock" _ = CHSTokLock pos
idkwtok pos "nolock" _ = CHSTokNolock pos
idkwtok pos cs name = mkid pos cs name
mkid pos cs name = CHSTokIdent pos (lexemeToIdent pos cs name)
symbol :: CHSLexer
symbol = sym "->" CHSTokArrow
>||< sym "=>" CHSTokDArrow
>||< sym "." CHSTokDot
>||< sym "," CHSTokComma
>||< sym "=" CHSTokEqual
>||< sym "-" CHSTokMinus
>||< sym "*" CHSTokStar
>||< sym "&" CHSTokAmp
>||< sym "^" CHSTokHat
>||< sym "{" CHSTokLBrace
>||< sym "}" CHSTokRBrace
>||< sym "(" CHSTokLParen
>||< sym ")" CHSTokRParen
where
sym cs con = string cs `lexaction` \_ pos -> Just (con pos)
strlit :: CHSLexer
strlit = char '"' +> (instr >|< char '\\')`star` char '"'
`lexaction` \cs pos -> Just (CHSTokString pos (init . tail $ cs))
hsverb :: CHSLexer
hsverb = char '`' +> inhsverb`star` char '\''
`lexaction` \cs pos -> Just (CHSTokHSVerb pos (init . tail $ cs))
letter, digit, instr, inchar, inhsverb :: Regexp s t
letter = alt ['a'..'z'] >|< alt ['A'..'Z'] >|< char '_'
digit = alt ['0'..'9']
instr = alt ([' '..'\127'] \\ "\"\\")
inchar = alt ([' '..'\127'] \\ "\'")
inhsverb = alt ([' '..'\127'] \\ "\'")
anySet, inlineSet, specialSet, commentSpecialSet, ctrlSet :: [Char]
anySet = ['\0'..'\255']
inlineSet = anySet \\ ctrlSet
specialSet = ['{', '-', '"', '\'']
commentSpecialSet = ['{', '-']
ctrlSet = ['\n', '\f', '\r', '\t', '\v']
lexCHS :: String -> Position -> CST s [CHSToken]
lexCHS cs pos =
do
state <- initialState
let (ts, lstate, errs) = execLexer chslexer (cs, pos, state)
(_, pos', state') = lstate
mapM raise errs
assertFinalState pos' state'
return ts