{-# OPTIONS_HADDOCK hide #-} ----------------------------------------------------------------------------- -- | -- Module : Language.Haskell.Exts.Annotated.InternalLexer -- Copyright : (c) The GHC Team, 1997-2000 -- (c) Niklas Broberg, 2004-2009 -- License : BSD-style (see the file LICENSE.txt) -- -- Maintainer : Niklas Broberg, d00nibro@chalmers.se -- Stability : stable -- Portability : portable -- -- Lexer for Haskell, with some extensions. -- ----------------------------------------------------------------------------- -- ToDo: Introduce different tokens for decimal, octal and hexadecimal (?) -- ToDo: FloatTok should have three parts (integer part, fraction, exponent) (?) -- ToDo: Use a lexical analyser generator (lx?) module Language.Haskell.Exts.InternalLexer (Token(..), showToken, lexer, topLexer) where import Language.Haskell.Exts.ParseMonad import Language.Haskell.Exts.SrcLoc import Language.Haskell.Exts.Comments import Language.Haskell.Exts.Extension import Language.Haskell.Exts.ExtScheme import Data.Char import Data.Ratio import Data.List (intersperse) import Control.Monad (when) -- import Debug.Trace (trace) data Token = VarId String | QVarId (String,String) | IDupVarId (String) -- duplicable implicit parameter | ILinVarId (String) -- linear implicit parameter | ConId String | QConId (String,String) | DVarId [String] -- to enable varid's with '-' in them | VarSym String | ConSym String | QVarSym (String,String) | QConSym (String,String) | IntTok (Integer, String) | FloatTok (Rational, String) | Character (Char, String) | StringTok (String, String) | IntTokHash (Integer, String) -- 1# | WordTokHash (Integer, String) -- 1## | FloatTokHash (Rational, String) -- 1.0# | DoubleTokHash (Rational, String) -- 1.0## | CharacterHash (Char, String) -- c# | StringHash (String, String) -- "Hello world!"# -- Symbols | LeftParen | RightParen | LeftHashParen | RightHashParen | SemiColon | LeftCurly | RightCurly | VRightCurly -- a virtual close brace | LeftSquare | RightSquare | Comma | Underscore | BackQuote -- Reserved operators | Dot -- reserved for use with 'forall x . x' | DotDot | Colon | DoubleColon | Equals | Backslash | Bar | LeftArrow | RightArrow | At | Tilde | DoubleArrow | Minus | Exclamation | Star | LeftArrowTail -- >- | RightArrowTail -- -< | LeftDblArrowTail -- >>- | RightDblArrowTail -- -<< -- Template Haskell | THExpQuote -- [| or [e| | THPatQuote -- [p| | THDecQuote -- [d| | THTypQuote -- [t| | THCloseQuote -- |] | THIdEscape (String) -- dollar x | THParenEscape -- dollar ( | THVarQuote -- 'x (but without the x) | THTyQuote -- ''T (but without the T) | THQuasiQuote (String,String) -- [$...|...] -- HaRP | RPGuardOpen -- (| | RPGuardClose -- |) | RPCAt -- @: -- Hsx | XCodeTagOpen -- <% | XCodeTagClose -- %> | XStdTagOpen -- < | XStdTagClose -- > | XCloseTagOpen -- | XChildTagOpen -- <%> (note that close doesn't exist, it's XCloseTagOpen followed by XCodeTagClose) | XPCDATA String | XRPatOpen -- <[ | XRPatClose -- ]> -- Pragmas | PragmaEnd -- #-} | RULES | INLINE Bool | INLINE_CONLIKE | SPECIALISE | SPECIALISE_INLINE Bool | SOURCE | DEPRECATED | WARNING | SCC | GENERATED | CORE | UNPACK | OPTIONS (Maybe String,String) -- | CFILES String -- | INCLUDE String | LANGUAGE | ANN -- Reserved Ids | KW_As | KW_By -- transform list comprehensions | KW_Case | KW_Class | KW_Data | KW_Default | KW_Deriving | KW_Do | KW_MDo | KW_Else | KW_Family -- indexed type families | KW_Forall -- universal/existential types | KW_Group -- transform list comprehensions | KW_Hiding | KW_If | KW_Import | KW_In | KW_Infix | KW_InfixL | KW_InfixR | KW_Instance | KW_Let | KW_Module | KW_NewType | KW_Of | KW_Proc -- arrows | KW_Rec -- arrows | KW_Then | KW_Type | KW_Using -- transform list comprehensions | KW_Where | KW_Qualified -- FFI | KW_Foreign | KW_Export | KW_Safe | KW_Unsafe | KW_Threadsafe | KW_Interruptible | KW_StdCall | KW_CCall | KW_CPlusPlus | KW_DotNet | KW_Jvm | KW_Js | KW_CApi | EOF deriving (Eq,Show) reserved_ops :: [(String,(Token, Maybe ExtScheme))] reserved_ops = [ ( "..", (DotDot, Nothing) ), ( ":", (Colon, Nothing) ), ( "::", (DoubleColon, Nothing) ), ( "=", (Equals, Nothing) ), ( "\\", (Backslash, Nothing) ), ( "|", (Bar, Nothing) ), ( "<-", (LeftArrow, Nothing) ), ( "->", (RightArrow, Nothing) ), ( "@", (At, Nothing) ), ( "@:", (RPCAt, Just (Any [RegularPatterns])) ), ( "~", (Tilde, Nothing) ), ( "=>", (DoubleArrow, Nothing) ), ( "*", (Star, Just (Any [KindSignatures])) ), -- Arrows notation ( "-<", (LeftArrowTail, Just (Any [Arrows])) ), ( ">-", (RightArrowTail, Just (Any [Arrows])) ), ( "-<<", (LeftDblArrowTail, Just (Any [Arrows])) ), ( ">>-", (RightDblArrowTail, Just (Any [Arrows])) ), -- Unicode notation ( "\x2190", (LeftArrow, Just (Any [UnicodeSyntax])) ), ( "\x2192", (RightArrow, Just (Any [UnicodeSyntax])) ), ( "\x21d2", (DoubleArrow, Just (Any [UnicodeSyntax])) ), ( "\x2237", (DoubleColon, Just (Any [UnicodeSyntax])) ), ( "\x2919", (LeftArrowTail, Just (All [UnicodeSyntax, Arrows])) ), ( "\x291a", (RightArrowTail, Just (All [UnicodeSyntax, Arrows])) ), ( "\x291b", (LeftDblArrowTail, Just (All [UnicodeSyntax, Arrows])) ), ( "\x291c", (RightDblArrowTail, Just (All [UnicodeSyntax, Arrows])) ), ( "\x2605", (Star, Just (All [UnicodeSyntax, KindSignatures])) ), ( "\x2200", (KW_Forall, Just (All [UnicodeSyntax, ExplicitForAll])) ) ] special_varops :: [(String,(Token, Maybe ExtScheme))] special_varops = [ -- the dot is only a special symbol together with forall, but can still be used as function composition ( ".", (Dot, Just (Any [ExplicitForAll, ExistentialQuantification])) ), ( "-", (Minus, Nothing) ), ( "!", (Exclamation, Nothing) ) ] reserved_ids :: [(String,(Token, Maybe ExtScheme))] reserved_ids = [ ( "_", (Underscore, Nothing) ), ( "by", (KW_By, Just (Any [TransformListComp])) ), ( "case", (KW_Case, Nothing) ), ( "class", (KW_Class, Nothing) ), ( "data", (KW_Data, Nothing) ), ( "default", (KW_Default, Nothing) ), ( "deriving", (KW_Deriving, Nothing) ), ( "do", (KW_Do, Nothing) ), ( "else", (KW_Else, Nothing) ), ( "family", (KW_Family, Just (Any [TypeFamilies])) ), -- indexed type families ( "forall", (KW_Forall, Just (Any [ExplicitForAll, ExistentialQuantification])) ), -- universal/existential quantification ( "group", (KW_Group, Just (Any [TransformListComp])) ), ( "if", (KW_If, Nothing) ), ( "import", (KW_Import, Nothing) ), ( "in", (KW_In, Nothing) ), ( "infix", (KW_Infix, Nothing) ), ( "infixl", (KW_InfixL, Nothing) ), ( "infixr", (KW_InfixR, Nothing) ), ( "instance", (KW_Instance, Nothing) ), ( "let", (KW_Let, Nothing) ), ( "mdo", (KW_MDo, Just (Any [RecursiveDo])) ), ( "module", (KW_Module, Nothing) ), ( "newtype", (KW_NewType, Nothing) ), ( "of", (KW_Of, Nothing) ), ( "proc", (KW_Proc, Just (Any [Arrows])) ), ( "rec", (KW_Rec, Just (Any [Arrows])) ), ( "then", (KW_Then, Nothing) ), ( "type", (KW_Type, Nothing) ), ( "using", (KW_Using, Just (Any [TransformListComp])) ), ( "where", (KW_Where, Nothing) ), -- FFI ( "foreign", (KW_Foreign, Just (Any [ForeignFunctionInterface])) ) ] special_varids :: [(String,(Token, Maybe ExtScheme))] special_varids = [ ( "as", (KW_As, Nothing) ), ( "qualified", (KW_Qualified, Nothing) ), ( "hiding", (KW_Hiding, Nothing) ), -- FFI ( "export", (KW_Export, Just (Any [ForeignFunctionInterface])) ), ( "safe", (KW_Safe, Just (Any [ForeignFunctionInterface])) ), ( "unsafe", (KW_Unsafe, Just (Any [ForeignFunctionInterface])) ), ( "threadsafe", (KW_Threadsafe, Just (Any [ForeignFunctionInterface])) ), ( "interruptible", (KW_Interruptible, Just (Any [InterruptibleFFI])) ), ( "stdcall", (KW_StdCall, Just (Any [ForeignFunctionInterface])) ), ( "ccall", (KW_CCall, Just (Any [ForeignFunctionInterface])) ), ( "cplusplus", (KW_CPlusPlus, Just (Any [ForeignFunctionInterface])) ), ( "dotnet", (KW_DotNet, Just (Any [ForeignFunctionInterface])) ), ( "jvm", (KW_Jvm, Just (Any [ForeignFunctionInterface])) ), ( "js", (KW_Js, Just (Any [ForeignFunctionInterface])) ), ( "capi", (KW_CApi, Just (Any [CApiFFI])) ) ] pragmas :: [(String,Token)] pragmas = [ ( "rules", RULES ), ( "inline", INLINE True ), ( "noinline", INLINE False ), ( "notinline", INLINE False ), ( "specialise", SPECIALISE ), ( "specialize", SPECIALISE ), ( "source", SOURCE ), ( "deprecated", DEPRECATED ), ( "warning", WARNING ), ( "ann", ANN ), ( "scc", SCC ), ( "generated", GENERATED ), ( "core", CORE ), ( "unpack", UNPACK ), ( "language", LANGUAGE ), ( "options", OPTIONS undefined ) -- we'll tweak it before use - promise! -- ( "cfiles", CFILES undefined ), -- same here... -- ( "include", INCLUDE undefined ) -- ...and here! ] isIdent, isHSymbol :: Char -> Bool isIdent c = isAlpha c || isDigit c || c == '\'' || c == '_' isHSymbol c = c `elem` ":!#%&*./?@\\-" || ((isSymbol c || isPunctuation c) && not (c `elem` "(),;[]`{}_\"'")) matchChar :: Char -> String -> Lex a () matchChar c msg = do s <- getInput if null s || head s /= c then fail msg else discard 1 -- The top-level lexer. -- We need to know whether we are at the beginning of the line to decide -- whether to insert layout tokens. lexer :: (Loc Token -> P a) -> P a lexer = runL topLexer topLexer :: Lex a (Loc Token) topLexer = do b <- pullCtxtFlag if b then -- trace (show cf ++ ": " ++ show VRightCurly) $ -- the lex context state flags that we must do an empty {} - UGLY setBOL >> getSrcLocL >>= \l -> return (Loc (mkSrcSpan l l) VRightCurly) else do bol <- checkBOL (bol, ws) <- lexWhiteSpace bol -- take care of whitespace in PCDATA ec <- getExtContext case ec of -- if there was no linebreak, and we are lexing PCDATA, -- then we want to care about the whitespace. -- We don't bother to test for XmlSyntax, since we -- couldn't end up in ChildCtxt otherwise. Just ChildCtxt | not bol && ws -> getSrcLocL >>= \l -> return $ Loc (mkSrcSpan l l) $ XPCDATA " " _ -> do startToken sl <- getSrcLocL t <- if bol then lexBOL -- >>= \t -> trace ("BOL: " ++ show t) (return t) else lexToken -- >>= \t -> trace (show t) (return t) el <- getSrcLocL return $ Loc (mkSrcSpan sl el) t lexWhiteSpace :: Bool -> Lex a (Bool, Bool) lexWhiteSpace bol = do s <- getInput ignL <- ignoreLinePragmasL case s of -- If we find a recognised pragma, we don't want to treat it as a comment. '{':'-':'#':rest | isRecognisedPragma rest -> return (bol, False) | isLinePragma rest && not ignL -> do (l, fn) <- lexLinePragma setSrcLineL l setLineFilenameL fn lexWhiteSpace True '{':'-':_ -> do loc <- getSrcLocL discard 2 (bol, c) <- lexNestedComment bol "" loc2 <- getSrcLocL pushComment $ Comment True (mkSrcSpan loc loc2) (reverse c) (bol, _) <- lexWhiteSpace bol return (bol, True) '-':'-':s | all (== '-') (takeWhile isHSymbol s) -> do loc <- getSrcLocL discard 2 dashes <- lexWhile (== '-') rest <- lexWhile (/= '\n') s' <- getInput loc2 <- getSrcLocL let com = Comment False (mkSrcSpan loc loc2) $ dashes ++ rest case s' of [] -> pushComment com >> return (False, True) _ -> do pushComment com lexNewline lexWhiteSpace True return (True, True) '\n':_ -> do lexNewline lexWhiteSpace True return (True, True) '\t':_ -> do lexTab (bol, _) <- lexWhiteSpace bol return (bol, True) c:_ | isSpace c -> do discard 1 (bol, _) <- lexWhiteSpace bol return (bol, True) _ -> return (bol, False) isRecognisedPragma, isLinePragma :: String -> Bool isRecognisedPragma str = let pragma = map toLower . takeWhile isAlphaNum . dropWhile isSpace $ str in case lookup pragma pragmas of Nothing -> False _ -> True isLinePragma str = let pragma = map toLower . takeWhile isAlphaNum . dropWhile isSpace $ str in case pragma of "line" -> True _ -> False lexLinePragma :: Lex a (Int, String) lexLinePragma = do discard 3 -- {-# lexWhile isSpace discard 4 -- LINE lexWhile isSpace i <- lexWhile isDigit when (null i) $ fail "Improperly formatted LINE pragma" lexWhile isSpace matchChar '"' "Improperly formatted LINE pragma" fn <- lexWhile (/= '"') matchChar '"' "Impossible - lexLinePragma" lexWhile isSpace mapM (flip matchChar "Improperly formatted LINE pragma") "#-}" lexNewline return (read i, fn) lexNestedComment :: Bool -> String -> Lex a (Bool, String) lexNestedComment bol str = do s <- getInput case s of '-':'}':_ -> discard 2 >> return (bol, str) '{':'-':_ -> do discard 2 (bol, c) <- lexNestedComment bol ("-{" ++ str) -- rest of the subcomment lexNestedComment bol ("}-" ++ c ) -- rest of this comment '\t':_ -> lexTab >> lexNestedComment bol ('\t':str) '\n':_ -> lexNewline >> lexNestedComment True ('\n':str) c:_ -> discard 1 >> lexNestedComment bol (c:str) [] -> fail "Unterminated nested comment" -- When we are lexing the first token of a line, check whether we need to -- insert virtual semicolons or close braces due to layout. lexBOL :: Lex a Token lexBOL = do pos <- getOffside -- trace ("Off: " ++ (show pos)) $ do case pos of LT -> do -- trace "layout: inserting '}'\n" $ -- Set col to 0, indicating that we're still at the -- beginning of the line, in case we need a semi-colon too. -- Also pop the context here, so that we don't insert -- another close brace before the parser can pop it. setBOL popContextL "lexBOL" return VRightCurly EQ -> -- trace "layout: inserting ';'\n" $ return SemiColon GT -> lexToken lexToken :: Lex a Token lexToken = do ec <- getExtContext -- we don't bother to check XmlSyntax since we couldn't -- have ended up in a non-Nothing context if it wasn't -- enabled. case ec of Just HarpCtxt -> lexHarpToken Just TagCtxt -> lexTagCtxt Just CloseTagCtxt -> lexCloseTagCtxt Just ChildCtxt -> lexChildCtxt Just CodeTagCtxt -> lexCodeTagCtxt _ -> lexStdToken lexChildCtxt :: Lex a Token lexChildCtxt = do -- if we ever end up here, then XmlSyntax must be on. s <- getInput case s of '<':'%':'>':_ -> do discard 3 pushExtContextL ChildCtxt return XChildTagOpen '<':'%':_ -> do discard 2 pushExtContextL CodeTagCtxt return XCodeTagOpen '<':'/':_ -> do discard 2 popExtContextL "lexChildCtxt" pushExtContextL CloseTagCtxt return XCloseTagOpen '<':'[':_ -> do discard 2 pushExtContextL HarpCtxt return XRPatOpen '<':_ -> do discard 1 pushExtContextL TagCtxt return XStdTagOpen _ -> lexPCDATA lexPCDATA :: Lex a Token lexPCDATA = do -- if we ever end up here, then XmlSyntax must be on. s <- getInput case s of [] -> return EOF _ -> case s of '\n':_ -> do x <- lexNewline >> lexPCDATA case x of XPCDATA p -> return $ XPCDATA $ '\n':p EOF -> return EOF _ -> fail $ "lexPCDATA: unexpected token: " ++ show x '<':_ -> return $ XPCDATA "" _ -> do let pcd = takeWhile (\c -> not $ elem c "<\n") s l = length pcd discard l x <- lexPCDATA case x of XPCDATA pcd' -> return $ XPCDATA $ pcd ++ pcd' EOF -> return EOF _ -> fail $ "lexPCDATA: unexpected token: " ++ show x lexCodeTagCtxt :: Lex a Token lexCodeTagCtxt = do -- if we ever end up here, then XmlSyntax must be on. s <- getInput case s of '%':'>':_ -> do discard 2 popExtContextL "lexCodeTagContext" return XCodeTagClose _ -> lexStdToken lexCloseTagCtxt :: Lex a Token lexCloseTagCtxt = do -- if we ever end up here, then XmlSyntax must be on. s <- getInput case s of '%':'>':_ -> do discard 2 popExtContextL "lexCloseTagCtxt" return XCodeTagClose '>':_ -> do discard 1 popExtContextL "lexCloseTagCtxt" return XStdTagClose _ -> lexStdToken lexTagCtxt :: Lex a Token lexTagCtxt = do -- if we ever end up here, then XmlSyntax must be on. s <- getInput case s of '/':'>':_ -> do discard 2 popExtContextL "lexTagCtxt: Empty tag" return XEmptyTagClose '>':_ -> do discard 1 popExtContextL "lexTagCtxt: Standard tag" pushExtContextL ChildCtxt return XStdTagClose _ -> lexStdToken lexHarpToken :: Lex a Token lexHarpToken = do -- if we ever end up here, then RegularPatterns must be on. s <- getInput case s of ']':'>':_ -> do discard 2 popExtContextL "lexHarpToken" return XRPatClose _ -> lexStdToken lexStdToken :: Lex a Token lexStdToken = do s <- getInput exts <- getExtensionsL let intHash = lexHash IntTok IntTokHash (Right WordTokHash) case s of [] -> return EOF '0':c:d:_ | toLower c == 'o' && isOctDigit d -> do discard 2 (n, str) <- lexOctal con <- intHash return (con (n, '0':c:str)) | toLower c == 'x' && isHexDigit d -> do discard 2 (n, str) <- lexHexadecimal con <- intHash return (con (n, '0':c:str)) -- implicit parameters '?':c:_ | isLower c && ImplicitParams `elem` exts -> do discard 1 id <- lexWhile isIdent return $ IDupVarId id '%':c:_ | isLower c && ImplicitParams `elem` exts -> do discard 1 id <- lexWhile isIdent return $ ILinVarId id -- end implicit parameters -- harp -- '(':'|':c:_ | isHSymbol c -> discard 1 >> return LeftParen '(':'|':c:_ | RegularPatterns `elem` exts && not (isHSymbol c) -> do discard 2 return RPGuardOpen '|':')':_ | RegularPatterns `elem` exts -> do discard 2 return RPGuardClose {- This is handled by the reserved_ops above. '@':':':_ | RegularPatterns `elem` exts -> do discard 2 return RPCAt -} -- template haskell '[':'|':_ | TemplateHaskell `elem` exts -> do discard 2 return $ THExpQuote '[':c:'|':_ | c == 'e' && TemplateHaskell `elem` exts -> do discard 3 return $ THExpQuote | c == 'p' && TemplateHaskell `elem` exts -> do discard 3 return THPatQuote | c == 'd' && TemplateHaskell `elem` exts -> do discard 3 return THDecQuote | c == 't' && TemplateHaskell `elem` exts -> do discard 3 return THTypQuote '[':'$':c:_ | isLower c && QuasiQuotes `elem` exts -> discard 2 >> lexQuasiQuote '[':c:s | isLower c && QuasiQuotes `elem` exts && case dropWhile isIdent s of { '|':_ -> True;_->False} -> discard 1 >> lexQuasiQuote '|':']':_ | TemplateHaskell `elem` exts -> do discard 2 return THCloseQuote '$':c:_ | isLower c && TemplateHaskell `elem` exts -> do discard 1 id <- lexWhile isIdent return $ THIdEscape id | c == '(' && TemplateHaskell `elem` exts -> do discard 2 return THParenEscape -- end template haskell -- hsx '<':'%':c:_ | XmlSyntax `elem` exts -> do case c of '>' -> do discard 3 pushExtContextL ChildCtxt return XChildTagOpen _ -> do discard 2 pushExtContextL CodeTagCtxt return XCodeTagOpen '<':c:_ | isAlpha c && XmlSyntax `elem` exts -> do discard 1 pushExtContextL TagCtxt return XStdTagOpen -- end hsx '(':'#':c:_ | UnboxedTuples `elem` exts && not (isHSymbol c) -> do discard 2 >> return LeftHashParen '#':')':_ | UnboxedTuples `elem` exts -> do discard 2 >> return RightHashParen -- pragmas '{':'-':'#':_ -> do discard 3 >> lexPragmaStart '#':'-':'}':_ -> do discard 3 >> return PragmaEnd c:_ | isDigit c -> lexDecimalOrFloat | isUpper c -> lexConIdOrQual "" | isLower c || c == '_' -> do idents <- lexIdents case idents of [ident] -> case lookup ident (reserved_ids ++ special_varids) of Just (keyword, scheme) -> do -- check if an extension keyword is enabled if isEnabled scheme exts then flagKW keyword >> return keyword else return $ VarId ident Nothing -> return $ VarId ident _ -> return $ DVarId idents | isHSymbol c -> do sym <- lexWhile isHSymbol return $ case lookup sym (reserved_ops ++ special_varops) of Just (t , scheme) -> -- check if an extension op is enabled if isEnabled scheme exts then t else case c of ':' -> ConSym sym _ -> VarSym sym Nothing -> case c of ':' -> ConSym sym _ -> VarSym sym | otherwise -> do discard 1 case c of -- First the special symbols '(' -> return LeftParen ')' -> return RightParen ',' -> return Comma ';' -> return SemiColon '[' -> return LeftSquare ']' -> return RightSquare '`' -> return BackQuote '{' -> do pushContextL NoLayout return LeftCurly '}' -> do popContextL "lexStdToken" return RightCurly '\'' -> lexCharacter '"' -> lexString _ -> fail ("Illegal character \'" ++ show c ++ "\'\n") where lexIdents :: Lex a [String] lexIdents = do ident <- lexWhile isIdent s <- getInput exts <- getExtensionsL case s of -- This is the only way we can get more than one ident in the list -- and it requires XmlSyntax to be on. '-':c:_ | XmlSyntax `elem` exts && isAlpha c -> do discard 1 idents <- lexIdents return $ ident : idents '#':_ | MagicHash `elem` exts -> do discard 1 return [ident ++ "#"] _ -> return [ident] lexQuasiQuote :: Lex a Token lexQuasiQuote = do -- We've seen and dropped [$ already ident <- lexWhile isIdent matchChar '|' "Malformed quasi-quote quoter" body <- lexQQBody return $ THQuasiQuote (ident, body) lexQQBody :: Lex a String lexQQBody = do s <- getInput case s of '\\':']':_ -> do discard 2 str <- lexQQBody return (']':str) '\\':'|':_ -> do discard 2 str <- lexQQBody return ('|':str) '|':']':_ -> discard 2 >> return "" '|':_ -> do discard 1 str <- lexQQBody return ('|':str) ']':_ -> do discard 1 str <- lexQQBody return (']':str) '\\':_ -> do discard 1 str <- lexQQBody return ('\\':str) '\n':_ -> do lexNewline str <- lexQQBody return ('\n':str) [] -> fail "Unexpected end of input while lexing quasi-quoter" _ -> do str <- lexWhile (not . (`elem` "\\|\n")) rest <- lexQQBody return (str++rest) lexPragmaStart :: Lex a Token lexPragmaStart = do lexWhile isSpace pr <- lexWhile isAlphaNum case lookup (map toLower pr) pragmas of Just (INLINE True) -> do s <- getInput case map toLower s of '_':'c':'o':'n':'l':'i':'k':'e':_ -> do discard 8 return $ INLINE_CONLIKE _ -> return $ INLINE True Just SPECIALISE -> do s <- getInput case dropWhile isSpace $ map toLower s of 'i':'n':'l':'i':'n':'e':_ -> do lexWhile isSpace discard 6 return $ SPECIALISE_INLINE True 'n':'o':'i':'n':'l':'i':'n':'e':_ -> do lexWhile isSpace discard 8 return $ SPECIALISE_INLINE False 'n':'o':'t':'i':'n':'l':'i':'n':'e':_ -> do lexWhile isSpace discard 9 return $ SPECIALISE_INLINE False _ -> return SPECIALISE Just (OPTIONS _) -> do -- see, I promised we'd mask out the 'undefined' s <- getInput case s of '_':_ -> do discard 1 com <- lexWhile isIdent rest <- lexRawPragma return $ OPTIONS (Just com, rest) x:_ | isSpace x -> do rest <- lexRawPragma return $ OPTIONS (Nothing, rest) _ -> fail "Malformed Options pragma" {- Just (CFILES _) -> do rest <- lexRawPragma return $ CFILES rest Just (INCLUDE _) -> do rest <- lexRawPragma return $ INCLUDE rest -} Just p -> return p _ -> fail "Internal error: Unrecognised recognised pragma" -- do rawStr <- lexRawPragma -- return $ PragmaUnknown (pr, rawStr) -- no support for unrecognized pragmas, treat as comment -- discard 3 -- #-} -- topLexer -- we just discard it as a comment for now and restart -} lexRawPragma :: Lex a String lexRawPragma = do rpr <- lexRawPragmaAux return $ dropWhile isSpace rpr where lexRawPragmaAux = do rpr <- lexWhile (/='#') s <- getInput case s of '#':'-':'}':_ -> return rpr "" -> fail "End-of-file inside pragma" _ -> do discard 1 rpr' <- lexRawPragma return $ rpr ++ '#':rpr' lexDecimalOrFloat :: Lex a Token lexDecimalOrFloat = do ds <- lexWhile isDigit rest <- getInput exts <- getExtensionsL case rest of ('.':d:_) | isDigit d -> do discard 1 frac <- lexWhile isDigit let num = parseInteger 10 (ds ++ frac) decimals = toInteger (length frac) (exponent, estr) <- do rest2 <- getInput case rest2 of 'e':_ -> lexExponent 'E':_ -> lexExponent _ -> return (0,"") con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash) return $ con ((num%1) * 10^^(exponent - decimals), ds ++ '.':frac ++ estr) e:_ | toLower e == 'e' -> do (exponent, estr) <- lexExponent con <- lexHash FloatTok FloatTokHash (Right DoubleTokHash) return $ con ((parseInteger 10 ds%1) * 10^^exponent, ds ++ estr) '#':'#':_ | MagicHash `elem` exts -> discard 2 >> return (WordTokHash (parseInteger 10 ds, ds)) '#':_ | MagicHash `elem` exts -> discard 1 >> return (IntTokHash (parseInteger 10 ds, ds)) _ -> return (IntTok (parseInteger 10 ds, ds)) where lexExponent :: Lex a (Integer, String) lexExponent = do (e:r) <- getInput discard 1 -- 'e' or 'E' case r of '+':d:_ | isDigit d -> do discard 1 (n, str) <- lexDecimal return (n, e:'+':str) '-':d:_ | isDigit d -> do discard 1 (n, str) <- lexDecimal return (negate n, e:'-':str) d:_ | isDigit d -> lexDecimal >>= \(n,str) -> return (n, e:str) _ -> fail "Float with missing exponent" lexHash :: (b -> Token) -> (b -> Token) -> Either String (b -> Token) -> Lex a (b -> Token) lexHash a b c = do exts <- getExtensionsL if MagicHash `elem` exts then do r <- getInput case r of '#':'#':_ -> case c of Right c -> discard 2 >> return c Left s -> fail s '#':_ -> discard 1 >> return b _ -> return a else return a lexConIdOrQual :: String -> Lex a Token lexConIdOrQual qual = do con <- lexWhile isIdent let conid | null qual = ConId con | otherwise = QConId (qual,con) qual' | null qual = con | otherwise = qual ++ '.':con just_a_conid <- alternative (return conid) rest <- getInput exts <- getExtensionsL case rest of '.':c:_ | isLower c || c == '_' -> do -- qualified varid? discard 1 ident <- lexWhile isIdent s <- getInput exts <- getExtensionsL ident' <- case s of '#':_ | MagicHash `elem` exts -> discard 1 >> return (ident ++ "#") _ -> return ident case lookup ident' reserved_ids of -- cannot qualify a reserved word Just (_,scheme) | isEnabled scheme exts -> just_a_conid _ -> return (QVarId (qual', ident')) | isUpper c -> do -- qualified conid? discard 1 lexConIdOrQual qual' | isHSymbol c -> do -- qualified symbol? discard 1 sym <- lexWhile isHSymbol exts <- getExtensionsL case lookup sym reserved_ops of -- cannot qualify a reserved operator Just (_,scheme) | isEnabled scheme exts -> just_a_conid _ -> return $ case c of ':' -> QConSym (qual', sym) _ -> QVarSym (qual', sym) '#':cs | null cs || not (isHSymbol $ head cs) && not (isIdent $ head cs) && MagicHash `elem` exts -> do discard 1 case conid of ConId con -> return $ ConId $ con ++ "#" QConId (q,con) -> return $ QConId (q,con ++ "#") _ -> fail $ "lexConIdOrQual: unexpected token: " ++ show conid _ -> return conid -- not a qualified thing lexCharacter :: Lex a Token lexCharacter = do -- We need to keep track of not only character constants but also TH 'x and ''T -- We've seen ' so far s <- getInput exts <- getExtensionsL case s of '\'':_ | TemplateHaskell `elem` exts -> discard 1 >> return THTyQuote '\\':_ -> do (c,raw) <- lexEscape matchQuote con <- lexHash Character CharacterHash (Left "Double hash not available for character literals") return (con (c, '\\':raw)) c:'\'':_ -> do discard 2 con <- lexHash Character CharacterHash (Left "Double hash not available for character literals") return (con (c, [c])) _ | any (`elem` exts) [TemplateHaskell, DataKinds] -> return THVarQuote _ -> fail "Improper character constant or misplaced \'" where matchQuote = matchChar '\'' "Improperly terminated character constant" lexString :: Lex a Token lexString = loop ("","") where loop (s,raw) = do r <- getInput exts <- getExtensionsL case r of '\\':'&':_ -> do discard 2 loop (s, '&':'\\':raw) '\\':c:_ | isSpace c -> do discard 1 wcs <- lexWhiteChars matchChar '\\' "Illegal character in string gap" loop (s, '\\':reverse wcs ++ '\\':raw) | otherwise -> do (ce, str) <- lexEscape loop (ce:s, reverse str ++ '\\':raw) '"':'#':_ | MagicHash `elem` exts -> do discard 2 return (StringHash (reverse s, reverse raw)) '"':_ -> do discard 1 return (StringTok (reverse s, reverse raw)) c:_ | c /= '\n' -> do discard 1 loop (c:s, c:raw) _ -> fail "Improperly terminated string" lexWhiteChars :: Lex a String lexWhiteChars = do s <- getInput case s of '\n':_ -> do lexNewline wcs <- lexWhiteChars return $ '\n':wcs '\t':_ -> do lexTab wcs <- lexWhiteChars return $ '\t':wcs c:_ | isSpace c -> do discard 1 wcs <- lexWhiteChars return $ c:wcs _ -> return "" lexEscape :: Lex a (Char, String) lexEscape = do discard 1 r <- getInput case r of -- Production charesc from section B.2 (Note: \& is handled by caller) 'a':_ -> discard 1 >> return ('\a', "a") 'b':_ -> discard 1 >> return ('\b', "b") 'f':_ -> discard 1 >> return ('\f', "f") 'n':_ -> discard 1 >> return ('\n', "n") 'r':_ -> discard 1 >> return ('\r', "r") 't':_ -> discard 1 >> return ('\t', "t") 'v':_ -> discard 1 >> return ('\v', "v") '\\':_ -> discard 1 >> return ('\\', "\\") '"':_ -> discard 1 >> return ('\"', "\"") '\'':_ -> discard 1 >> return ('\'', "\'") -- Production ascii from section B.2 '^':c:_ -> discard 2 >> cntrl c 'N':'U':'L':_ -> discard 3 >> return ('\NUL', "NUL") 'S':'O':'H':_ -> discard 3 >> return ('\SOH', "SOH") 'S':'T':'X':_ -> discard 3 >> return ('\STX', "STX") 'E':'T':'X':_ -> discard 3 >> return ('\ETX', "ETX") 'E':'O':'T':_ -> discard 3 >> return ('\EOT', "EOT") 'E':'N':'Q':_ -> discard 3 >> return ('\ENQ', "ENQ") 'A':'C':'K':_ -> discard 3 >> return ('\ACK', "ACK") 'B':'E':'L':_ -> discard 3 >> return ('\BEL', "BEL") 'B':'S':_ -> discard 2 >> return ('\BS', "BS") 'H':'T':_ -> discard 2 >> return ('\HT', "HT") 'L':'F':_ -> discard 2 >> return ('\LF', "LF") 'V':'T':_ -> discard 2 >> return ('\VT', "VT") 'F':'F':_ -> discard 2 >> return ('\FF', "FF") 'C':'R':_ -> discard 2 >> return ('\CR', "CR") 'S':'O':_ -> discard 2 >> return ('\SO', "SO") 'S':'I':_ -> discard 2 >> return ('\SI', "SI") 'D':'L':'E':_ -> discard 3 >> return ('\DLE', "DLE") 'D':'C':'1':_ -> discard 3 >> return ('\DC1', "DC1") 'D':'C':'2':_ -> discard 3 >> return ('\DC2', "DC2") 'D':'C':'3':_ -> discard 3 >> return ('\DC3', "DC3") 'D':'C':'4':_ -> discard 3 >> return ('\DC4', "DC4") 'N':'A':'K':_ -> discard 3 >> return ('\NAK', "NAK") 'S':'Y':'N':_ -> discard 3 >> return ('\SYN', "SYN") 'E':'T':'B':_ -> discard 3 >> return ('\ETB', "ETB") 'C':'A':'N':_ -> discard 3 >> return ('\CAN', "CAN") 'E':'M':_ -> discard 2 >> return ('\EM', "EM") 'S':'U':'B':_ -> discard 3 >> return ('\SUB', "SUB") 'E':'S':'C':_ -> discard 3 >> return ('\ESC', "ESC") 'F':'S':_ -> discard 2 >> return ('\FS', "FS") 'G':'S':_ -> discard 2 >> return ('\GS', "GS") 'R':'S':_ -> discard 2 >> return ('\RS', "RS") 'U':'S':_ -> discard 2 >> return ('\US', "US") 'S':'P':_ -> discard 2 >> return ('\SP', "SP") 'D':'E':'L':_ -> discard 3 >> return ('\DEL', "DEL") -- Escaped numbers 'o':c:_ | isOctDigit c -> do discard 1 (n, raw) <- lexOctal n <- checkChar n return (n, 'o':raw) 'x':c:_ | isHexDigit c -> do discard 1 (n, raw) <- lexHexadecimal n <- checkChar n return (n, 'x':raw) c:_ | isDigit c -> do (n, raw) <- lexDecimal n <- checkChar n return (n, raw) _ -> fail "Illegal escape sequence" where checkChar n | n <= 0x10FFFF = return (chr (fromInteger n)) checkChar _ = fail "Character constant out of range" -- Production cntrl from section B.2 cntrl :: Char -> Lex a (Char, String) cntrl c | c >= '@' && c <= '_' = return (chr (ord c - ord '@'), '^':c:[]) cntrl _ = fail "Illegal control character" -- assumes at least one octal digit lexOctal :: Lex a (Integer, String) lexOctal = do ds <- lexWhile isOctDigit return (parseInteger 8 ds, ds) -- assumes at least one hexadecimal digit lexHexadecimal :: Lex a (Integer, String) lexHexadecimal = do ds <- lexWhile isHexDigit return (parseInteger 16 ds, ds) -- assumes at least one decimal digit lexDecimal :: Lex a (Integer, String) lexDecimal = do ds <- lexWhile isDigit return (parseInteger 10 ds, ds) -- Stolen from Hugs's Prelude parseInteger :: Integer -> String -> Integer parseInteger radix ds = foldl1 (\n d -> n * radix + d) (map (toInteger . digitToInt) ds) flagKW :: Token -> Lex a () flagKW t = do when (t `elem` [KW_Do, KW_MDo]) $ do exts <- getExtensionsL when (NondecreasingIndentation `elem` exts) $ flagDo ------------------------------------------------------------------ -- "Pretty" printing for tokens showToken :: Token -> String showToken t = case t of VarId s -> s QVarId (q,s) -> q ++ '.':s IDupVarId s -> '?':s ILinVarId s -> '%':s ConId s -> s QConId (q,s) -> q ++ '.':s DVarId ss -> concat $ intersperse "-" ss VarSym s -> s ConSym s -> s QVarSym (q,s) -> q ++ '.':s QConSym (q,s) -> q ++ '.':s IntTok (_, s) -> s FloatTok (_, s) -> s Character (_, s) -> '\'':s ++ "'" StringTok (_, s) -> '"':s ++ "\"" IntTokHash (_, s) -> s ++ "#" WordTokHash (_, s) -> s ++ "##" FloatTokHash (_, s) -> s ++ "#" DoubleTokHash (_, s) -> s ++ "##" CharacterHash (_, s) -> '\'':s ++ "'#" StringHash (_, s) -> '"':s ++ "\"#" LeftParen -> "(" RightParen -> ")" LeftHashParen -> "(#" RightHashParen -> "#)" SemiColon -> ";" LeftCurly -> "{" RightCurly -> "}" VRightCurly -> "virtual }" LeftSquare -> "[" RightSquare -> "]" Comma -> "," Underscore -> "_" BackQuote -> "`" Dot -> "." DotDot -> ".." Colon -> ":" DoubleColon -> "::" Equals -> "=" Backslash -> "\\" Bar -> "|" LeftArrow -> "<-" RightArrow -> "->" At -> "@" Tilde -> "~" DoubleArrow -> "=>" Minus -> "-" Exclamation -> "!" Star -> "*" LeftArrowTail -> ">-" RightArrowTail -> "-<" LeftDblArrowTail -> ">>-" RightDblArrowTail -> "-<<" THExpQuote -> "[|" THPatQuote -> "[p|" THDecQuote -> "[d|" THTypQuote -> "[t|" THCloseQuote -> "|]" THIdEscape s -> '$':s THParenEscape -> "$(" THVarQuote -> "'" THTyQuote -> "''" THQuasiQuote (n,q) -> "[$" ++ n ++ "|" ++ q ++ "]" RPGuardOpen -> "(|" RPGuardClose -> "|)" RPCAt -> "@:" XCodeTagOpen -> "<%" XCodeTagClose -> "%>" XStdTagOpen -> "<" XStdTagClose -> ">" XCloseTagOpen -> " "/>" XPCDATA s -> "PCDATA " ++ s XRPatOpen -> "<[" XRPatClose -> "]>" PragmaEnd -> "#-}" RULES -> "{-# RULES" INLINE b -> "{-# " ++ if b then "INLINE" else "NOINLINE" INLINE_CONLIKE -> "{-# " ++ "INLINE_CONLIKE" SPECIALISE -> "{-# SPECIALISE" SPECIALISE_INLINE b -> "{-# SPECIALISE " ++ if b then "INLINE" else "NOINLINE" SOURCE -> "{-# SOURCE" DEPRECATED -> "{-# DEPRECATED" WARNING -> "{-# WARNING" SCC -> "{-# SCC" GENERATED -> "{-# GENERATED" CORE -> "{-# CORE" UNPACK -> "{-# UNPACK" OPTIONS (mt,s) -> "{-# OPTIONS" ++ maybe "" (':':) mt ++ " ..." -- CFILES s -> "{-# CFILES ..." -- INCLUDE s -> "{-# INCLUDE ..." LANGUAGE -> "{-# LANGUAGE" ANN -> "{-# ANN" KW_As -> "as" KW_By -> "by" KW_Case -> "case" KW_Class -> "class" KW_Data -> "data" KW_Default -> "default" KW_Deriving -> "deriving" KW_Do -> "do" KW_MDo -> "mdo" KW_Else -> "else" KW_Family -> "family" KW_Forall -> "forall" KW_Group -> "group" KW_Hiding -> "hiding" KW_If -> "if" KW_Import -> "import" KW_In -> "in" KW_Infix -> "infix" KW_InfixL -> "infixl" KW_InfixR -> "infixr" KW_Instance -> "instance" KW_Let -> "let" KW_Module -> "module" KW_NewType -> "newtype" KW_Of -> "of" KW_Proc -> "proc" KW_Rec -> "rec" KW_Then -> "then" KW_Type -> "type" KW_Using -> "using" KW_Where -> "where" KW_Qualified -> "qualified" KW_Foreign -> "foreign" KW_Export -> "export" KW_Safe -> "safe" KW_Unsafe -> "unsafe" KW_Threadsafe -> "threadsafe" KW_Interruptible -> "interruptible" KW_StdCall -> "stdcall" KW_CCall -> "ccall" XChildTagOpen -> "<%>" KW_CPlusPlus -> "cplusplus" KW_DotNet -> "dotnet" KW_Jvm -> "jvm" KW_Js -> "js" KW_CApi -> "capi" EOF -> "EOF"