{ ---------------------------------------------------------------------------- -- | -- Module : FastTags.Lexer -- Copyright : (c) Sergey Vinokurov 2019 ---------------------------------------------------------------------------- {-# LANGUAGE BangPatterns #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} {-# OPTIONS_GHC -fno-warn-tabs #-} module FastTags.Lexer (tokenize) where import Control.Applicative as A import Control.Monad import Control.Monad.Writer.Strict import Control.Monad.State.Strict import qualified Data.ByteString as BS import Data.Text (Text) import qualified Data.Text as T import Data.Void (Void, absurd) import Data.Word import Foreign.Ptr (plusPtr) import FastTags.LensBlaze import FastTags.LexerM import FastTags.LexerTypes import FastTags.Token } $ascspace = [\ \t\r] $unispace = \x01 $space = [$ascspace $unispace] $nl = [\n] $ws = [$space\f\v] # $nl $dot = [\.] $asclarge = [A-Z] $unilarge = \x02 $large = [$asclarge $unilarge] $ascsmall = [a-z] $unismall = \x03 $small = [$ascsmall $unismall] -- These symbols can be part of operators but are reserved when occur by -- themselves. $symbols_reserved_as_standalone = [ \→ \∷ \⇒ \∀ ] $special_sym = [\(\)\,\;\[\]\`\{\}] $ascsymbol = [\!\#\$\%\&\*\+\.\/\<\=\>\?\@\\\^\|\-\~\:] $unisymbol = \x04 $symbol = [$ascsymbol $unisymbol $symbols_reserved_as_standalone] # [$special_sym \_\'\"] $ascident = [$ascsmall $asclarge] $uniident = [$unismall $unilarge] $ascdigit = [0-9] $unidigit = \x05 $digit = [$ascdigit $unidigit] $unisuffix = \x06 $ident_nonsym = [$ascident $uniident $unisuffix $digit] # [$symbol] $ident_syms = [\'\_\#] $ident = [$ident_nonsym $ident_syms] -- Stands for "→", "∷", "⇒", "⦇", "⦈", "∀", "⟦", "⟧" $reserved_symbol = \x07 -- Random characters for which there's no category. $other = \x08 -- Some rules assume that there's an optional \r before \n so don't change this definition. @nl = ( [\r]? $nl ) @qualificationPrefix = ( $large $ident* $dot )* @arrow = "->" @doublecolon = "::" @implies = "=>" @lbanana = "(|" @rbanana = "|)" $charesc = [a b f n r t v \\ \" \' \&] $octdigit = [0-7] $hexdigit = [0-9a-fA-F] @charescape = [\\] ( $charesc | $asclarge+ | "o" $octdigit+ | "x" $hexdigit+ ) @float_number = ( [\+\-]? ( $digit+ ( "." $digit+ )? | $digit* "." $digit+ ) ( [eE] [\+\-]? $digit* )? ) @number = ( [\+\-]? $digit+ | 0 ( [oO] $octdigit+ | [xX] $hexdigit ) | @float_number ) @cpp_ws = ( $ascspace | [\\] @nl ) @cpp_opt_ws = @cpp_ws* @cpp_nonempty_ws = ( $ascspace @cpp_ws* | @cpp_ws* $ascspace ) @cpp_dir_start = "#"+ @cpp_opt_ws @define_name = [$ascident $ascdigit _ ']+ @define_body = ( [^ \\ $nl]+ | [\\] ( @nl | . ) )+ @nl -- Except "define" @cppdirective = ( "if" | "ifdef" | "ifndef" | "endif" | "elif" | "else" | "undef" | "line" | "error" | "warning" | "include" ) -- Except cpp directives, "let", and "enum". @hscdirective = ( "def" | "const" | "const_str" | "type" | "peek" | "poke" | "ptr" | "offset" | "size" | "alignment" ) -- Except "enum" @all_cpp_and_hsc_directives = ( "define" | "let" | @cppdirective | @hscdirective ) :- -- Literate Haskell support. 'literate' code handles all text except actual -- Haskell program text. It aims to strip all non-Haskell text. { @nl ">" $ws* { \input len -> (Newline $! countInputSpace input len) <$ startLiterateBird } @nl "\begin{code}" @nl $space* { \input len -> (Newline $! countInputSpace input len) <$ startLiterateLatex } @nl ; . { \_ _ -> dropUntilNL' } } -- Drop shebang <0, literate> $nl "#!" .* ; -- Can skip whitespace everywhere since it does not affect meaning in any -- state. <0, comment, qq, literate> $ws+ ; <0> { -- Analyse "#if 0" constructs used in e.g. GHC.Base ^ @cpp_dir_start "if" @cpp_opt_ws "0" ( @cpp_nonempty_ws .* )? { \_ _ -> startPreprocessorStripping } -- Named defines, implicitly drops: '( @cpp_ws+ | "(" ) @define_body' ^ @cpp_dir_start ("define" | "let") @cpp_ws+ @define_name { \input len -> do modify $ \s -> s { asInput = dropUntilUnescapedNL $ asInput s } pure $! CppDefine $! extractDefineOrLetName input len } ^ @cpp_dir_start @cppdirective .* ( [\\] @nl .* )* ; ^ @cpp_dir_start ("{" (@cpp_ws | @nl)*)? "enum" { \_ _ -> pure HSCEnum } ^ @cpp_dir_start @all_cpp_and_hsc_directives { \_ _ -> pure HSCDirective } ^ @cpp_dir_start "{" (@cpp_ws | @nl)* @all_cpp_and_hsc_directives { \_ _ -> pure HSCDirectiveBraced } -- Drop everything else that starts with #, e.g. -- '# 17 "/usr/include/stdc-predef.h" 3 4' -- #{get_area "Queue.T"} ^ "#" $ascspace* "{"? @define_name+ { \_ _ -> dropUntilNL' } } { @cpp_dir_start @cpp_opt_ws ( "ifdef" | "if" ) .* { \_ _ -> startPreprocessorStripping } @cpp_dir_start @cpp_opt_ws ("else" | "elif" | "endif") .* { \_ _ -> endPreprocessorStripping } @cpp_dir_start @cpp_opt_ws ( "define" | "undef" | "line" | "error" | "warning" | "include" | "let" ) { \_ _ -> dropUntilNL' } $nl ; . { \_ _ -> dropUntilNLOr' 35 -- '#' } } -- Newlines and comments. <0> { @nl ">" $space* / { isLiterateEnabled' } { \input len -> pure $! Newline $! countInputSpace input len } @nl / { shouldEndLiterateBird } { \_ _ -> Newline 0 <$ endLiterate } @nl "\end{code}" / { shouldEndLiterateLatex } { \_ _ -> endLiterate' } [\\]? @nl $space* "{-" { \input len -> startIndentationCounting (countInputSpace input len) } [\\]? @nl $space* { \input len -> pure $! Newline $! len - countBackslashCR input - 1 } [\-][\-]+ ~[$symbol $nl] { \_ _ -> dropUntilNL' } [\-][\-]+ / @nl ; } -- Nested comments <0, comment> "{-" { \_ _ -> startComment } "-}" { \_ _ -> endComment startCode } -- 45 - '-' -- 123 - '{' @nl ; ($other | .) { \_ _ -> dropUntilNLOrEither' 45 123 } <0> "-}" { \_ _ -> errorAtLine "Unmatched -}" } "{-" { \_ _ -> startIndentComment } "-}" { \_ _ -> endComment indentCountCode } (. | @nl) ; { $space* "{-" { \input len -> addIndentationSize (fromIntegral (countInputSpace input len)) *> startIndentComment } $space* { \_ len -> endIndentationCounting len } } <0> { -- Strings [\"] ( [^\" \\ \r \n] | [\\] . | [\\] @nl ( $ws* [\\] )? )* [\"] { \_ _ -> pure String } [\"] { \_ _ -> pure DQuote } -- Characters. [\'] ( [^ \' \\ \n \r] | @charescape ) [\'] { kw Character } } -- Template Haskell quasiquoters <0> { "[|" { \_ _ -> startUnconditionalQuasiQuoter } "[" [\$\(]* @qualificationPrefix $ident+ [\)]* "|" { \input _ -> startQuasiquoter input } "$(" { \_ _ -> startSplice CtxHaskell } ^ "$" [\']* @qualificationPrefix $ident+ { \_ _ -> pure ToplevelSplice } } { "$(" { \_ _ -> startSplice CtxQuasiquoter } "|]" { \_ _ -> endQuasiquoter } $reserved_symbol { \input _len -> reservedSymbolQQ (unsafeTextHead (aiPtr input)) } (. | @nl) ; } -- Vanilla tokens <0> { "case" { kw KWCase } "class" { kw KWClass } "data" { kw KWData } "default" { kw KWDefault } "deriving" { kw KWDeriving } "do" { kw KWDo } "else" { kw KWElse } "family" { kw KWFamily } "forall" { \_ _ -> pure forallTokenVal } "foreign" { kw KWForeign } "if" { kw KWIf } "import" { kw KWImport } "in" { kw KWIn } "infix" { kw KWInfix } "infixl" { kw KWInfixl } "infixr" { kw KWInfixr } "instance" { kw KWInstance } "let" { kw KWLet } "module" { kw KWModule } "newtype" { kw KWNewtype } "of" { kw KWOf } "pattern" { \_ _ -> pure patternTokenVal } "then" { kw KWThen } "type" { kw KWType } "where" { kw KWWhere } @arrow { kw Arrow } "@" { kw At } "`" { kw Backtick } "," { kw Comma } "." { kw Dot } @doublecolon { kw DoubleColon } "=" { kw Equals } "!" { kw ExclamationMark } @implies { kw Implies } "{" { kw LBrace } "[" { kw LBracket } "(" { pushLParen } "|" { kw Pipe } "}" { kw RBrace } "]" { kw RBracket } ")" { popRParen } "~" { kw Tilde } ";" { kw Semicolon } [\\] { kw LambdaBackslash } -- Not interested in numbers, but it takes time to extract their text so -- it's quicker to just ignore them. @number { kw Number } [\']* @qualificationPrefix ($ident | $large)+ { \input len -> pure $! T $! takeText input len } @qualificationPrefix $symbol+ { \input len -> pure $! T $! takeText input len } $reserved_symbol { \input _len -> reservedSymbol (unsafeTextHead (aiPtr input)) } @lbanana / ~[$symbol] { \_ _ -> pure LBanana } @rbanana { \_ _ -> pure RBanana } } { type AlexAction = AlexInput -> Int -> AlexM TokenVal type AlexPred a = a -> AlexInput -> Int -> AlexInput -> Bool {-# INLINE kw #-} kw :: TokenVal -> AlexAction kw tok = \_ _ -> pure tok isLiterateEnabled' :: AlexPred (LitMode a) isLiterateEnabled' litLoc _inputBefore _len _inputAfter = isLiterateEnabled litLoc shouldEndLiterateBird :: AlexPred (LitMode LitStyle) shouldEndLiterateBird litLoc inputBefore _len _inputAfter = case unsafeTextHeadAscii $ (`plusPtr` 1) $ aiPtr inputBefore of -- 62 = '>' 62 -> False _ -> isLiterateBirdInside litLoc shouldEndLiterateLatex :: AlexPred (LitMode LitStyle) shouldEndLiterateLatex litLoc _inputBefore _len _inputAfter = isLiterateLatexInside litLoc tokenize :: FilePath -> LitMode Void -> Bool -> BS.ByteString -> Either Text [Token] tokenize filename litLoc trackPrefixesOffsets input = case runAlexM filename trackPrefixesOffsets litLoc code input scanTokens of (Nothing, xs) -> Right xs (Just err, _) -> Left err where code = case litLoc of LitVanilla -> startCode LitOutside -> literateCode LitInside x -> absurd x scanTokens :: AlexM (Maybe Text) scanTokens = go where go = do !nextTok <- continueScanning case nextTok of EOF -> pure Nothing Error err -> pure $ Just err _ -> do -- Use input after reading token to get proper prefix that includes -- token we currently read. AlexState{asInput} <- get tell [(asInput, nextTok)] go continueScanning :: AlexM TokenVal continueScanning = do s@AlexState{asInput} <- get go (view asCodeL s) (view asLiterateLocL s) asInput where go :: AlexCode -> LitMode LitStyle -> AlexInput -> AlexM TokenVal go !code !litLoc = go' where go' input = do case alexScanUser litLoc input (unAlexCode code) :: AlexReturn AlexAction of AlexEOF -> pure EOF AlexError input -> do code' <- gets (view asCodeL) pure $ Error $ "Lexical error while in state " <> T.pack (show code') <> " at line " <> T.pack (show (unLine (view aiLineL input))) <> ": '" <> takeText input 40 <> "'" AlexSkip input' _ -> go' input' AlexToken input' tokLen action -> alexSetInput input' *> action input tokLen dropUntilNL_ :: AlexM () dropUntilNL_ = modify $ \s -> s { asInput = dropUntilNL $ asInput s } dropUntilNL' :: AlexM TokenVal dropUntilNL' = dropUntilNL_ *> continueScanning dropUntilNLOr' :: Word8 -> AlexM TokenVal dropUntilNLOr' w = do modify $ \s -> s { asInput = dropUntilNLOr w $ asInput s } continueScanning dropUntilNLOrEither' :: Word8 -> Word8 -> AlexM TokenVal dropUntilNLOrEither' w1 w2 = do modify $ \s -> s { asInput = dropUntilNLOrEither w1 w2 $ asInput s } continueScanning startIndentationCounting :: Int -> AlexM TokenVal startIndentationCounting !n = do modify (\s -> set asCommentDepthL 1 $ set asIndentationSizeL (fromIntegral n) s) alexSetNextCode indentCommentCode continueScanning endIndentationCounting :: Int -> AlexM TokenVal endIndentationCounting !n = do alexSetNextCode startCode Newline . (+ n) . fromIntegral <$> gets (view asIndentationSizeL) startIndentComment :: AlexM TokenVal startIndentComment = do void $ modifyCommentDepth (+1) alexSetNextCode indentCommentCode continueScanning startPreprocessorStripping :: AlexM TokenVal startPreprocessorStripping = do void $ modifyPreprocessorDepth (+1) alexSetNextCode stripCppCode continueScanning endPreprocessorStripping :: AlexM TokenVal endPreprocessorStripping = do newDepth <- modifyPreprocessorDepth (\x -> x - 1) when (newDepth == 0) $ alexSetNextCode startCode continueScanning startComment :: AlexM TokenVal startComment = do void $ modifyCommentDepth (+1) alexSetNextCode commentCode continueScanning endComment :: AlexCode -> AlexM TokenVal endComment nextCode = do newDepth <- modifyCommentDepth (\x -> x - 1) when (newDepth == 0) $ alexSetNextCode nextCode continueScanning startQuasiquoter :: AlexInput -> AlexM TokenVal startQuasiquoter AlexInput{aiPtr} = do !haveEnd <- gets (view asHaveQQEndL) isEndPresent <- case haveEnd of Nothing -> do let haveEnd' = checkQuasiQuoteEndPresent aiPtr modify $ set asHaveQQEndL (Just haveEnd') pure haveEnd' Just ends' -> pure ends' case isEndPresent of -- No chance of quasi-quote closing till the end of current file. -- Assume that file ought to be well-formed and treat currently -- matched input (and throw away the pipe character). False -> pure LBracket True -> startUnconditionalQuasiQuoter startUnconditionalQuasiQuoter :: AlexM TokenVal startUnconditionalQuasiQuoter = QuasiquoterStart <$ alexSetNextCode qqCode startSplice :: Context -> AlexM TokenVal startSplice ctx = do alexSetNextCode startCode pushContext ctx pure SpliceStart endQuasiquoter :: AlexM TokenVal endQuasiquoter = QuasiquoterEnd <$ alexSetNextCode startCode pushLParen :: AlexAction pushLParen _ _ = LParen <$ pushContext CtxHaskell popRParen :: AlexAction popRParen _ _ = do cs <- gets asContextStack case cs of [] -> pure () c : cs' -> do modify $ \s -> s { asContextStack = cs' } alexSetNextCode $ case c of CtxHaskell -> startCode CtxQuasiquoter -> qqCode pure RParen {-# INLINE errorAtLine #-} errorAtLine :: MonadState AlexState m => Text -> m TokenVal errorAtLine msg = do line <- gets (unLine . view aiLineL . asInput) return $ Error $ "Error at line " <> T.pack (show line) <> ": " <> msg startLiterateBird :: AlexM () startLiterateBird = do alexSetNextCode startCode alexEnterBirdLiterateEnv startLiterateLatex :: AlexM () startLiterateLatex = do alexSetNextCode startCode alexEnterLiterateLatexEnv endLiterate :: AlexM () endLiterate = do alexSetNextCode literateCode alexExitLiterateEnv endLiterate' :: AlexM TokenVal endLiterate' = do alexSetNextCode literateCode alexExitLiterateEnv continueScanning reservedSymbol :: Char -> AlexM TokenVal reservedSymbol = \case '→' -> pure Arrow '∷' -> pure DoubleColon '⇒' -> pure Implies '∀' -> pure forallTokenVal '⦇' -> pure LBanana '⦈' -> A.pure RBanana '⟦' -> startUnconditionalQuasiQuoter '⟧' -> endQuasiquoter c -> error $ "Unexpected reserved symbol: " ++ show c reservedSymbolQQ :: Char -> AlexM TokenVal reservedSymbolQQ = \case '⟧' -> endQuasiquoter _ -> continueScanning {-# INLINE countBackslashCR #-} countBackslashCR :: AlexInput -> Int countBackslashCR AlexInput{aiPtr} = case unsafeTextHeadAscii aiPtr of -- '\\' 92 -> case unsafeTextHeadOfTailAscii aiPtr of -- '\r' 13 -> 2 _ -> 1 -- '\r' 13 -> 1 _ -> 0 -- Known codes {-# INLINE startCode #-} {-# INLINE qqCode #-} {-# INLINE commentCode #-} {-# INLINE indentCommentCode #-} {-# INLINE indentCountCode #-} {-# INLINE literateCode #-} {-# INLINE stripCppCode #-} startCode, qqCode, commentCode, indentCommentCode, indentCountCode, literateCode, stripCppCode :: AlexCode startCode = AlexCode 0 qqCode = AlexCode qq commentCode = AlexCode comment indentCommentCode = AlexCode indentComment indentCountCode = AlexCode indentCount literateCode = AlexCode literate stripCppCode = AlexCode stripCpp }