-- | Reference lexer for the core language parser. Slow but simple.
--
--   The lexers here all use 'String' in place of a real name type.
--   After applying these functions to the program text, we need
--   to use `renameTok` to convert the strings in `TokNamed` tokens
--   into the name type specific to the language fragment to be parsed.
--
module DDC.Core.Lexer
        ( module DDC.Core.Lexer.Tokens
        , Located (..)

          -- * Lexer
        , lexModuleWithOffside
        , lexExp)
where
import DDC.Core.Lexer.Token.Builtin
import DDC.Core.Lexer.Token.Index
import DDC.Core.Lexer.Token.Keyword
import DDC.Core.Lexer.Token.Literal
import DDC.Core.Lexer.Token.Names
import DDC.Core.Lexer.Token.Operator
import DDC.Core.Lexer.Token.Symbol
import DDC.Core.Lexer.Offside
import DDC.Core.Lexer.Tokens
import DDC.Data.SourcePos
import Data.Text                                (Text)
import qualified System.IO.Unsafe               as System
import qualified Text.Lexer.Inchworm.Char       as I
import qualified Data.Text                      as Text


-- Module -----------------------------------------------------------------------------------------
-- | Lex a module and apply the offside rule.
--
--   Automatically drop comments from the token stream along the way.
--
lexModuleWithOffside
        :: FilePath     -- ^ Path to source file, for error messages.
        -> Int          -- ^ Starting line number.
        -> String       -- ^ String containing program text.
        -> [Located (Token String)]

lexModuleWithOffside sourceName lineStart str
 = applyOffside [] []
 $ addStarts
 $ dropUnused
 $ lexText sourceName lineStart
 $ Text.pack str
 where
        dropUnused ts
         = case ts of
                []      -> []
                Located _ (KM KComment{}) : ts'
                        -> dropUnused ts'
                t : ts' -> t : dropUnused ts'


-- Exp --------------------------------------------------------------------------------------------
-- | Lex a string into tokens.
--
--   Automatically drop comments from the token stream along the way.
--
lexExp  :: FilePath     -- ^ Path to source file, for error messages.
        -> Int          -- ^ Starting line number.
        -> String       -- ^ String containing program text.
        -> [Located (Token String)]

lexExp sourceName lineStart str
 = dropUnused
 $ lexText sourceName lineStart
 $ Text.pack str
 where
        dropUnused ts
         = case ts of
                []      -> []
                Located _ (KM KComment{}) : ts'
                        -> dropUnused ts'
                Located _ (KM KNewLine{}) : ts'
                        -> dropUnused ts'
                t : ts' -> t : dropUnused ts'


-- Generic ----------------------------------------------------------------------------------------
-- Tokenize some input text.
--
--   NOTE: Although the main interface for the lexer uses standard Haskell
--   strings, we're using Text internally to get proper unicode tokenization.
--   Eventually, we should refactor the API to only pass around Text,
--   rather than Strings.
--
lexText :: String       -- ^ Name of source file, which is attached to the tokens.
        -> Int          -- ^ Starting line number.
        -> Text         -- ^ Text to tokenize.
        -> [Located (Token String)]

lexText filePath nStart txt
 = let  (toks, locEnd, strLeftover)
                = System.unsafePerformIO
                $ I.scanListIO
                        (I.Location nStart 1)
                        I.bumpLocationWithChar
                        (Text.unpack txt)
                        (scanner filePath)

        I.Location lineEnd colEnd = locEnd
        spEnd   = SourcePos filePath lineEnd colEnd

   in   case strLeftover of
         []     -> toks
         str    -> toks ++ [Located spEnd (KErrorJunk (take 10 str))]


-- | Scanner for core tokens.
type Scanner a
        = I.Scanner IO I.Location [Char] a
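
-- A minimal usage sketch, not part of the original module: lexing a small
-- expression with 'lexExp' and printing the raw tokens. The file name and
-- program text are assumptions for illustration, and printing assumes a
-- 'Show' instance for the token type.
--
-- > demoLexExp :: IO ()
-- > demoLexExp
-- >  = mapM_ print
-- >         [ tok | Located _ tok <- lexExp "Demo.dcs" 1 "\\(x : Unit). x" ]
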
----------------------------------------------------------------------------------------------------
-- | Scanner for source and core files.
--
--   The lexical structure for source and core is a bit different,
--   but close enough that there's no point writing a separate lexer yet.
--
scanner :: FilePath -> Scanner (Located (Token String))
scanner fileName
 = let  stamp   :: (I.Location, a) -> Located a
        stamp (I.Location line col, token)
                = Located (SourcePos fileName line col) token
        {-# INLINE stamp #-}

        stamp'  :: (a -> b) -> (I.Location, a) -> Located b
        stamp' k (I.Location line col, token)
                = Located (SourcePos fileName line col) (k token)
        {-# INLINE stamp' #-}

   in I.skip (\c -> c == ' ' || c == '\t')
    $ I.alts
        [ -- Newlines are scanned into their own tokens because
          -- the transform that manages the offside rule uses them.
          fmap stamp
           $ I.from (\c -> case c of
                                '\n'    -> Just $ KM KNewLine
                                _       -> Nothing)

          -- Scan comments into their own tokens;
          -- these are then dropped by the 'dropUnused' functions above.
        , fmap (stamp' (KM . KComment)) $ I.scanHaskellCommentLine
        , fmap (stamp' (KM . KComment)) $ I.scanHaskellCommentBlock

          -- deBruijn indices.
          -- Needs to come before scanSymbol, as '^' is also an operator.
        , fmap (stamp' (KA . KIndex))   $ scanIndex

          -- Literal values.
        , fmap (stamp' (\(l, b) -> KA (KLiteral l b))) $ scanLiteral

          -- Infix operators.
          -- Needs to come before scanSymbol because operators like "=="
          -- are parsed atomically, rather than as two separate '=' symbols.
        , fmap (stamp' (KA . KOp))      $ scanInfixOperator

          -- Prefix operators.
        , fmap (stamp' (KA . KOpVar))   $ scanPrefixOperator

          -- The unit value.
          -- Needs to come before scanSymbol because the "()" lexeme
          -- is parsed atomically, rather than as separate '(' and ')' symbols.
        , fmap stamp
           $ I.froms (Just 2) (\ss -> if ss == "()"
                                        then Just (KA $ KBuiltin $ BDaConUnit)
                                        else Nothing)

          -- Symbolic tokens like punctuation.
        , fmap (stamp' (KA . KSymbol))  $ scanSymbol

          -- Named things.
          -- Keywords have the same lexical structure as variables, as they
          -- all start with a lower-case letter, so we need to check for
          -- keywords before accepting a variable.
        , fmap (stamp' (KA . KBuiltin)) $ scanBuiltin
        , fmap (stamp' (KA . KKeyword)) $ scanKeyword
        , fmap (stamp' (KN . KCon))     $ scanConName
        , fmap (stamp' (KN . KVar))     $ scanVarName
        ]
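
-- A hypothetical smoke test, not part of the original module: lex a tiny
-- module body with 'lexModuleWithOffside' and count the resulting tokens.
-- The module text is an assumption for illustration; counting the tokens
-- avoids needing a 'Show' instance.
--
-- > demoOffside :: Int
-- > demoOffside
-- >  = length
-- >  $ lexModuleWithOffside "Demo.dcs" 1
-- >         "module Demo where\nfoo = ()\nbar = ()"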