{-# LANGUAGE RecordWildCards, BangPatterns #-} -- Most of this module follows the Haskell report, https://www.haskell.org/onlinereport/lexemes.html module Lexer(Lexeme(..), lexer, unlexerFile) where import Data.Char import Data.List import Data.Tuple.Extra -- | A lexeme of text, approx some letters followed by some space. data Lexeme = Lexeme {line :: {-# UNPACK #-} !Int -- ^ 1-based line number (0 = generated) ,col :: {-# UNPACK #-} !Int -- ^ 1-based col number (0 = generated) ,lexeme :: String -- ^ Actual text of the item ,whitespace :: String -- ^ Suffix spaces and comments } deriving Show charNewline x = x == '\r' || x == '\n' || x == '\f' charSpecial x = x `elem` "(),;[]`{}" charAscSymbol x = x `elem` "!#$%&*+./<=>?@\\^|-~" || x == ':' -- special case for me charSymbol x = charAscSymbol x || (isSymbol x && not (charSpecial x) && x `notElem` "_\"\'") charIdentStart x = isAlpha x || x == '_' charIdentCont x = isAlphaNum x || x == '_' || x == '\'' lexer :: String -> [Lexeme] lexer = go1 1 1 where -- we might start with whitespace, before any lexemes go1 line col xs | (whitespace, xs) <- lexerWhitespace xs , whitespace /= "" , (line2, col2) <- reposition line col whitespace = Lexeme{lexeme="", ..} : go line2 col2 xs go1 line col xs = go line col xs go line col "" = [] go line col xs | (lexeme, xs) <- lexerLexeme xs , (whitespace, xs) <- lexerWhitespace xs , (line2, col2) <- reposition line col $ whitespace ++ lexeme = Lexeme{..} : go line2 col2 xs reposition :: Int -> Int -> String -> (Int, Int) reposition = go where go !line !col [] = (line, col) go line col (x:xs) | x == '\n' = go (line+1) 1 xs | x == '\t' = go line (col+8) xs -- technically not totally correct, but please, don't use tabs | otherwise = go line (col+1) xs -- We take a lot of liberties with lexemes around module qualification, because we want to make fields magic -- we ignore numbers entirely because they don't have any impact on what we want to do lexerLexeme :: String -> (String, String) lexerLexeme ('\'':x:'\'':xs) = (['\'',x,'\''], xs) lexerLexeme ('\'':x:xs) | x /= '\'' = ("\'", x:xs) -- might be a data kind, see #25 lexerLexeme (open:xs) | open == '\'' || open == '\"' = seen [open] $ go xs where go (x:xs) | x == open = ([x], xs) | x == '\\', x2:xs <- xs = seen [x,x2] $ go xs | otherwise = seen [x] $ go xs go [] = ([], []) lexerLexeme (x:xs) | charSymbol x , (a, xs) <- span charSymbol xs = (x:a, xs) lexerLexeme (x:xs) | charIdentStart x , (a, xs) <- span charIdentCont xs = (x:a, xs) lexerLexeme (x:xs) = ([x], xs) lexerLexeme [] = ([], []) lexerWhitespace :: String -> (String, String) lexerWhitespace (x:xs) | isSpace x = seen [x] $ lexerWhitespace xs lexerWhitespace ('-':'-':xs) | (a, xs) <- span (== '-') xs , not $ any charSymbol $ take 1 xs , (b, xs) <- break charNewline xs , (c, xs) <- splitAt 1 xs = seen "--" $ seen a $ seen b $ seen c $ lexerWhitespace xs lexerWhitespace ('{':'-':xs) = seen "{-" $ f 1 xs where f 1 ('-':'}':xs) = seen "-}" $ lexerWhitespace xs f i ('{':'-':xs) = seen "{-" $ f (i+1) xs f i (x:xs) = seen [x] $ f i xs f i [] = ([], []) lexerWhitespace xs = ([], xs) seen xs = first (xs++) unlexerFile :: FilePath -> [Lexeme] -> String unlexerFile src xs = dropping 1 ++ go 1 True [(line, lexeme ++ whitespace) | Lexeme{..} <- xs] where go :: Int -> Bool -> [(Int, String)] -> String go doc drp ((i, x):xs) = (if doc /= i && i /= 0 && drp then dropping i else "") ++ x ++ go ((if i == 0 then doc else i) + length (filter (== '\n') x)) ("\n" `isSuffixOf` x) xs go _ _ [] = "" dropping n = "{-# LINE " ++ show n ++ " " ++ show src ++ " #-}\n"