{-# LANGUAGE BangPatterns, CPP, OverloadedStrings, ViewPatterns #-} -- | Tokenization breaks a 'String' into pieces of whitespace, -- constants, symbols, and identifiers. module Hpp.Tokens (Token(..), detok, isImportant, notImportant, importants, trimUnimportant, detokenize, tokenize, newLine, skipLiteral) where import Control.Arrow (first, second) import Data.Char (isAlphaNum, isDigit, isSpace, isOctDigit, isHexDigit, digitToInt) import Data.Foldable (foldl') #if __GLASGOW_HASKELL__ < 808 import Data.Monoid ((<>)) #endif import Data.String (IsString, fromString) import Hpp.StringSig -- | Tokenization is 'words' except the white space is tagged rather -- than discarded. data Token s = Important s -- ^ Identifiers, symbols, and constants | Other s -- ^ White space, etc. deriving (Eq,Ord,Show) instance Functor Token where fmap f (Important s) = Important (f s) fmap f (Other s) = Other (f s) {-# INLINE fmap #-} -- | Extract the contents of a 'Token'. detok :: Token s -> s detok (Important s) = s detok (Other s) = s {-# INLINE detok #-} -- | 'True' if the given 'Token' is 'Important'; 'False' otherwise. isImportant :: Token s -> Bool isImportant (Important _) = True isImportant _ = False -- | 'True' if the given 'Token' is /not/ 'Important'; 'False' -- otherwise. notImportant :: Token s -> Bool notImportant (Other _) = True notImportant _ = False -- | Return the contents of only 'Important' (non-space) tokens. importants :: [Token s] -> [s] importants = map detok . filter isImportant -- | Trim 'Other' 'Token's from both ends of a list of 'Token's. trimUnimportant :: [Token s] -> [Token s] trimUnimportant = aux id . dropWhile (not . isImportant) where aux _ [] = [] aux acc (t@(Important _) : ts) = acc (t : aux id ts) aux acc (t@(Other _) : ts) = aux (acc . (t:)) ts -- | Is a 'Token' a newline character? newLine :: (Eq s, IsString s) => Token s -> Bool newLine (Other s) = s == "\n" newLine _ = False maybeImp :: Stringy s => s -> [Token s] maybeImp s = if isEmpty s then [] else [Important s] digitsFromBase :: Stringy s => Int -> s -> s digitsFromBase base = fromString . show . foldl' aux 0 . map digitToInt . toChars where aux acc d = base * acc + d escapeChar :: Stringy s => Char -> Maybe s escapeChar = fmap fromString . flip lookup lut where lut = map (second (show :: Int -> String)) [ ('a', 0x07), ('b', 0x08), ('f', 0x0C), ('n', 0x0A) , ('r', 0x0D), ('t', 0x09), ('v', 0x0B), ('\\', 0x5C) , ('\'', 0x27), ('"', 0x22), ('?', 0x3F) ] data TokChar = TokSpace Char | TokQuote | TokDQuote -- | Break a 'String' into space and non-whitespace runs. tokWords :: Stringy s => s -> [Token s] tokWords s = case sbreak aux s of -- No word breaks Nothing -> [Important s] -- Word delimited by space Just (TokSpace c, pre, pos) -> case sbreak (predicateJust (not . isSpace)) pos of Nothing -> maybeImp pre ++ [Other (cons c pos)] Just (c', spaces, pos') -> maybeImp pre ++ Other (cons c spaces) : tokWords (cons c' pos') -- Possible character literal Just (TokQuote, pre, pos) -> let pre' = snoc pre '\'' in case pos of '\\' :. cs -> case sbreak (boolJust . (== '\'')) cs of Nothing -> [Important (pre' <> pos)] Just (_,esc,pos') | isEmpty esc -> case sbreak (boolJust . (== '\'')) pos' of Just (_,esc', pos'') | isEmpty esc' -> Important pre : Important ("'\\\''") : tokWords pos'' -- Important (fromJust $ escapeChar '\'') : tokWords pos'' _ -> [Important (pre' <> pos)] | otherwise -> let esc' = if sall isOctDigit esc then Important (digitsFromBase 8 esc) else case esc of 'x' :. hs | sall isHexDigit hs -> Important (digitsFromBase 16 hs) (escapeChar -> Just e) :. Nil -> Important e _ -> Important ("'\\" <> snoc esc '\'') in maybeImp pre ++ esc' : tokWords pos' c:.('\'':.cs) -> maybeImp pre ++ Important (fromString ['\'', c, '\'']) : tokWords cs _:._ -> let oops = snoc pre '\'' in case tokWords pos of (Important t:ts) -> Important (oops<>t) : ts ts -> Important oops : ts _ -> [Important (snoc pre '\'')] -- String literal Just (TokDQuote, pre, pos) -> let (lit,pos') = skipLiteral pos in (if isEmpty pre then [] else [Important pre]) ++ Important (cons '"' lit) : tokWords pos' where aux c | isSpace c = Just (TokSpace c) | c == '\'' = Just TokQuote | c == '"' = Just TokDQuote | otherwise = Nothing {-# INLINE aux #-} {-# INLINABLE tokWords #-} data LitStringChar = DBackSlash | EscapedDQuote | DQuote -- | Skip over a string or character literal returning the literal and -- the remaining the input. skipLiteral :: Stringy s => s -> (s,s) skipLiteral s = case breakOn [("\\\\", DBackSlash), ("\\\"", EscapedDQuote), ("\"", DQuote)] s of Nothing -> (s, mempty) -- Unmatched double quote?! Just (DBackSlash, pre, pos) -> first ((pre <> "\\\\") <>) (skipLiteral pos) Just (EscapedDQuote, pre, pos) -> first ((pre <> "\\\"") <>) (skipLiteral pos) Just (DQuote, pre, pos) -> (snoc pre '"', pos) {-# INLINABLE skipLiteral #-} -- | @splits isDelimiter str@ tokenizes @str@ using @isDelimiter@ as a -- delimiter predicate. Leading whitespace is also stripped from -- tokens. splits :: Stringy s => (Char -> Bool) -> s -> [s] splits isDelim = filter (not . isEmpty) . go . sdropWhile isSpace where go s = case sbreak (\c -> if isDelim c then Just c else Nothing) s of Nothing -> [s] Just (d, pre, pos) -> pre : fromString [d] : go (sdropWhile isSpace pos) {-# INLINE splits #-} -- | Predicate on space characters based on something approximating -- valid identifier syntax. This is used to break apart non-space -- characters. validIdentifierChar :: Char -> Bool validIdentifierChar c = isAlphaNum c || c == '_' || c == '\'' -- | Something like @12E+FOO@ is a single pre-processor token, so -- @FOO@ should not be macro expanded. fixExponents :: Stringy s => [Token s] -> [Token s] fixExponents [] = [] fixExponents (t1'@(Important t1) : ts@(Important t2 : Important t3 : ts')) = case (,,,) <$> uncons t1 <*> unsnoc t1 <*> uncons t2 <*> uncons t3 of Just !(!(!d1,_), !(_,!e), !(!c,!cs), !(!d2,_)) | elem c ("-+" :: [Char]) && isEmpty cs && isDigit d1 && isAlphaNum d2 && elem e ("eE" :: [Char]) -> let t = t1 <> t2 <> t3 in t `seq` Important t : fixExponents ts' _ -> t1' : fixExponents ts fixExponents (t:ts) = t : fixExponents ts {-# INLINABLE fixExponents #-} -- | Break an input 'String' into a sequence of 'Tokens'. Warning: -- This may not exactly correspond to your target language's -- definition of a valid identifier! tokenize :: Stringy s => s -> [Token s] tokenize = fixExponents . foldMap seps . tokWords where seps t@(Other _) = [t] seps t@(Important s) = case uncons s of Nothing -> [] Just (c,_) | c == '"' -> [t] | c == '\'' -> [t] | otherwise -> map Important (splits (not . validIdentifierChar) s) {-# INLINABLE tokenize #-} -- | Collapse a sequence of 'Tokens' back into a 'String'. @detokenize -- . tokenize == id@. detokenize :: Monoid s => [Token s] -> s detokenize = foldMap detok {-# INLINE detokenize #-}