module Hydrogen.Syntax.Parser.TokenParser where import Hydrogen.Prelude hiding ((<|>), many) import Hydrogen.Syntax.Types import Hydrogen.Parsing.Char hiding (token, tokens, space) data TokenizerConfig = TokenizerConfig { cfgSpaces :: [Char] , cfgEscapes :: [(Char, Char)] , cfgComments :: String , cfgOpeningBraces :: [Char] , cfgClosingBraces :: [Char] , cfgControlCharacters :: [Char] , cfgSpecialCharacters :: [Char] , cfgSpecialSemantics :: [Char] , cfgFoldSpecials :: [Char] } deriving (Eq, Show) instance Default TokenizerConfig where def = TokenizerConfig { cfgSpaces = " " , cfgEscapes = [ ('\\', '\\'), ('\'', '\''), ('"', '"'), ('`', '`'), ('0', '\0'), ('a', '\a'), ('b', '\b'), ('f', '\f'), ('n', '\n'), ('r', '\r'), ('t', '\t'), ('v', '\v') ] , cfgComments = ";;" , cfgOpeningBraces = "([{" , cfgClosingBraces = ")]}" , cfgControlCharacters = ['\0' .. '\x1F'] , cfgSpecialCharacters = ",;" , cfgSpecialSemantics = "{;" , cfgFoldSpecials = ";" } parseTokens :: TokenizerConfig -> String -> Parser String (Tokens Token) parseTokens cfg src = either mkError sanitize . runIdentity . runParserT tokens () src where TokenizerConfig { .. } = cfg tokens = many (liftA2 (,) getPosition token) <* eof token = choice [ TIndent <$> indent , something >>= \x -> choice [ TString x '\'' <$> chars3 "'''" , TString x '\"' <$> chars3 "\"\"\"" , TString x '`' <$> chars3 "```" , TString x '\'' <$> chars' '\'' , TString x '\"' <$> chars' '\"' , TString x '`' <$> chars' '`' , TSomething <$> return x ] , TString "" '\'' <$> chars3 "'''" , TString "" '\"' <$> chars3 "\"\"\"" , TString "" '`' <$> chars3 "```" , TString "" '\'' <$> chars '\'' , TString "" '\"' <$> chars '\"' , TString "" '`' <$> chars' '`' , const TSpaces <$> many1 space , TSpecial <$> oneOf cfgSpecialCharacters , TBraceOpen "" <$> oneOf cfgOpeningBraces , TBraceClose <$> oneOf cfgClosingBraces ] space = oneOf cfgSpaces forbidden = join [ cfgOpeningBraces , cfgClosingBraces , "\'\"`" , cfgSpecialCharacters , cfgSpaces , cfgControlCharacters ] something = many1 (noneOf forbidden) indent = many1 (try (many space >> (nl <|> comment))) *> (length <$> many space) where nl = string "\n" comment = try (string cfgComments >> many (noneOf "\n") <* char '\n') chars' d = inBetween (char d) (many (try (char '\\' >> char d) <|> noneOf (d : "\n"))) chars d = inBetween (char d) (many (noneOf (d : "\\\n") <|> escape)) chars3 d = manyBetween (try (string d)) (try (string d)) (noneOf "\\" <|> escape) inBetween d p = d *> p <* d escape = char '\\' >> choice [ special <$> oneOf (map fst cfgEscapes) , char 'x' >> (convert <$> count 2 hexDigit) , char 'u' >> (convert <$> count 4 hexDigit) , char 'U' >> (convert <$> count 8 hexDigit) ] where special = fromJust . flip lookup cfgEscapes convert str = let [(hex, _)] = readHex str in toEnum hex sanitize :: [(SourcePos, Token)] -> Either SomethingBad (Tokens Token) sanitize (t1@(p1, v1) : ts@(_t2@(p2, v2) : tss)) = case (v1, v2) of (TSomething k, TBraceOpen _ t) -> sanitize ((p1, TBraceOpen k t) : tss) (TString _ _ _, TSomething _) -> Left [(p2, "")] (TString _ _ _, TBraceOpen _ _) -> Left [(p2, "")] (TBraceClose _, TSomething _) -> Left [(p2, "")] (TBraceClose _, TString _ _ _) -> Left [(p2, "")] (TBraceClose _, TBraceOpen _ _) -> Left [(p2, "")] (TBraceOpen _ _, TIndent _) -> sanitize (t1 : tss) (TBraceOpen _ _, TSpaces) -> sanitize (t1 : tss) (TBraceOpen _ _, _) -> fmap ([t1, (p2, TIndent (sourceColumn p2 - 1))] ++) (sanitize ts) (TSpaces, _) -> sanitize ts (TSpecial _, TSpaces) -> sanitize (t1 : tss) (TSpecial x1, TSpecial x2) | x1 == x2 && x1 `elem` cfgFoldSpecials -> sanitize ts (TIndent _, TBraceOpen _ b) | b `elem` cfgSpecialSemantics -> sanitize ts (TIndent _, TBraceClose _) -> sanitize ts (TIndent _, TSpecial x) | x `elem` cfgSpecialSemantics -> sanitize ts (TSpecial x, TIndent _) | x `elem` cfgSpecialSemantics -> sanitize (t1 : tss) _ -> fmap (t1 :) (sanitize ts) sanitize [t@(_, v)] = Right $ case v of TSpaces -> [] TIndent _ -> [] TSpecial x | x `elem` cfgSpecialSemantics -> [] _ -> [t] sanitize [] = Right []