module Hydrogen.Syntax.Parser.TokenParser where
import Hydrogen.Prelude hiding ((<|>), many)
import Hydrogen.Syntax.Types
import Hydrogen.Parsing.Char hiding (token, tokens, space)
data TokenizerConfig = TokenizerConfig {
cfgSpaces :: [Char]
, cfgEscapes :: [(Char, Char)]
, cfgComments :: String
, cfgOpeningBraces :: [Char]
, cfgClosingBraces :: [Char]
, cfgControlCharacters :: [Char]
, cfgSpecialCharacters :: [Char]
, cfgSpecialSemantics :: [Char]
, cfgFoldSpecials :: [Char]
} deriving (Eq, Show)
instance Default TokenizerConfig where
def = TokenizerConfig {
cfgSpaces = " "
, cfgEscapes = [
('\\', '\\'),
('\'', '\''),
('"', '"'),
('`', '`'),
('0', '\0'),
('a', '\a'),
('b', '\b'),
('f', '\f'),
('n', '\n'),
('r', '\r'),
('t', '\t'),
('v', '\v') ]
, cfgComments = ";;"
, cfgOpeningBraces = "([{"
, cfgClosingBraces = ")]}"
, cfgControlCharacters = ['\0' .. '\x1F']
, cfgSpecialCharacters = ",;"
, cfgSpecialSemantics = "{;"
, cfgFoldSpecials = ";"
}
parseTokens :: TokenizerConfig -> String -> Parser String (Tokens Token)
parseTokens cfg src = either mkError sanitize . runIdentity . runParserT tokens () src
where
TokenizerConfig { .. } = cfg
tokens = many (liftA2 (,) getPosition token) <* eof
token = choice [
TIndent <$> indent
, something >>= \x -> choice [
TString x '\'' <$> chars3 "'''"
, TString x '\"' <$> chars3 "\"\"\""
, TString x '`' <$> chars3 "```"
, TString x '\'' <$> chars' '\''
, TString x '\"' <$> chars' '\"'
, TString x '`' <$> chars' '`'
, TSomething <$> return x
]
, TString "" '\'' <$> chars3 "'''"
, TString "" '\"' <$> chars3 "\"\"\""
, TString "" '`' <$> chars3 "```"
, TString "" '\'' <$> chars '\''
, TString "" '\"' <$> chars '\"'
, TString "" '`' <$> chars' '`'
, const TSpaces <$> many1 space
, TSpecial <$> oneOf cfgSpecialCharacters
, TBraceOpen "" <$> oneOf cfgOpeningBraces
, TBraceClose <$> oneOf cfgClosingBraces
]
space = oneOf cfgSpaces
forbidden = join [
cfgOpeningBraces
, cfgClosingBraces
, "\'\"`"
, cfgSpecialCharacters
, cfgSpaces
, cfgControlCharacters
]
something = many1 (noneOf forbidden)
indent = many1 (try (many space >> (nl <|> comment))) *> (length <$> many space)
where
nl = string "\n"
comment = try (string cfgComments >> many (noneOf "\n") <* char '\n')
chars' d = inBetween (char d) (many (try (char '\\' >> char d) <|> noneOf (d : "\n")))
chars d = inBetween (char d) (many (noneOf (d : "\\\n") <|> escape))
chars3 d = manyBetween (try (string d)) (try (string d)) (noneOf "\\" <|> escape)
inBetween d p = d *> p <* d
escape = char '\\' >> choice [
special <$> oneOf (map fst cfgEscapes)
, char 'x' >> (convert <$> count 2 hexDigit)
, char 'u' >> (convert <$> count 4 hexDigit)
, char 'U' >> (convert <$> count 8 hexDigit)
]
where
special = fromJust . flip lookup cfgEscapes
convert str = let [(hex, _)] = readHex str in toEnum hex
sanitize :: [(SourcePos, Token)] -> Either SomethingBad (Tokens Token)
sanitize (t1@(p1, v1) : ts@(_t2@(p2, v2) : tss)) = case (v1, v2) of
(TSomething k, TBraceOpen _ t) -> sanitize ((p1, TBraceOpen k t) : tss)
(TString _ _ _, TSomething _) -> Left [(p2, "")]
(TString _ _ _, TBraceOpen _ _) -> Left [(p2, "")]
(TBraceClose _, TSomething _) -> Left [(p2, "")]
(TBraceClose _, TString _ _ _) -> Left [(p2, "")]
(TBraceClose _, TBraceOpen _ _) -> Left [(p2, "")]
(TBraceOpen _ _, TIndent _) -> sanitize (t1 : tss)
(TBraceOpen _ _, TSpaces) -> sanitize (t1 : tss)
(TBraceOpen _ _, _) ->
fmap ([t1, (p2, TIndent (sourceColumn p2 1))] ++) (sanitize ts)
(TSpaces, _) -> sanitize ts
(TSpecial _, TSpaces) -> sanitize (t1 : tss)
(TSpecial x1, TSpecial x2) | x1 == x2 && x1 `elem` cfgFoldSpecials -> sanitize ts
(TIndent _, TBraceOpen _ b) | b `elem` cfgSpecialSemantics -> sanitize ts
(TIndent _, TBraceClose _) -> sanitize ts
(TIndent _, TSpecial x) | x `elem` cfgSpecialSemantics -> sanitize ts
(TSpecial x, TIndent _) | x `elem` cfgSpecialSemantics -> sanitize (t1 : tss)
_ -> fmap (t1 :) (sanitize ts)
sanitize [t@(_, v)] = Right $ case v of
TSpaces -> []
TIndent _ -> []
TSpecial x | x `elem` cfgSpecialSemantics -> []
_ -> [t]
sanitize [] = Right []