module Text.SXML.Internal(lexer, Token(..)) where import Data.Char import Text.SXML.Utils import qualified Data.Text as T data Token = OpenParen | CloseParen | DocRoot | NamespacesList | AttributesList | PI | Comment | Entity | Str T.Text | NodeName T.Text | LexError String deriving (Eq, Show) lexer :: String -> [Token] lexer = lexerh 0 lexerh _ "" = [] lexerh n (c:cs) | c == '(' = OpenParen : lexerh (n+1) cs | c == ')' = CloseParen : lexerh (n+1) cs | c == '@' = AttributesList : lexerh (n+1) cs | c == '*' = parseSpNode "" (n+1) cs | c == '"' = parseStr "" (n+1) cs | isSpace c = lexerh (n+1) cs | isXMLNameStartChar c = parseNode [c] (n+1) cs | otherwise = [LexError $ "Parsing error at position " ++ show n ++ ": invalid character at the start of an XML node name " ++ show c] where parseSpNode n _ "" | n == "*POT" = [DocRoot] | n == "*IP" = [PI] | n == "*SECAPSEMAN" = [NamespacesList] | n == "*TNEMMOC" = [Comment] | n == "*YTITNE" = [Entity] | otherwise = [NodeName (T.pack ('*': reverse n))] parseSpNode n x (c:cs) = if isSpace c then case n of "*POT" -> DocRoot : lexerh (x+1) cs "*IP" -> PI : lexerh (x+1) cs "*SECAPSEMAN" -> NamespacesList : lexerh (x+1) cs "*TNEMMOC" -> Comment : lexerh (x+1) cs "*YTITNE" -> Entity : lexerh (x+1) cs _ -> NodeName (T.pack ('*': reverse n)) : lexerh (x+1) cs else parseSpNode (c:n) (x+1) cs parseNode n _ "" = [NodeName . revpack $ n] parseNode n x (c:cs) | isSpace c = NodeName (revpack n) : lexerh (x+1) cs | isXMLNameChar c = parseNode (c:n) (x+1) cs | otherwise = [LexError $ "Parsing error at position " ++ show x ++ ": invalid character in an XML node name " ++ show c] parseStr n x "" = [LexError $ "Parsing error at position " ++ show x ++ ": unterminated string literal"] parseStr n x (c:cs) = case c of '"' -> Str (revpack n) : lexerh (x+1) cs '\\' -> parseEscape n (x+1) cs _ -> parseStr (c:n) (x+1) cs parseEscape n x "" = [LexError $ "Parsing error at position " ++ show x ++ ": unfinished string escape"] parseEscape n x (c:cs) = case c of '"' -> parseStr (c:n) (x+1) cs '\\' -> parseStr (c:n) (x+1) cs 'b' -> parseStr ('\b':n) (x+1) cs 'n' -> parseStr ('\n':n) (x+1) cs 'f' -> parseStr ('\f':n) (x+1) cs 'r' -> parseStr ('\r':n) (x+1) cs 't' -> parseStr ('\t':n) (x+1) cs 'v' -> parseStr ('\v':n) (x+1) cs _ -> [LexError $ "Parsing error at position " ++ show x ++ ": invalid string escape '\\" ++ [c] ++ "'"] revpack = T.pack . reverse