module Text.SXML.Internal(lexer, Token(..)) where

import Data.Char
import Text.SXML.Utils
import qualified Data.Text as T

data Token = OpenParen | CloseParen | DocRoot | NamespacesList | AttributesList | PI | Comment | Entity | Str T.Text | NodeName T.Text | LexError String deriving (Eq, Show)

lexer :: String -> [Token]
lexer = lexerh 0

lexerh _ "" = []
lexerh n (c:cs)
  | c == '(' = OpenParen : lexerh (n+1) cs
  | c == ')' = CloseParen : lexerh (n+1) cs
  | c == '@' = AttributesList : lexerh (n+1) cs
  | c == '*' = parseSpNode "" (n+1) cs
  | c == '"' = parseStr "" (n+1) cs
  | isSpace c = lexerh (n+1) cs
  | isXMLNameStartChar c = parseNode [c] (n+1) cs
  | otherwise = [LexError $ "Parsing error at position " ++ show n ++ ": invalid character at the start of an XML node name " ++ show c]
  where parseSpNode n _ ""
          | n == "*POT" = [DocRoot]
          | n == "*IP" = [PI]
          | n == "*SECAPSEMAN" = [NamespacesList]
          | n == "*TNEMMOC" = [Comment]
          | n == "*YTITNE" = [Entity]
          | otherwise = [NodeName (T.pack ('*': reverse n))]
        parseSpNode n x (c:cs) =
          if isSpace c
            then case n of
                   "*POT" -> DocRoot : lexerh (x+1) cs
                   "*IP" -> PI : lexerh (x+1) cs
                   "*SECAPSEMAN" -> NamespacesList : lexerh (x+1) cs
                   "*TNEMMOC" -> Comment : lexerh (x+1) cs
                   "*YTITNE" -> Entity : lexerh (x+1) cs
                   _ -> NodeName (T.pack ('*': reverse n)) : lexerh (x+1) cs
            else parseSpNode (c:n) (x+1) cs
        parseNode n _ "" = [NodeName . revpack $ n]
        parseNode n x (c:cs)
          | isSpace c = NodeName (revpack n) : lexerh (x+1) cs
          | isXMLNameChar c = parseNode (c:n) (x+1) cs
          | otherwise = [LexError $ "Parsing error at position " ++ show x ++ ": invalid character in an XML node name " ++ show c]
        parseStr n x "" = [LexError $ "Parsing error at position " ++ show x ++ ": unterminated string literal"]
        parseStr n x (c:cs) =
          case c of
            '"' -> Str (revpack n) : lexerh (x+1) cs
            '\\' -> parseEscape n (x+1) cs
            _ -> parseStr (c:n) (x+1) cs
        parseEscape n x "" = [LexError $ "Parsing error at position " ++ show x ++ ": unfinished string escape"]
        parseEscape n x (c:cs) =
          case c of
            '"' -> parseStr (c:n) (x+1) cs
            '\\' -> parseStr (c:n) (x+1) cs
            'b' -> parseStr ('\b':n) (x+1) cs
            'n' -> parseStr ('\n':n) (x+1) cs
            'f' -> parseStr ('\f':n) (x+1) cs
            'r' -> parseStr ('\r':n) (x+1) cs
            't' -> parseStr ('\t':n) (x+1) cs
            'v' -> parseStr ('\v':n) (x+1) cs
            _ -> [LexError $ "Parsing error at position " ++ show x ++ ": invalid string escape '\\" ++ [c] ++ "'"]
        revpack = T.pack . reverse