-- | This is another hand-written lexer, this time for the Xtract
--   command-language.  The entry point is lexXtract.  You don't
--   normally need to use this module directly - the lexer is called
--   automatically by the parser.  (We only expose this interface
--   for debugging purposes.)
--
--   The Xtract command language is very like the XPath specification.

module Text.XML.HaXml.Xtract.Lex
  ( lexXtract
  , Posn(..)
  , TokenT(..)
  , Token
  ) where

import Char
import List(isPrefixOf)


type Token = Either String (Posn, TokenT)

data Posn = Pn Int		-- char index only
        deriving Eq

instance Show Posn where
      showsPrec p (Pn c) = showString "char pos " . shows c

data TokenT =
      Symbol String
    | TokString String		--     begins with letter
    | TokNum Integer		--     begins with digit
    deriving Eq

instance Show TokenT where
    showsPrec p (Symbol s) = showString s
    showsPrec p (TokString s) = showString s
    showsPrec p (TokNum n) = shows n

emit :: TokenT -> Posn -> Token
emit tok p = forcep p `seq` Right (p,tok)
  where forcep (Pn n) = n

lexerror :: String -> Posn -> [Token]
lexerror s p = [Left ("Lexical error in selection pattern at "++show p++": "
                       ++s++"\n")]

addcol :: Int -> Posn -> Posn
addcol n (Pn c) = Pn (c+n)

newline, tab :: Posn -> Posn
newline (Pn c) = Pn (c+1)
tab     (Pn c) = Pn (((c`div`8)+1)*8)

white :: Char -> Posn -> Posn
white '\t' = tab
white ' '  = addcol 1
white '\n' = addcol 1
white '\r' = addcol 1
white '\xa0' = addcol 1

blank :: (Posn->String->[Token]) -> Posn-> String-> [Token]
blank k p []       = []
blank k p (' ': s) = blank k (addcol 1 p) s
blank k p ('\t':s) = blank k (tab p) s
blank k p ('\n':s) = blank k (newline p) s
blank k p ('\r':s) = blank k p s
blank k p ('\xa0': s) = blank k (addcol 1 p) s
blank k p    s     = k p s

----
lexXtract :: String -> [Token]
lexXtract = selAny (Pn 1)

syms = "/[]()@,=*&|~$+-<>"

selAny :: Posn -> String -> [Token]
selAny p [] = []
selAny p ('/':ss)
    | '/' == head ss  = emit (Symbol "//") p:  selAny (addcol 2 p) (tail ss)
selAny p ('!':ss)
    | '=' == head ss  = emit (Symbol "!=") p:  selAny (addcol 2 p) (tail ss)
selAny p ('<':ss)
    | '=' == head ss  = emit (Symbol "<=") p:  selAny (addcol 2 p) (tail ss)
selAny p ('>':ss)
    | '=' == head ss  = emit (Symbol ">=") p:  selAny (addcol 2 p) (tail ss)
selAny p ('\'':ss)    = emit (Symbol "'") p:
                        accumulateUntil '\'' (Symbol "'") [] p (addcol 1 p) ss selAny
selAny p ('"':ss)     = emit (Symbol "\"") p:
                        accumulateUntil '"' (Symbol "\"") [] p (addcol 1 p) ss selAny
selAny p ('_':ss)     = gatherName "_" p (addcol 1 p) ss (blank selAny)
selAny p (':':ss)     = gatherName ":" p (addcol 1 p) ss (blank selAny)
selAny p ('.':ss)
    | "=."  `isPrefixOf` ss  = emit (Symbol ".=.") p:  selAny (addcol 3 p) (drop 2 ss)
    | "!=." `isPrefixOf` ss  = emit (Symbol ".!=.") p: selAny (addcol 4 p) (drop 3 ss)
    | "<."  `isPrefixOf` ss  = emit (Symbol ".<.") p:  selAny (addcol 3 p) (drop 2 ss)
    | "<=." `isPrefixOf` ss  = emit (Symbol ".<=.") p: selAny (addcol 4 p) (drop 3 ss)
    | ">."  `isPrefixOf` ss  = emit (Symbol ".>.") p:  selAny (addcol 3 p) (drop 2 ss)
    | ">=." `isPrefixOf` ss  = emit (Symbol ".>=.") p: selAny (addcol 4 p) (drop 3 ss)
    | "/"   `isPrefixOf` ss  = emit (Symbol "./") p: selAny (addcol 2 p) (drop 1 ss)
selAny p (s:ss)
    | s `elem` syms   = emit (Symbol [s]) p:     selAny (addcol 1 p) ss
    | isSpace s       = blank selAny p (s:ss)
    | isAlpha s       = gatherName [s] p (addcol 1 p) ss (blank selAny)
    | isDigit s       = gatherNum  [s] p (addcol 1 p) ss (blank selAny)
    | otherwise       = lexerror "unrecognised pattern" p

gatherName acc pos p (s:ss) k
  | isAlphaNum s || s `elem` "-_:" = gatherName (s:acc) pos (addcol 1 p) ss k
gatherName acc pos p ss k =
  emit (TokString (reverse acc)) pos: k p ss

gatherNum acc pos p (s:ss) k
  | isHexDigit s = gatherNum (s:acc) pos (addcol 1 p) ss k
gatherNum acc pos p ss k =
  emit (TokNum (read (reverse acc))) pos: k p ss

accumulateUntil c tok acc pos  p  [] k =
    lexerror ("found end of pattern while looking for "++c
              :" to match opening quote at "++show pos) p
accumulateUntil c tok acc pos  p (s:ss) k
    | c==s       = emit (TokString (reverse acc)) pos:
                                  emit tok p: k (addcol 1 p) ss
    | isSpace s  = accumulateUntil c tok (s:acc) pos (white s p) ss k
    | otherwise  = accumulateUntil c tok (s:acc) pos (addcol 1 p) ss k