module XMLScanner
( Delimiter(..)
, pcdataMode, markupMode
, isNMCHAR, isSEPCHAR
, expandReferences
) where
import XML
import Char
import qualified DTD
isSEPCHAR, isNMCHAR, isNMSTART :: Char -> Bool
isSEPCHAR = isSpace
isNMCHAR c = isAlphaNum c || c `elem` ".-_:"
isNMSTART c = isAlphaNum c || c `elem` "_:"
doSpan :: (a -> Bool) -> (b -> c -> d) -> ([a] -> b) -> ([a] -> c) -> [a] -> d
doSpan pred k f g = sp f where
sp f' [] = k (f' []) (g [])
sp f' s@(c:cs)
| pred c = sp (f' . (c:)) cs
| otherwise = k (f' []) (g s)
drop1 :: [a] -> [a]
drop1 [] = []
drop1 (_:xs) = xs
data Delimiter =
WS String
| CDATA String
| GEREF Name
| STAGO
| ETAGO
| MDO
| MDOCOM
| MDODSO
| PIO
| PIC
| LEXERR String
| REST String
| NAME Name
| RNINAME Name
| PEREF Name
| LITERAL String
| TAGC
| VI
| EETAGC
| MDC
| DSO
| DSC
| MSC
| COM
| GRPO
| GRPC
| AND
| OR
| SEQ
| OPT
| REP
| PLUS
| MINUS
| PERO
deriving (Eq, Show)
pcdataMode, tagMode, markupMode :: String -> [Delimiter]
pcdataMode [] = []
pcdataMode ('<':s) = case s of
'!':'-':'-':r -> MDOCOM : comMode pcdataMode r
'!':'[':r -> msMode r
'!':r -> MDO : markupMode r
'/':r -> ETAGO : tagMode r
'?':r -> PIO : piMode pcdataMode r
']':']':'>':r -> MSC : pcdataMode r
r -> STAGO : tagMode r
pcdataMode ('&':'#':s) = doSpan (';'/=) (:) mkCREF (pcdataMode . drop1) s
where mkCREF = CDATA . return . chr . stringToInt 10
pcdataMode ('&':s) =
case span isNMCHAR s of
(ename,';':r) -> GEREF ename : pcdataMode r
(junk,r) -> LEXERR ("Bad entity reference " ++ junk)
: pcdataMode r
pcdataMode ('>':r) = LEXERR "Warning: %%% SKIPPING UNESCAPED '>'":pcdataMode r
pcdataMode (c:s)
| isSEPCHAR c = doSpan isSEPCHAR (:) (WS . (c:)) pcdataMode s
| otherwise = doSpan isDATACHAR (:) (CDATA . (c:)) pcdataMode s
where isDATACHAR ch =
case ch of '<' -> False; '&' ->False; _ -> True
tagMode [] = []
tagMode ('/':'>':r) = EETAGC : pcdataMode r
tagMode ('>':r) = TAGC : pcdataMode r
tagMode ('=':r) = VI : tagMode r
tagMode ('"':r) = doSpan ('"'/=) (:) LITERAL (tagMode . drop1) r
tagMode ('\'':r) = doSpan ('\''/=) (:) LITERAL (tagMode . drop1) r
tagMode ('<':'/':r) = ETAGO : tagMode r
tagMode ('<':r) = STAGO : tagMode r
tagMode cs@(c:s)
| isSEPCHAR c = tagMode (dropWhile isSEPCHAR s)
| isNMSTART c = doSpan isNMCHAR (:) NAME tagMode cs
| otherwise = LEXERR [c] : tagMode s
expandReferences :: DTD.EntityMap -> String -> String
expandReferences entities = expand where
expand s = case s of
[] -> []
'&':'#':'X':r -> doCharRef 16 expand r
'&':'#':r -> doCharRef 10 expand r
'&':r -> doEntityRef entities expand r
x:r -> x : expand r
doCharRef :: Int -> (String -> String) -> [Char] -> [Char]
doCharRef base k = doSpan (';'/=) (:) (chr . stringToInt base) (k . drop1)
stringToInt :: Int -> String -> Int
stringToInt base = foldl digit 0 . map (\x -> if isDigit x then digitToInt x else 0)
where digit num next = base*num + next
doEntityRef :: DTD.EntityMap -> (String -> String) -> String -> String
doEntityRef entities k r = doSpan (';'/=) (++) replacement (k . drop1) r where
replacement ename = case DTD.expandInternalEntity entities ename of
Just s -> s
_ -> ""
markupMode [] = []
markupMode ('%':s) = case span isNMCHAR s of
([], ' ':r) -> PERO : markupMode r
(ename,';':r) -> PEREF ename : markupMode r
(ename,r) -> LEXERR ("Bad parameter entity reference %" ++ ename)
: markupMode r
markupMode ('-':'-':r) = eatComment r
markupMode ('>':r) = MDC : markupMode r
markupMode ('"':r) = doSpan ('"'/=) (:) LITERAL (markupMode . drop1) r
markupMode ('\'':r) = doSpan ('\''/=) (:) LITERAL (markupMode . drop1) r
markupMode ('#':r) = doSpan isNMCHAR (:) (RNINAME . ('#':)) markupMode r
markupMode ('<':'!':'-':'-':r)
= MDOCOM : comMode markupMode r
markupMode ('<':'!':r) = MDO : markupMode r
markupMode ('<':'?':r) = PIO : piMode markupMode r
markupMode s@('<':_) = pcdataMode s
markupMode cs@(c:s)
| isSEPCHAR c = markupMode (dropWhile isSEPCHAR s)
| isNMSTART c = doSpan isNMCHAR (:) NAME markupMode cs
| otherwise = (case c of
'&' -> AND
'|' -> OR
',' -> SEQ
'?' -> OPT
'*' -> REP
'+' -> PLUS
'-' -> MINUS
'[' -> DSO
']' -> DSC
'(' -> GRPO
')' -> GRPC
_ -> LEXERR [c]) : markupMode s
msMode, cdataMode, eatComment :: String -> [Delimiter]
piMode, comMode, cdMode :: (String -> [Delimiter]) -> String -> [Delimiter]
msMode ('C':'D':'A':'T':'A':'[':rest) = cdataMode rest
msMode s =
let (ms, rest) = span ('['/=) s
in LEXERR ("Illegal marked section ["++ms) : pcdataMode (drop1 rest)
cdataMode (']':']':'>':rest) = pcdataMode rest
cdataMode ('<':'!':'[':rest) = error "Nested <![ in marked section"
cdataMode [] = []
cdataMode (c:cs) = doSpan spn (:) (CDATA . (c:)) cdataMode cs where
spn '\n' = False
spn ']' = False
spn '<' = False
spn _ = True
comMode prevMode cs = case cs of
[] -> []
'-':'-':r -> COM : cdMode prevMode r
(c:s) -> doSpan brk (:) (CDATA . (c:)) (comMode prevMode) s where
brk '-' = False
brk '\n' = False
brk _ = True
cdMode prevMode cs = case cs of
[] -> []
'>':r -> MDC : prevMode r
'-':'-':r -> COM : comMode prevMode r
c:s -> if isSEPCHAR c
then cdMode prevMode (dropWhile isSEPCHAR s)
else LEXERR [c] : cdMode prevMode s
eatComment cs = case cs of
[] -> []
'-':'-':r -> markupMode r
(_:r) -> eatComment r
piMode prevMode cs = case cs of
[] -> []
'?':'>':r -> PIC : prevMode r
(c:s) -> doSpan ('?'/=) (:) (CDATA . (c:)) (piMode prevMode) s