module Text.XML.Enumerator.Parse
(
parseBytes
, parseText
, detectUtf
, SEvent (..)
, simplify
, SAttr
, parseFile
, parseFile_
, tag
, tag'
, tag''
, content
, content'
, AttrParser
, requireAttr
, optionalAttr
, requireAttrRaw
, optionalAttrRaw
, ignoreAttrs
, choose
, many
, force
, XmlException (..)
) where
import Data.Attoparsec.Text
( char, Parser, takeWhile1, skipWhile, string
, manyTill, takeWhile, try, anyChar, endOfInput, hexadecimal, decimal
)
import qualified Data.Attoparsec.Text as A
import Data.Attoparsec.Text.Enumerator (iterParser)
import Data.XML.Types
( Name (..), Event (..), Content (..), Attribute (..)
, Doctype (..), Instruction (..), ExternalID (..)
)
import Control.Applicative ((<|>), (<$>))
import Data.Text.Lazy (pack, Text)
import qualified Data.Text.Lazy as T
import Text.XML.Enumerator.Token
import Prelude hiding (takeWhile)
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import qualified Data.Map as Map
import Data.Enumerator (Iteratee, Enumeratee, (>>==), Stream (..),
checkDone, yield, ($$), joinI, run, throwError)
import qualified Data.Enumerator as E
import qualified Data.Enumerator.List as EL
import qualified Data.Enumerator.Text as ET
import qualified Data.Enumerator.Binary as EB
import Control.Monad (unless, ap, liftM)
import qualified Data.Text as TS
import Data.List (foldl')
import Control.Applicative (Applicative (..))
import Data.Typeable (Typeable)
import Control.Exception (Exception, throwIO, SomeException)
import Data.Enumerator.Binary (enumFile)
import Control.Monad.IO.Class (liftIO)
import Data.Char (isSpace)
tokenToEvent :: [NSLevel] -> Token -> ([NSLevel], [Event])
tokenToEvent n (TokenBeginDocument _) = (n, [])
tokenToEvent n (TokenInstruction i) = (n, [EventInstruction i])
tokenToEvent n (TokenBeginElement name as isClosed) =
(n', if isClosed then [begin, end] else [begin])
where
l0 = case n of
[] -> NSLevel Nothing Map.empty
x:_ -> x
(as', l') = foldl' go (id, l0) as
go (front, l) a@(TName kpref kname, val)
| kpref == Just "xmlns" =
(front, l { prefixes = Map.insert kname (contentsToText val)
$ prefixes l })
| kpref == Nothing && kname == "xmlns" =
(front, l { defaultNS = if T.null $ contentsToText val
then Nothing
else Just $ contentsToText val })
| otherwise = (front . (:) a, l)
n' = if isClosed then n else l' : n
contentsToText = T.concat . map helper
helper (ContentText t) = t
helper (ContentEntity _) = T.empty
fixAttName level (name', val) = Attribute (tnameToName True level name') val
begin = EventBeginElement (tnameToName False l' name) $ map (fixAttName l') $ as' []
end = EventEndElement $ tnameToName False l' name
tokenToEvent n (TokenEndElement name) =
(n', [EventEndElement $ tnameToName False l name])
where
(l, n') =
case n of
[] -> (NSLevel Nothing Map.empty, [])
x:xs -> (x, xs)
tokenToEvent n (TokenContent c) = (n, [EventContent c])
tokenToEvent n (TokenComment c) = (n, [EventComment c])
tokenToEvent n (TokenDoctype t eid) = (n, [EventDoctype $ Doctype t eid []])
tnameToName :: Bool -> NSLevel -> TName -> Name
tnameToName _ _ (TName (Just "xml") name) =
Name name (Just "http://www.w3.org/XML/1998/namespace") (Just "xml")
tnameToName isAttr (NSLevel def _) (TName Nothing name) =
Name name (if isAttr then Nothing else def) Nothing
tnameToName _ (NSLevel _ m) (TName (Just pref) name) =
case Map.lookup pref m of
Just ns -> Name name (Just ns) (Just pref)
Nothing -> Name name Nothing (Just pref)
detectUtf :: Monad m => Enumeratee S.ByteString TS.Text m a
detectUtf step = do
x <- EB.take 4
let (toDrop, codec) =
case L.unpack x of
[0x00, 0x00, 0xFE, 0xFF] -> (4, ET.utf32_be)
[0xFF, 0xFE, 0x00, 0x00] -> (4, ET.utf32_le)
0xFE : 0xFF: _ -> (2, ET.utf16_be)
0xFF : 0xFE: _ -> (2, ET.utf16_le)
0xEF : 0xBB: 0xBF : _ -> (3, ET.utf8)
[0x00, 0x00, 0x00, 0x3C] -> (0, ET.utf32_be)
[0x3C, 0x00, 0x00, 0x00] -> (0, ET.utf32_le)
[0x00, 0x3C, 0x00, 0x3F] -> (0, ET.utf16_be)
[0x3C, 0x00, 0x3F, 0x00] -> (0, ET.utf16_le)
_ -> (0, ET.utf8)
unless (toDrop == 4) $ yield () $ Chunks $ L.toChunks $ L.drop toDrop x
ET.decode codec step
parseBytes :: Monad m => Enumeratee S.ByteString Event m a
parseBytes step = joinI $ detectUtf $$ parseText step
parseText :: Monad m => Enumeratee TS.Text Event m a
parseText =
checkDone $ \k -> k (Chunks [EventBeginDocument]) >>== loop []
where
loop levels = checkDone $ go levels
go levels k = do
mtoken <- iterToken
case mtoken of
Nothing -> k (Chunks [EventEndDocument]) >>== return
Just token ->
let (levels', events) = tokenToEvent levels token
in k (Chunks events) >>== loop levels'
iterToken :: Monad m => Iteratee TS.Text m (Maybe Token)
iterToken = iterParser ((endOfInput >> return Nothing) <|> fmap Just parseToken)
parseToken :: Parser Token
parseToken = do
(char '<' >> parseLt) <|> fmap TokenContent (parseContent False False)
where
parseLt =
(char '?' >> parseInstr) <|>
(char '!' >> (parseComment <|> parseCdata <|> parseDoctype)) <|>
(char '/' >> parseEnd) <|>
parseBegin
parseInstr = do
name <- parseIdent
if name == "xml"
then do
as <- A.many parseAttribute
skipSpace
char' '?'
char' '>'
newline <|> return ()
return $ TokenBeginDocument as
else do
skipSpace
x <- T.pack <$> manyTill anyChar (try $ string "?>")
return $ TokenInstruction $ Instruction name x
parseComment = do
char' '-'
char' '-'
c <- T.pack <$> manyTill anyChar (string "-->")
return $ TokenComment c
parseCdata = do
_ <- string "[CDATA["
t <- T.pack <$> manyTill anyChar (string "]]>")
return $ TokenContent $ ContentText t
parseDoctype = do
_ <- string "DOCTYPE"
skipSpace
i <- parseIdent
skipSpace
eid <- fmap Just parsePublicID <|>
fmap Just parseSystemID <|>
return Nothing
skipSpace
(do
char' '['
skipWhile (/= ']')
char' ']'
skipSpace) <|> return ()
char' '>'
newline
return $ TokenDoctype i eid
parsePublicID = do
_ <- string "PUBLIC"
x <- quotedText
y <- quotedText
return $ PublicID x y
parseSystemID = do
_ <- string "SYSTEM"
x <- quotedText
return $ SystemID x
quotedText = do
skipSpace
T.fromChunks . return <$> (between '"' <|> between '\'')
between c = do
char' c
x <- takeWhile (/=c)
char' c
return x
parseEnd = do
skipSpace
n <- parseName
skipSpace
char' '>'
return $ TokenEndElement n
parseBegin = do
skipSpace
n <- parseName
as <- A.many parseAttribute
skipSpace
isClose <- (char '/' >> skipSpace >> return True) <|> return False
char' '>'
return $ TokenBeginElement n as isClose
parseAttribute :: Parser TAttribute
parseAttribute = do
skipSpace
key <- parseName
skipSpace
char' '='
skipSpace
val <- squoted <|> dquoted
return (key, val)
where
squoted = do
char' '\''
manyTill (parseContent False True) (char '\'')
dquoted = do
char' '"'
manyTill (parseContent True False) (char '"')
parseName :: Parser TName
parseName = do
i1 <- parseIdent
mi2 <- (char ':' >> fmap Just parseIdent) <|> return Nothing
return $
case mi2 of
Nothing -> TName Nothing i1
Just i2 -> TName (Just i1) i2
parseIdent :: Parser Text
parseIdent =
T.fromChunks . return <$> takeWhile1 valid
where
valid '&' = False
valid '<' = False
valid '>' = False
valid ':' = False
valid '?' = False
valid '=' = False
valid '"' = False
valid '\'' = False
valid '/' = False
valid c = not $ isSpace c
parseContent :: Bool
-> Bool
-> Parser Content
parseContent breakDouble breakSingle =
parseEntity <|> parseText'
where
parseEntity = do
char' '&'
parseEntityNum <|> parseEntityWord
parseEntityNum = do
char' '#'
w <- parseEntityHex <|> parseEntityDig
return $ ContentText $ pack [toEnum w]
parseEntityHex = do
char' 'x'
res <- hexadecimal
char' ';'
return res
parseEntityDig = do
res <- decimal
char' ';'
return res
parseEntityWord = do
s <- takeWhile1 (/= ';')
char' ';'
return $ case s of
_
| s == "amp" -> ContentText "&"
| s == "gt" -> ContentText ">"
| s == "lt" -> ContentText "<"
| s == "apos" -> ContentText "'"
| s == "quot" -> ContentText "\""
| otherwise ->
ContentEntity $ T.fromChunks [s]
parseText' = do
bs <- takeWhile1 valid
return $ ContentText $ T.fromChunks [bs]
valid '"' = not breakDouble
valid '\'' = not breakSingle
valid '&' = False
valid '<' = False
valid _ = True
skipSpace :: Parser ()
skipSpace = skipWhile isSpace
newline :: Parser ()
newline = ((char '\r' >> char '\n') <|> char '\n') >> return ()
char' :: Char -> Parser ()
char' c = char c >> return ()
type SAttr = (Name, Text)
data SEvent = SBeginElement Name [SAttr]
| SEndElement
| SContent Text
deriving (Show, Eq)
content :: Monad m => Iteratee SEvent m (Maybe Text)
content = do
x <- E.peek
case x of
Just (SContent t) -> EL.drop 1 >> return (Just t)
_ -> return Nothing
content' :: Monad m => Iteratee SEvent m Text
content' = do
x <- content
case x of
Nothing -> return T.empty
Just y -> return y
tag :: Monad m
=> (Name -> Maybe a)
-> (a -> AttrParser b)
-> (b -> Iteratee SEvent m c)
-> Iteratee SEvent m (Maybe c)
tag checkName attrParser f = do
x <- dropWS
case x of
Just (SBeginElement name as) ->
case checkName name of
Just y ->
case runAttrParser' (attrParser y) as of
Left e -> throwError e
Right z -> do
EL.drop 1
z' <- f z
a <- dropWS
case a of
Just SEndElement -> EL.drop 1 >> return (Just z')
_ -> throwError $ SXmlException ("Expected end tag for: " ++ show name) a
Nothing -> return Nothing
_ -> return Nothing
where
dropWS = do
x <- E.peek
case x of
Just (SContent t)
| T.all isSpace t -> EL.drop 1 >> E.peek
_ -> return x
runAttrParser' p as =
case runAttrParser p as of
Left e -> Left e
Right ([], x) -> Right x
Right (attr, _) -> Left $ UnparsedAttributes attr
tag' :: Monad m
=> Name
-> AttrParser a
-> (a -> Iteratee SEvent m b)
-> Iteratee SEvent m (Maybe b)
tag' name attrParser = tag
(\x -> if x == name then Just () else Nothing)
(const attrParser)
tag'' :: Monad m => Name -> Iteratee SEvent m a -> Iteratee SEvent m (Maybe a)
tag'' name f = tag' name (return ()) $ const f
choose :: Monad m
=> [Iteratee SEvent m (Maybe a)]
-> Iteratee SEvent m (Maybe a)
choose [] = return Nothing
choose (i:is) = do
x <- i
case x of
Nothing -> choose is
Just a -> return $ Just a
force :: Monad m
=> String
-> Iteratee SEvent m (Maybe a)
-> Iteratee SEvent m a
force msg i = do
x <- i
case x of
Nothing -> throwError $ XmlException msg Nothing
Just a -> return a
simplify :: Monad m => (Text -> Maybe Text) -> Enumeratee Event SEvent m b
simplify renderEntity =
loop []
where
loop stack = E.checkDone $ go stack
sattr (Attribute x y) = do
y' <- flip mapM y $ \z ->
case z of
ContentText t -> return t
ContentEntity t ->
case renderEntity t of
Just t' -> return t'
Nothing -> throwError $ InvalidEntity t
return (x, T.concat y')
go stack k = do
x <- EL.head
case x of
Nothing -> k EOF >>== return
Just EventBeginDocument -> go stack k
Just EventEndDocument ->
k EOF >>== return
Just EventInstruction{} -> go stack k
Just EventDoctype{} -> go stack k
Just (EventBeginElement n as) -> do
as' <- mapM sattr as
k (Chunks [SBeginElement n as']) >>== loop (n : stack)
Just (EventEndElement n) ->
case stack of
[] -> throwError $ InvalidEndElement n
n':rest
| n == n' -> k (Chunks [SEndElement]) >>== loop rest
| otherwise -> throwError $ InvalidEndElement n
Just (EventContent c) -> do
t <- contentToText c
ts <- takeContents $ (:) t
k (Chunks [SContent $ T.concat $ ts []]) >>== loop stack
Just EventComment{} -> go stack k
where
contentToText (ContentEntity e) =
case renderEntity e of
Nothing -> throwError $ InvalidEntity e
Just t -> return t
contentToText (ContentText t) = return t
takeContents front = do
x <- E.peek
case x of
Nothing -> return front
Just EventBeginElement{} -> return front
Just EventEndElement{} -> return front
Just (EventContent c) -> do
EL.drop 1
t <- contentToText c
takeContents $ front . (:) t
Just EventBeginDocument -> helper
Just EventEndDocument -> helper
Just EventInstruction{} -> helper
Just EventDoctype{} -> helper
Just EventComment{} -> helper
where
helper = EL.drop 1 >> takeContents front
parseFile_ :: String -> (Text -> Maybe Text) -> Iteratee SEvent IO a -> IO a
parseFile_ fn re p =
parseFile fn re p >>= go
where
go (Left e) = liftIO $ throwIO e
go (Right a) = return a
parseFile :: String -> (Text -> Maybe Text) -> Iteratee SEvent IO a -> IO (Either SomeException a)
parseFile fn re p =
run $ enumFile fn $$ joinI
$ parseBytes $$ joinI
$ simplify re $$ p
data XmlException = XmlException
{ xmlErrorMessage :: String
, xmlBadInput :: Maybe Event
}
| InvalidEndElement Name
| InvalidEntity Text
| SXmlException
{ xmlErrorMessage :: String
, sxmlBadInput :: Maybe SEvent
}
| UnparsedAttributes [SAttr]
deriving (Show, Typeable)
instance Exception XmlException
newtype AttrParser a = AttrParser { runAttrParser :: [SAttr] -> Either XmlException ([SAttr], a) }
instance Monad AttrParser where
return a = AttrParser $ \as -> Right (as, a)
(AttrParser f) >>= g = AttrParser $ \as ->
case f as of
Left e -> Left e
Right (as', f') -> runAttrParser (g f') as'
instance Functor AttrParser where
fmap = liftM
instance Applicative AttrParser where
pure = return
(<*>) = ap
optionalAttrRaw :: (SAttr -> Maybe b) -> AttrParser (Maybe b)
optionalAttrRaw f =
AttrParser $ go id
where
go front [] = Right (front [], Nothing)
go front (a:as) =
case f a of
Nothing -> go (front . (:) a) as
Just b -> Right (front as, Just b)
requireAttrRaw :: String -> (SAttr -> Maybe b) -> AttrParser b
requireAttrRaw msg f = do
x <- optionalAttrRaw f
case x of
Just b -> return b
Nothing -> AttrParser $ const $ Left $ XmlException msg Nothing
requireAttr :: Name -> AttrParser Text
requireAttr n = requireAttrRaw
("Missing attribute: " ++ show n)
(\(x, y) -> if x == n then Just y else Nothing)
optionalAttr :: Name -> AttrParser (Maybe Text)
optionalAttr n = optionalAttrRaw
(\(x, y) -> if x == n then Just y else Nothing)
ignoreAttrs :: AttrParser ()
ignoreAttrs = AttrParser $ \_ -> Right ([], ())
many :: Monad m => Iteratee SEvent m (Maybe a) -> Iteratee SEvent m [a]
many i =
go id
where
go front = do
x <- i
case x of
Nothing -> return $ front []
Just y -> go $ front . (:) y