module Text.XML.Expat.Tree (
Node(..),
Nodes,
Attributes,
UNode,
UNodes,
UAttributes,
extractText,
parseTree,
parseTree',
Encoding(..),
XMLParseError(..),
XMLParseLocation(..),
parseSAX,
SAXEvent(..),
saxToTree,
parseSAXLocations,
XMLParseException(..),
parseSAXThrowing,
parseTreeThrowing,
GenericXMLString(..)
) where
import Text.XML.Expat.IO
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Internal as I
import Data.IORef
import System.IO.Unsafe (unsafePerformIO)
import Data.ByteString.Internal (c2w, w2c, c_strlen)
import qualified Data.Monoid as M
import qualified Data.Text as T
import qualified Data.Text.Encoding as TE
import qualified Codec.Binary.UTF8.String as U8
import Data.Monoid
import Data.Typeable
import Control.Exception.Extensible as Exc
import Control.Applicative
import Control.Concurrent
import Control.Concurrent.MVar
import Control.Parallel.Strategies
import Control.Monad
import System.IO.Unsafe
import System.Mem.Weak
import Foreign.C.String
import Foreign.Ptr
class (M.Monoid s, Eq s) => GenericXMLString s where
gxNullString :: s -> Bool
gxToString :: s -> String
gxFromString :: String -> s
gxFromChar :: Char -> s
gxHead :: s -> Char
gxTail :: s -> s
gxBreakOn :: Char -> s -> (s, s)
gxFromCStringLen :: CStringLen -> IO s
gxToByteString :: s -> B.ByteString
instance GenericXMLString String where
gxNullString = null
gxToString = id
gxFromString = id
gxFromChar c = [c]
gxHead = head
gxTail = tail
gxBreakOn c = break (==c)
gxFromCStringLen cstr = U8.decodeString <$> peekCStringLen cstr
gxToByteString = B.pack . map c2w . U8.encodeString
instance GenericXMLString B.ByteString where
gxNullString = B.null
gxToString = U8.decodeString . map w2c . B.unpack
gxFromString = B.pack . map c2w . U8.encodeString
gxFromChar = B.singleton . c2w
gxHead = w2c . B.head
gxTail = B.tail
gxBreakOn c = B.break (== c2w c)
gxFromCStringLen = peekByteStringLen
gxToByteString = id
instance GenericXMLString T.Text where
gxNullString = T.null
gxToString = T.unpack
gxFromString = T.pack
gxFromChar = T.singleton
gxHead = T.head
gxTail = T.tail
gxBreakOn c = T.break (==c)
gxFromCStringLen cstr = TE.decodeUtf8 <$> peekByteStringLen cstr
gxToByteString = TE.encodeUtf8
peekByteStringLen :: CStringLen -> IO B.ByteString
peekByteStringLen (cstr, len) =
I.create (fromIntegral len) $ \ptr ->
I.memcpy ptr (castPtr cstr) (fromIntegral len)
data Node tag text =
Element {
eName :: !tag,
eAttrs :: ![(tag,text)],
eChildren :: [Node tag text]
} |
Text !text
deriving (Eq, Show)
instance (NFData tag, NFData text) => NFData (Node tag text) where
rnf (Element nam att chi) = rnf (nam, att, chi)
rnf (Text txt) = rnf txt
type Attributes tag text = [(tag, text)]
type Nodes tag text = [Node tag text]
type UNodes text = Nodes text text
type UNode text = Node text text
type UAttributes text = Attributes text text
extractText :: Monoid text => Node tag text -> text
extractText (Element _ _ children) = mconcat $ map extractText children
extractText (Text txt) = txt
modifyChildren :: ([Node tag text] -> [Node tag text])
-> Node tag text
-> Node tag text
modifyChildren f node = node { eChildren = f (eChildren node) }
mkText :: GenericXMLString text => CString -> IO text
mkText cstr = do
len <- c_strlen cstr
gxFromCStringLen (cstr, fromIntegral len)
parseTree' :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> B.ByteString
-> Either XMLParseError (Node tag text)
parseTree' enc doc = unsafePerformIO $ runParse where
runParse = do
parser <- newParser enc
let emptyString = gxFromString ""
stack <- newIORef [Element emptyString [] []]
setStartElementHandler parser $ \cName cAttrs -> do
name <- mkText cName
attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do
attrName <- mkText cAttrName
attrValue <- mkText cAttrValue
return (attrName, attrValue)
modifyIORef stack (start name attrs)
return True
setEndElementHandler parser $ \cName -> do
modifyIORef stack end
return True
setCharacterDataHandler parser $ \cText -> do
txt <- gxFromCStringLen cText
modifyIORef stack (text txt)
return True
mError <- parse' parser doc
case mError of
Just error -> return $ Left error
Nothing -> do
[Element _ _ [root]] <- readIORef stack
return $ Right root
start name attrs stack = Element name attrs [] : stack
text str (cur:rest) = modifyChildren (Text str:) cur : rest
end (cur:parent:rest) =
let node = modifyChildren reverse cur in
modifyChildren (node:) parent : rest
data SAXEvent tag text =
StartElement tag [(tag, text)] |
EndElement tag |
CharacterData text |
FailDocument XMLParseError
deriving (Eq, Show)
instance (NFData tag, NFData text) => NFData (SAXEvent tag text) where
rnf (StartElement tag atts) = rnf (tag, atts)
rnf (EndElement tag) = rnf tag
rnf (CharacterData text) = rnf text
rnf (FailDocument err) = rnf err
parseSAX :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [SAXEvent tag text]
parseSAX enc input = unsafePerformIO $ do
parser <- newParser enc
queueRef <- newIORef []
setStartElementHandler parser $ \cName cAttrs -> do
name <- mkText cName
attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do
attrName <- mkText cAttrName
attrValue <- mkText cAttrValue
return (attrName, attrValue)
modifyIORef queueRef (StartElement name attrs:)
return True
setEndElementHandler parser $ \cName -> do
name <- mkText cName
modifyIORef queueRef (EndElement name:)
return True
setCharacterDataHandler parser $ \cText -> do
txt <- gxFromCStringLen cText
modifyIORef queueRef (CharacterData txt:)
return True
let runParser input = unsafeInterleaveIO $ do
rem <- case input of
(c:cs) -> do
mError <- parseChunk parser c False
case mError of
Just error -> return [FailDocument error]
Nothing -> runParser cs
[] -> do
mError <- parseChunk parser B.empty True
case mError of
Just error -> return [FailDocument error]
Nothing -> return []
queue <- readIORef queueRef
writeIORef queueRef []
return $ reverse queue ++ rem
runParser $ L.toChunks input
data XMLParseException = XMLParseException XMLParseError
deriving (Eq, Show, Typeable)
instance Exception XMLParseException where
parseSAXThrowing :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [SAXEvent tag text]
parseSAXThrowing mEnc bs = map freakOut $ parseSAX mEnc bs
where
freakOut (FailDocument err) = Exc.throw $ XMLParseException err
freakOut other = other
parseSAXLocations :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> [(SAXEvent tag text, XMLParseLocation)]
parseSAXLocations enc input = unsafePerformIO $ do
parser <- newParser enc
queueRef <- newIORef []
setStartElementHandler parser $ \cName cAttrs -> do
name <- mkText cName
attrs <- forM cAttrs $ \(cAttrName,cAttrValue) -> do
attrName <- mkText cAttrName
attrValue <- mkText cAttrValue
return (attrName, attrValue)
loc <- getParseLocation parser
modifyIORef queueRef ((StartElement name attrs,loc):)
return True
setEndElementHandler parser $ \cName -> do
name <- mkText cName
loc <- getParseLocation parser
modifyIORef queueRef ((EndElement name, loc):)
return True
setCharacterDataHandler parser $ \cText -> do
txt <- gxFromCStringLen cText
loc <- getParseLocation parser
modifyIORef queueRef ((CharacterData txt, loc):)
return True
let runParser [] = return []
runParser (c:cs) = unsafeInterleaveIO $ do
mError <- parseChunk parser c (null cs)
queue <- readIORef queueRef
writeIORef queueRef []
rem <- case mError of
Just error -> do
loc <- getParseLocation parser
return [(FailDocument error, loc)]
Nothing -> runParser cs
return $ reverse queue ++ rem
runParser $ L.toChunks input
saxToTree :: GenericXMLString tag =>
[SAXEvent tag text]
-> (Node tag text, Maybe XMLParseError)
saxToTree events =
let (nodes, mError, _) = ptl events
in (safeHead nodes, mError)
where
safeHead (a:_) = a
safeHead [] = Element (gxFromString "") [] []
ptl (StartElement name attrs:rem) =
let (children, err1, rem') = ptl rem
elt = Element name attrs children
(out, err2, rem'') = ptl rem'
in (elt:out, err1 `mplus` err2, rem'')
ptl (EndElement name:rem) = ([], Nothing, rem)
ptl (CharacterData txt:rem) =
let (out, err, rem') = ptl rem
in (Text txt:out, err, rem')
ptl (FailDocument err:_) = ([], Just err, [])
ptl [] = ([], Nothing, [])
parseTree :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> (Node tag text, Maybe XMLParseError)
parseTree mEnc bs = saxToTree $ parseSAX mEnc bs
parseTreeThrowing :: (GenericXMLString tag, GenericXMLString text) =>
Maybe Encoding
-> L.ByteString
-> Node tag text
parseTreeThrowing mEnc bs = fst $ saxToTree $ parseSAXThrowing mEnc bs