module Text.XML.HXT.Arrow.ProcessDocument
( parseXmlDocument
, parseHtmlDocument
, validateDocument
, propagateAndValidateNamespaces
, andValidateNamespaces
, getDocumentContents
)
where
import Control.Arrow
import Control.Arrow.ArrowList
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowTree
import Control.Arrow.ListArrow ( fromLA )
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.Arrow.XmlState
import Text.XML.HXT.Arrow.XmlState.TypeDefs
import Text.XML.HXT.Arrow.ParserInterface ( parseXmlDoc
, parseHtmlDoc
, substHtmlEntityRefs
)
import Text.XML.HXT.Arrow.Edit ( transfAllCharRef
)
import Text.XML.HXT.Arrow.GeneralEntitySubstitution
( processGeneralEntities
)
import Text.XML.HXT.Arrow.DTDProcessing ( processDTD
)
import Text.XML.HXT.Arrow.DocumentInput ( getXmlContents
)
import Text.XML.HXT.Arrow.Namespace ( propagateNamespaces
, validateNamespaces
)
import Text.XML.HXT.DTDValidation.Validation ( validate
, getDTDSubset
, transform
)
parseXmlDocument :: Bool -> IOStateArrow s XmlTree XmlTree
parseXmlDocument validate'
= ( replaceChildren ( ( getAttrValue a_source
&&&
xshow getChildren
)
>>>
parseXmlDoc
>>>
filterErrorMsg
)
>>>
setDocumentStatusFromSystemState "parse XML document"
>>>
processDTD
>>>
processGeneralEntities
>>>
transfAllCharRef
>>>
( if validate'
then validateDocument
else this
)
)
`when` documentStatusOk
parseHtmlDocument :: IOStateArrow s XmlTree XmlTree
parseHtmlDocument
= ( perform ( getAttrValue a_source >>> traceValue 1 (("parseHtmlDoc: parse HTML document " ++) . show) )
>>>
( parseHtml $< getSysVar theTagSoup )
>>>
( removeWarnings $< getSysVar (theWarnings .&&&. theTagSoup )
)
>>>
setDocumentStatusFromSystemState "parse HTML document"
>>>
traceTree
>>>
traceSource
>>>
perform ( getAttrValue a_source
>>>
traceValue 1 (\ src -> "parse HTML document " ++ show src ++ " finished")
)
)
`when` documentStatusOk
where
parseHtml withTagSoup'
| withTagSoup' = withoutUserState $< getSysVar theTagSoupParser
| otherwise = traceMsg 1 ("parse document with parsec HTML parser")
>>>
replaceChildren
( ( getAttrValue a_source
&&&
xshow getChildren
)
>>>
parseHtmlDoc
)
>>>
substHtmlEntityRefs
removeWarnings (warnings, withTagSoup')
| withTagSoup'
&&
not warnings = this
| otherwise = processTopDownWithAttrl
( if warnings
then filterErrorMsg
else ( none
`when`
isError
)
)
validateDocument :: IOStateArrow s XmlTree XmlTree
validateDocument
= ( traceMsg 1 "validating document"
>>>
perform ( validateDoc
>>>
filterErrorMsg
)
>>>
setDocumentStatusFromSystemState "document validation"
>>>
traceMsg 1 "document validated, transforming doc with respect to DTD"
>>>
transformDoc
>>>
traceMsg 1 "document transformed"
>>>
traceSource
>>>
traceTree
)
`when`
documentStatusOk
propagateAndValidateNamespaces :: IOStateArrow s XmlTree XmlTree
propagateAndValidateNamespaces
= ( traceMsg 1 "propagating namespaces"
>>>
propagateNamespaces
>>>
traceDoc "propagating namespaces done"
>>>
andValidateNamespaces
)
`when`
documentStatusOk
andValidateNamespaces :: IOStateArrow s XmlTree XmlTree
andValidateNamespaces
= ( traceMsg 1 "validating namespaces"
>>>
( setDocumentStatusFromSystemState "namespace propagation"
`when`
( validateNamespaces >>> perform filterErrorMsg )
)
>>>
traceMsg 1 "namespace validation finished"
)
`when`
documentStatusOk
getDocumentContents :: String -> IOStateArrow s b XmlTree
getDocumentContents src
= root [] []
>>>
addAttr a_source src
>>>
traceMsg 1 ("readDocument: start processing document " ++ show src)
>>>
getXmlContents
validateDoc :: ArrowList a => a XmlTree XmlTree
validateDoc = fromLA ( validate
`when`
getDTDSubset
)
transformDoc :: ArrowList a => a XmlTree XmlTree
transformDoc = fromLA transform