module Text.XML.HXT.Arrow.DocumentInput
( getURIContents
, getXmlContents
, getXmlEntityContents
, getEncoding
, decodeDocument
)
where
import Control.Arrow
import Control.Arrow.ArrowList
import Control.Arrow.ArrowIf
import Control.Arrow.ArrowTree
import Control.Arrow.ArrowIO
import Text.XML.HXT.DOM.Unicode
( getDecodingFct
, guessEncoding
, normalizeNL
)
import qualified Text.XML.HXT.IO.GetFILE as FILE
import qualified Text.XML.HXT.IO.GetHTTPNative as HTTP
import qualified Text.XML.HXT.IO.GetHTTPCurl as CURL
import Text.XML.HXT.DOM.Interface
import Text.XML.HXT.Arrow.XmlArrow
import Text.XML.HXT.Arrow.XmlIOStateArrow
import Text.XML.HXT.Arrow.ParserInterface
( parseXmlDocEncodingSpec
, parseXmlEntityEncodingSpec
, removeEncodingSpec
)
import Data.List
( isPrefixOf
)
import Data.Char
( toLower
)
import Data.Maybe
protocolHandlers :: AssocList String (IOStateArrow s XmlTree XmlTree)
protocolHandlers
= [ ("file", getFileContents)
, ("http", getHttpContents)
, ("stdin", getStdinContents)
]
getProtocolHandler :: IOStateArrow s String (IOStateArrow s XmlTree XmlTree)
getProtocolHandler
= arr (\ s -> lookupDef getUnsupported s protocolHandlers)
getUnsupported :: IOStateArrow s XmlTree XmlTree
getUnsupported
= perform ( getAttrValue a_source
>>>
arr (("unsupported protocol in URI " ++) . show)
>>>
applyA (arr issueFatal)
)
>>>
setDocumentStatusFromSystemState "accessing documents"
getStringContents :: IOStateArrow s XmlTree XmlTree
getStringContents
= setCont $< getAttrValue a_source
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
where
setCont contents
= replaceChildren (txt contents')
>>>
addAttr transferURI (take 7 contents)
>>>
addAttr a_source (show . prefix 48 $ contents')
where
contents' = drop (length stringProtocol) contents
prefix l s
| length s' > l = take (l 3) s' ++ "..."
| otherwise = s'
where
s' = take (l + 1) s
getFileContents :: IOStateArrow s XmlTree XmlTree
getFileContents
= applyA ( getAttrValue transferURI
>>>
getPathFromURI
>>>
arrIO FILE.getCont
>>>
( arr addError
|||
arr addTxtContent
)
)
getStdinContents :: IOStateArrow s XmlTree XmlTree
getStdinContents
= applyA ( arrIO (\ _ -> FILE.getStdinCont)
>>>
( arr addError
|||
arr addTxtContent
)
)
addError :: String -> IOStateArrow s XmlTree XmlTree
addError e
= issueFatal e
>>>
setDocumentStatusFromSystemState "accessing documents"
addTxtContent :: String -> IOStateArrow s XmlTree XmlTree
addTxtContent c
= replaceChildren (txt c)
>>>
addAttr transferMessage "OK"
>>>
addAttr transferStatus "200"
getHttpContents :: IOStateArrow s XmlTree XmlTree
getHttpContents
= getCont $<<<< ( getAttrValue transferURI
&&&
getOpt a_proxy
&&&
getOpt a_use_curl
&&&
getOpt a_options_curl
)
where
getOpt opt
= getAttrValue0 opt
`orElse`
getParamString opt
getCont uri proxy curl curlOpt
= applyA ( ( if curl == v_1
then ( traceMsg 2 ( "get HTTP via "
++ show ( "curl "
++ (if null proxy then "" else "--proxy " ++ proxy)
++ (if null curlOpt then "" else " " ++ curlOpt)
++ " " ++ uri
)
)
>>>
arrIO0 ( CURL.getCont curlOpt uri proxy )
)
else arrIO0 ( HTTP.getCont uri proxy )
)
>>>
( arr addError
|||
arr addContent
)
)
addContent :: (AssocList String String, String) -> IOStateArrow s XmlTree XmlTree
addContent (al, c)
= replaceChildren (txt c)
>>>
seqA (map (uncurry addAttr) al)
getURIContents :: IOStateArrow s XmlTree XmlTree
getURIContents
= getContentsFromString
`orElse`
getContentsFromDoc
where
getContentsFromString
= ( getAttrValue a_source
>>>
isA (isPrefixOf stringProtocol)
)
`guards`
getStringContents
getContentsFromDoc
= ( ( addTransferURI $< getBaseURI
>>>
getCont
)
`when`
( setAbsURI $< ( getAttrValue a_source
>>^
( \ src-> (if null src then "stdin:" else src) )
)
)
)
>>>
setDocumentStatusFromSystemState "getURIContents"
setAbsURI src
= ifA ( constA src >>> changeBaseURI )
this
( issueFatal ("illegal URI : " ++ show src) )
addTransferURI uri
= addAttr transferURI uri
getCont
= applyA ( getBaseURI
>>>
traceString 2 (("getURIContents: reading" ++) . show)
>>>
getSchemeFromURI
>>>
getProtocolHandler
)
`orElse`
this
setBaseURIFromDoc :: IOStateArrow s XmlTree XmlTree
setBaseURIFromDoc
= perform ( getAttrValue transferURI
>>>
isA (isPrefixOf stringProtocol)
>>>
setBaseURI
)
getXmlContents :: IOStateArrow s XmlTree XmlTree
getXmlContents
= getXmlContents' parseXmlDocEncodingSpec
>>>
setBaseURIFromDoc
getXmlEntityContents :: IOStateArrow s XmlTree XmlTree
getXmlEntityContents
= getXmlContents' parseXmlEntityEncodingSpec
>>>
processChildren removeEncodingSpec
>>>
setBaseURIFromDoc
getXmlContents' :: IOStateArrow s XmlTree XmlTree -> IOStateArrow s XmlTree XmlTree
getXmlContents' parseEncodingSpec
= ( getURIContents
>>>
parseEncodingSpec
>>>
filterErrorMsg
>>>
decodeDocument
>>>
perform ( getAttrValue transferURI
>>>
traceString 1 (("getXmlContents: content read and decoded for " ++) . show)
)
>>>
traceTree
>>>
traceSource
)
`when`
isRoot
getEncoding :: IOStateArrow s XmlTree String
getEncoding
= catA [ xshow getChildren
>>>
arr guessEncoding
, getAttrValue transferEncoding
, getAttrValue a_encoding
, getParamString a_encoding
, constA utf8
]
>. (head . filter (not . null))
decodeDocument :: IOStateArrow s XmlTree XmlTree
decodeDocument
= ( decodeArr $< getEncoding )
`when`
( isRoot >>> isXmlOrHtmlDoc )
where
isXmlOrHtmlDoc
= ( getAttrValue transferMimeType >>^ map toLower )
>>>
isA (\ t -> null t || isXmlMimeType t || isHtmlMimeType t)
decodeArr :: String -> IOStateArrow s XmlTree XmlTree
decodeArr enc
= maybe notFound found . getDecodingFct $ enc
where
found df
= traceMsg 2 ("decodeDocument: encoding is " ++ show enc)
>>>
processChildren (decodeText df)
>>>
addAttr transferEncoding enc
notFound
= issueFatal ("encoding scheme not supported: " ++ show enc)
>>>
setDocumentStatusFromSystemState "decoding document"
decodeText df
= getText
>>> arr df
>>> ( ( (normalizeNL . fst) ^>> mkText )
<+>
( arrL snd
>>>
arr ((enc ++) . (" encoding error" ++))
>>>
applyA (arr issueErr)
>>>
none
)
)