-- ------------------------------------------------------------ {- | Module : Text.XML.HXT.Arrow.ExpatInterface Copyright : Copyright (C) 2010 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental Portability: portable Internal Interface for Expat Parser -} -- ------------------------------------------------------------ module Text.XML.HXT.Arrow.ExpatInterface where import Text.XML.Expat.Tree import Text.XML.HXT.Core import qualified Text.XML.HXT.Core as X import qualified Text.XML.HXT.DOM.XmlNode as XN import Text.XML.HXT.Arrow.XmlState.TypeDefs import Text.XML.HXT.Parser.XhtmlEntities ( xhtmlEntities ) -- import Debug.Trace -- ------------------------------------------------------------ {- | The system config option to enable the expat parser Here is an example, how to use it: > ... > import Text.HXT.XML.Core > import Text.HXT.XML.Expat > ... > > readDocument [ withExpat True ] "some-file.xml" > ... reads the given document and parses it with the expat parser. There is no validation enabled. The parameter to @withExpat@ determines, whether parsing is done strict. Here strict parsing is enabled. When strict parsing is used, the parse is immediately checked for parser errors, and possible errors are issued. When set to non-strict parsing, error checking is delayed and may be done later with the @issueExpatErr@ arrow. When HTML parsing is enabled, the expat parser will be configured with the HTML enitity reference resolver, else only the predefined XML enitities will be substituted. -} withExpat :: Bool -> SysConfig withExpat strict = setS (theExpat .&&&. theTagSoup .&&&. theExpatParser .&&&. theExpatErrors ) (True, (False, (parseExpat strict, none))) -- | Turns off expat parsing. The build in HXT parsers will be used. withoutExpat :: SysConfig withoutExpat = setS theExpat False -- ------------------------------------------------------------ parseExpat :: Bool -> IOSArrow XmlTree XmlTree parseExpat strict = parse1 $<< ( getAttrValue transferEncoding &&& getSysVar theLowerCaseNames ) where parse1 enc isHtml = traceMsg 1 ( "parse document with expat parser, encoding is " ++ show enc ++ ", issue errors is " ++ show strict ++ ", HTML entity subst is " ++ show isHtml ) >>> ( substContents $< parse2 ) where substContents (t, e) | strict = case e of Nothing -> setChildren [t] Just _ -> ee e >>> setChildren [] | otherwise = perform ( constA (ee e) >>> traceMsg 1 "set expat error" >>> setSysVar theExpatErrors >>> none ) >>> setChildren [t] where ee Nothing = none ee (Just (XMLParseError msg loc)) = issueErr ("Expat error at " ++ show (xmlLineNumber loc) ++ ":" ++ show (xmlColumnNumber loc) ++ ":" ++ msg ) parse2 :: IOSArrow XmlTree (XmlTree, Maybe XMLParseError) parse2 = xshowBlob X.getChildren -- expat parser wants bytestrings as input >>> arr ( parse parseOptions >>> first uNodeStringToXmlTree ) parseOptions | isHtml = parseO { entityDecoder = Just htmlEncoder } | otherwise = parseO where parseO = defaultParseOptions { overrideEncoding = expatEnc } htmlEncoder :: String -> Maybe String htmlEncoder ent = -- traceShow ("\n" ++ ent ++ "\n") $ fmap (toEnum >>> (:[])) . lookup ent $ xhtmlEntities expatEnc = lookup enc [ (X.usAscii, ASCII) , (X.utf8, UTF8) , (X.utf16, UTF16) , (X.isoLatin1, ISO88591) ] -- | In case of lazy parsing check for possible errors issueExpatErr :: IOStateArrow s b b issueExpatErr = withoutUserState $ perform $ constA undefined >>> applyA (getSysVar theExpatErrors) -- ------------------------------------------------------------ uNodeStringToXmlTree :: UNode String -> XmlTree uNodeStringToXmlTree (Element n al cl) = XN.mkElement (mkName n) (map (\ (an, av) -> XN.mkAttr (mkName an) [XN.mkText av]) al) (map uNodeStringToXmlTree cl) uNodeStringToXmlTree (Text t) = XN.mkText t -- ------------------------------------------------------------