-- ------------------------------------------------------------ {- | Module : Text.XML.HXT.Arrow.ReadDocument Copyright : Copyright (C) 2005 Uwe Schmidt License : MIT Maintainer : Uwe Schmidt (uwe@fh-wedel.de) Stability : experimental Portability: portable Version : $Id: ReadDocument.hs,v 1.10 2006/11/24 07:41:37 hxml Exp $ Compound arrows for reading an XML\/HTML document or an XML\/HTML string -} -- ------------------------------------------------------------ module Text.XML.HXT.Arrow.ReadDocument ( readDocument , readFromDocument , readString , readFromString , hread , xread ) where import Control.Arrow.ListArrows import Text.XML.HXT.Arrow.DOMInterface import Text.XML.HXT.Arrow.XmlArrow import Text.XML.HXT.Arrow.XmlIOStateArrow import Text.XML.HXT.Arrow.Edit ( canonicalizeAllNodes , canonicalizeForXPath , canonicalizeContents , removeDocWhiteSpace ) import Text.XML.HXT.Arrow.ParserInterface import Text.XML.HXT.Arrow.ProcessDocument ( getDocumentContents , parseXmlDocument , parseHtmlDocument , propagateAndValidateNamespaces ) import Text.XML.HXT.RelaxNG.Validator ( validateDocumentWithRelaxSchema ) -- ------------------------------------------------------------ {- | the main document input filter this filter can be configured by an option list, a value of type "Attributes" available options: * 'a_parse_html': use HTML parser, else use XML parser (default) - 'a_validate' : validate document againsd DTD (default), else skip validation - 'a_relax_schema' : validate document with Relax NG, the options value is the schema URI this implies using XML parser, no validation against DTD, and canonicalisation - 'a_check_namespaces' : check namespaces, else skip namespace processing (default) - 'a_canonicalize' : canonicalize document (default), else skip canonicalization - 'a_preserve_comment' : preserve comments during canonicalization, else remove comments (default) - 'a_remove_whitespace' : remove all whitespace, used for document indentation, else skip this step (default) - 'a_indent' : indent document by inserting whitespace, else skip this step (default) - 'a_issue_warnings' : issue warnings, when parsing HTML (default), else ignore HTML parser warnings - 'a_issue_errors' : issue all error messages on stderr (default), or ignore all error messages (default) - 'a_trace' : trace level: values: 0 - 4 - 'a_proxy' : proxy for http access, e.g. www-cache:3128 - 'a_use_curl' : for http access via external programm curl, default is native HTTP access - 'a_options_curl' : more options for external program curl - 'a_encoding' : default document encoding ('utf8', 'isoLatin1', 'usAscii', 'iso8859_2', ... , 'iso8859_16', ...) All attributes not evaluated by readDocument are stored in the created document root node for easy access of the various options in e.g. the input\/output modules If the document name is the empty string or an uri of the form \"stdin:\", the document is read from standard input. examples: > readDocument [ ] "test.xml" reads and validates a document \"test.xml\", no namespace propagation, only canonicalization is performed > readDocument [ (a_validate, "0") > , (a_encoding, isoLatin1) > ] "test.xml" reads document \"test.xml\" without validation, default encoding 'isoLatin1'. > readDocument [ (a_parse_html, "1") > , (a_encoding, isoLatin1) > ] "" reads a HTML document from standard input, no validation is done when parsing HTML, default encoding is 'isoLatin1' > readDocument [ (a_parse_html, "1") > , (a_proxy, "www-cache:3128") > , (a_curl, "1") > , (a_issue_warnings, "0") > ] "http://www.haskell.org/" reads Haskell homepage with HTML parser ignoring any warnings, with http access via external program curl and proxy \"www-cache\" at port 3128 > readDocument [ (a_validate, "1") > , (a_check_namespace, "1") > , (a_remove_whitespace, "1") > , (a_trace, "2") > ] "http://www.w3c.org/" read w3c home page (xhtml), validate and check namespaces, remove whitespace between tags, trace activities with level 2 for minimal complete examples see 'Text.XML.HXT.Arrow.WriteDocument.writeDocument' and 'runX', the main starting point for running an XML arrow. -} readDocument :: Attributes -> String -> IOStateArrow s b XmlTree readDocument userOptions src = traceLevel >>> addInputOptionsToSystemState >>> getDocumentContents remainingOptions src >>> parse >>> checknamespaces >>> canonicalize >>> whitespace >>> relax >>> traceMsg 1 ("readDocument: " ++ show src ++ " processed") >>> traceSource >>> traceTree where parse | validateWithRelax = parseXmlDocument False -- for Relax NG use XML parser without validation | hasOption a_parse_html = parseHtmlDocument -- parse as HTML (hasOption a_issue_warnings) | otherwise = parseXmlDocument (hasOption a_validate) -- parse as XML checknamespaces | hasOption a_check_namespaces || validateWithRelax = propagateAndValidateNamespaces | otherwise = this canonicalize | validateWithRelax = canonicalizeAllNodes | hasOption a_canonicalize && hasOption a_preserve_comment = canonicalizeForXPath | hasOption a_canonicalize = canonicalizeAllNodes | otherwise = this relax | validateWithRelax = validateDocumentWithRelaxSchema remainingOptions relaxSchema | otherwise = this whitespace | hasOption a_remove_whitespace = removeDocWhiteSpace | otherwise = this addInputOptionsToSystemState = addSysOptions (filter ((`elem` httpOptions) . fst) remainingOptions) where addSysOptions = seqA . map (uncurry setParamString) traceLevel = maybe this (setTraceLevel . read) . lookup a_trace $ options validateWithRelax = hasEntry a_relax_schema options relaxSchema = lookup1 a_relax_schema options hasOption n = optionIsSet n options options = addEntries userOptions defaultOptions defaultOptions = [ ( a_parse_html, v_0 ) , ( a_validate, v_1 ) , ( a_issue_warnings, v_1 ) , ( a_check_namespaces, v_0 ) , ( a_canonicalize, v_1 ) , ( a_preserve_comment, v_0 ) , ( a_remove_whitespace, v_0 ) ] httpOptions = [a_proxy, a_use_curl, a_options_curl] remainingOptions = filter (not . flip hasEntry defaultOptions . fst) options -- ------------------------------------------------------------ -- | -- the arrow version of 'readDocument', the arrow input is the source URI readFromDocument :: Attributes -> IOStateArrow s String XmlTree readFromDocument userOptions = applyA ( arr $ \ s -> readDocument userOptions s ) -- ------------------------------------------------------------ -- | -- read a document that is stored in a normal Haskell String -- -- the same function as readDocument, but the parameter forms the input. -- All options available for 'readDocument' are applicable for readString. -- -- Default encoding: No encoding is done, the String argument is taken as Unicode string readString :: Attributes -> String -> IOStateArrow s b XmlTree readString userOptions content = readDocument ( (a_encoding, unicodeString) : userOptions ) (stringProtocol ++ content) -- ------------------------------------------------------------ -- | -- the arrow version of 'readString', the arrow input is the source URI readFromString :: Attributes -> IOStateArrow s String XmlTree readFromString userOptions = applyA ( arr $ \ s -> readString userOptions s ) -- ------------------------------------------------------------ -- | -- parse a string as HTML content, substitute all HTML entity refs and canonicalize tree -- (substitute char refs, ...). Errors are ignored. -- -- A simpler version of 'readFromString' but with less functionality. -- Does not run in the IO monad hread :: ArrowXml a => a String XmlTree hread = parseHtmlContent >>> substHtmlEntityRefs >>> processTopDown ( none `when` isError ) >>> canonicalizeContents -- | -- parse a string as XML content, substitute all predefined XML entity refs and canonicalize tree -- (substitute char refs, ...) xread :: ArrowXml a => a String XmlTree xread = parseXmlContent >>> substXmlEntityRefs >>> canonicalizeContents -- ------------------------------------------------------------