module Text.XML.HXT.Helper -- == EXPORTS ================================================================= (XmlValidator ,XmlEncoding, xmlEncodingFromString, xmlEncodingToString ,serializeXml, parseXml, ParseXmlConfig(..), defaultParseXmlConfig, xmlEncoding ,pickle, pickleWithEnc, pickleStr, unpickle, unpickleStr, unpickleDocM ,validateAndUnpickle ,PUCase, xpOne, xpMany, xpTagSwitch, xpCase, xpCaseConst, xpSwitch ,xpTextFixed, xpConsumeAll, xpExhaustiveList, xpIgnoreOnUnpickle, xpEither ,xpByteString ,_ISO8859_1_, _ISO8859_2_, _ISO8859_3_, _ISO8859_4_, _ISO8859_5_ ,_ISO8859_6_, _ISO8859_7_, _ISO8859_8_, _ISO8859_9_, _ISO8859_10_ ,_ISO8859_11_, _ISO8859_13_, _ISO8859_14_, _ISO8859_15_, _ISO8859_16_ ,_USASCII_, _UCS2_, _UTF8_, _UTF16_, _UTF16BE_, _UTF16LE_, _ISOLATIN1_ ,_UNICODE_) where -- == IMPORTS ================================================================= ---------------------------------------- -- STDLIB ---------------------------------------- import Control.Monad.Error () import qualified Control.Monad as M import Data.Char (toLower) import Data.Maybe (fromMaybe) import Data.List (elemIndex) ---------------------------------------- -- SITE-PACKAGES ---------------------------------------- import Control.Arrow.ArrowIf (when) import Control.Arrow.ArrowList (this,none) import Control.Arrow.ArrowTree (processChildren, processTopDown) import Data.Tree.NTree.TypeDefs (NTree(..)) import Text.XML.HXT.Arrow ( XmlTree , runLA, constA, (>>>), arr , replaceChildren,xshow,mkText,escapeXmlDoc , getChildren) import Text.XML.HXT.Arrow.Namespace (propagateNamespaces,uniqueNamespacesFromDeclAndQNames) import Text.XML.HXT.Arrow.Pickle (PU(..), unpickleDoc, pickleDoc ,xpElem, xpList, xpAlt, xpWrap, xpWrapMaybe, xpText, xpText0) import Text.XML.HXT.Arrow.Pickle.Schema (scFixed) import Text.XML.HXT.Arrow.Pickle.Xml (St(..)) import Text.XML.HXT.Arrow.ParserInterface (parseXmlDoc,substXmlEntityRefs) import Text.XML.HXT.Arrow.Edit (canonicalizeContents, removeDocWhiteSpace, removeAllComment ,indentDoc) import Text.XML.HXT.Arrow.XmlArrow (ArrowXml, root, isPi, hasName) import Text.XML.HXT.Arrow.DocumentOutput (encodeDocument') import Text.XML.HXT.DOM.TypeDefs (XNode(..)) import Text.XML.HXT.DOM.XmlKeywords (t_xml ,iso8859_1, iso8859_2, iso8859_3, iso8859_4, iso8859_5, iso8859_6 ,iso8859_7, iso8859_8, iso8859_9, iso8859_10, iso8859_11, iso8859_13 ,iso8859_14, iso8859_15, iso8859_16, usAscii, ucs2, utf8, utf16, utf16be ,utf16le, unicodeString, isoLatin1) import qualified Data.ByteString.Lazy.Char8 as BSLChar import qualified Data.ByteString.Lazy as BSL import Data.Encoding (DynEncoding, encodingFromStringExplicit, decodeLazyByteStringExplicit) -- == TYPES =================================================================== type XmlValidator = BSLChar.ByteString -> Either String () newtype XmlEncoding = XmlEncoding { xmlEncodingName :: String } deriving (Eq) instance Show XmlEncoding where show (XmlEncoding name) = name _UNICODE_ :: XmlEncoding _UNICODE_ = XmlEncoding unicodeString _ISO8859_1_ :: XmlEncoding _ISO8859_1_ = XmlEncoding iso8859_1 _ISO8859_2_ :: XmlEncoding _ISO8859_2_ = XmlEncoding iso8859_2 _ISO8859_3_ :: XmlEncoding _ISO8859_3_ = XmlEncoding iso8859_3 _ISO8859_4_ :: XmlEncoding _ISO8859_4_ = XmlEncoding iso8859_4 _ISO8859_5_ :: XmlEncoding _ISO8859_5_ = XmlEncoding iso8859_5 _ISO8859_6_ :: XmlEncoding _ISO8859_6_ = XmlEncoding iso8859_6 _ISO8859_7_ :: XmlEncoding _ISO8859_7_ = XmlEncoding iso8859_7 _ISO8859_8_ :: XmlEncoding _ISO8859_8_ = XmlEncoding iso8859_8 _ISO8859_9_ :: XmlEncoding _ISO8859_9_ = XmlEncoding iso8859_9 _ISO8859_10_ :: XmlEncoding _ISO8859_10_ = XmlEncoding iso8859_10 _ISO8859_11_ :: XmlEncoding _ISO8859_11_ = XmlEncoding iso8859_11 _ISO8859_13_ :: XmlEncoding _ISO8859_13_ = XmlEncoding iso8859_13 _ISO8859_14_ :: XmlEncoding _ISO8859_14_ = XmlEncoding iso8859_14 _ISO8859_15_ :: XmlEncoding _ISO8859_15_ = XmlEncoding iso8859_15 _ISO8859_16_ :: XmlEncoding _ISO8859_16_ = XmlEncoding iso8859_16 _USASCII_ :: XmlEncoding _USASCII_ = XmlEncoding usAscii _UCS2_ :: XmlEncoding _UCS2_ = XmlEncoding ucs2 _UTF8_ :: XmlEncoding _UTF8_ = XmlEncoding utf8 _UTF16_ :: XmlEncoding _UTF16_ = XmlEncoding utf16 _UTF16BE_ :: XmlEncoding _UTF16BE_ = XmlEncoding utf16be _UTF16LE_ :: XmlEncoding _UTF16LE_ = XmlEncoding utf16le _ISOLATIN1_ :: XmlEncoding _ISOLATIN1_ = XmlEncoding isoLatin1 -- == ENCODING FUNCTIONS ====================================================== xmlEncodingFromString :: String -> Either String XmlEncoding xmlEncodingFromString name = case map toLower name of "latin1" -> return _ISOLATIN1_ "latin9" -> return _ISO8859_15_ "iso-8859-1" -> return _ISO8859_1_ "iso-8859-2" -> return _ISO8859_2_ "iso-8859-3" -> return _ISO8859_3_ "iso-8859-4" -> return _ISO8859_4_ "iso-8859-5" -> return _ISO8859_5_ "iso-8859-6" -> return _ISO8859_6_ "iso-8859-7" -> return _ISO8859_7_ "iso-8859-8" -> return _ISO8859_8_ "iso-8859-9" -> return _ISO8859_9_ "iso-8859-10" -> return _ISO8859_10_ "iso-8859-11" -> return _ISO8859_11_ "iso-8859-13" -> return _ISO8859_13_ "iso-8859-14" -> return _ISO8859_14_ "iso-8859-15" -> return _ISO8859_15_ "utf8" -> return _UTF8_ "utf-8" -> return _UTF8_ "utf16" -> return _UTF16_ "utf-16" -> return _UTF16_ "ascii" -> return _USASCII_ "us-ascii" -> return _USASCII_ "unicode" -> return _UNICODE_ _ -> fail $ "Can't parse unsupported XmlEncoding name `" ++ name ++ "'." xmlEncodingToString :: XmlEncoding -> String xmlEncodingToString = show xmlEncoding :: Monad m => BSL.ByteString -> m DynEncoding xmlEncoding bs = let enc = case parseProlog of Nothing -> "UTF-8" Just s -> s in case encodingFromStringExplicit enc of Just e -> return e Nothing -> fail $ "unknown encoding: " ++ enc where parseProlog = do M.when (BSLChar.take 5 bs /= BSLChar.pack "") bs (_, rest) <- breakSubstring' (BSLChar.pack "encoding") header let notQuote = \c -> c /= '"' && c /= '\'' enc = (BSLChar.takeWhile notQuote . -- get the encoding BSLChar.drop 1 . -- drop quote BSLChar.dropWhile notQuote) -- skip to first quote rest return $ BSLChar.unpack enc breakSubstring' x bs = let (prefix, y) = breakSubstring x bs in if x `BSL.isPrefixOf` y then return (prefix, BSL.drop (BSL.length x) y) else fail ("breakSubstring failed: substring " ++ show x ++ " not contained in " ++ show bs) breakSubstring x bs = if x `BSL.isPrefixOf` bs then (BSL.empty, bs) else case BSL.uncons bs of Nothing -> (BSL.empty, BSL.empty) Just (c, bs') -> let (bs1, bs2) = breakSubstring x bs' in (BSL.cons c bs1, bs2) -- == XML FUNCTIONS =========================================================== xpOne :: PU a -> PU a xpOne = id xpMany :: String -> PU a -> PU [a] xpMany rootName pu = xpElem rootName (xpList pu) xpByteString :: PU BSL.ByteString xpByteString = xpWrap (BSLChar.pack, BSLChar.unpack) xpText0 pickle :: PU a -> a -> BSLChar.ByteString pickle pu = serializeXml _UTF8_ . pickleDoc pu pickleWithEnc :: XmlEncoding -> PU a -> a -> BSLChar.ByteString pickleWithEnc enc pu = serializeXml' noProlog enc . pickleDoc pu where noProlog = enc == _UNICODE_ unpickle :: Monad m => PU a -> BSLChar.ByteString -> m a unpickle pu bsl = parseXml defaultParseXmlConfig bsl >>= unpickleDocM pu validateAndUnpickle :: XmlValidator -> PU a -> BSLChar.ByteString -> Either String a validateAndUnpickle val pu bsl = case val bsl of Left s -> Left s _ -> case parseXml defaultParseXmlConfig bsl of Right t -> unpickleDocM pu t Left s -> Left s pickleStr :: PU a -> a -> String pickleStr pu = BSLChar.unpack . pickleWithEnc _UNICODE_ pu unpickleDocM :: Monad m => PU a -> XmlTree -> m a unpickleDocM pu t = case unpickleDoc pu t of Nothing -> fail "Unpickling failed." Just x -> return x unpickleStr :: Monad m => PU a -> [Char] -> m a unpickleStr pu = unpickle pu . BSLChar.pack serializeXml :: XmlEncoding -> XmlTree -> BSLChar.ByteString serializeXml enc xml = serializeXml' False enc xml serializeXml' :: Bool -> XmlEncoding -> XmlTree -> BSLChar.ByteString serializeXml' indent enc xml = let [str] = runLA (constA xml >>> encodeA) (error "serializeXml': undefined") in (BSLChar.pack (strip str)) where rmpi = enc == _UNICODE_ encodeA = processChildren (uniqueNamespacesFromDeclAndQNames) >>> (if indent then processChildren indentDoc else this) >>> escapeXmlDoc >>> encodeDocument' rmpi (xmlEncodingName enc) >>> replaceChildren (xshow getChildren >>> arr encode >>> mkText) >>> xshow getChildren encode = id strip = reverse . dropWhile isSpace . reverse . dropWhile isSpace isSpace = (`elem` " \n\t") data ParseXmlConfig = ParseXmlConfig { pxml_removeProcessingInstructions :: Bool , pxml_removeNonSignificantWhitespace :: Bool , pxml_removeComments :: Bool } defaultParseXmlConfig :: ParseXmlConfig defaultParseXmlConfig = ParseXmlConfig True True True parseXml :: Monad m => ParseXmlConfig -> BSLChar.ByteString -> m XmlTree parseXml cfg bstr = do enc <- xmlEncoding bstr case decodeLazyByteStringExplicit enc bstr of Left err -> fail (show err) Right str -> let trees = runLA (pipe str) (error "parseXml: undefined") in case trees of [] -> fail "empty result" (NTree _ ((NTree (XError _ msg) _) : _) : _) -> fail msg trees' -> return (last trees') where pipe str = root [] [] >>> replaceChildren (parse str) >>> (if pxml_removeProcessingInstructions cfg then removeXmlPi else this) >>> (if pxml_removeNonSignificantWhitespace cfg then removeDocWhiteSpace else this) >>> (if pxml_removeComments cfg then removeAllComment else this) parse str = constA ("urn:Data.ByteString", str) >>> parseXmlDoc >>> substXmlEntityRefs >>> canonicalizeContents >>> propagateNamespaces removeXmlPi :: ArrowXml a => a (NTree XNode) (NTree XNode) removeXmlPi = processTopDown (none `when` (isPi >>> hasName t_xml)) data PUCase a = PUCase { case_value :: a , case_spec :: PU a } xpCase :: (a -> b, b -> a) -> PU a -> PUCase b xpCase wrapfuns@(aToB,_bToA) specA = PUCase value spec where value = aToB (error "xpCase: tagging function requires evaluation") spec = xpWrap wrapfuns specA xpCaseConst :: a -> PU () -> PUCase a xpCaseConst a pu = xpCase (const a, const $ error undef) pu where undef = error "xpCaseConst: this value should have been ignored" xpSwitch :: Show a => [PUCase a] -> PU a xpSwitch = xpTagSwitch (takeWhile (/= ' ') . show) xpTagSwitch :: Eq t => (a -> t) -> [PUCase a] -> PU a xpTagSwitch tag cases = xpAlt idx (map case_spec cases) where idx = fromMaybe err . flip elemIndex tags . tag err = error $ "xpTagSwitch: no case matched" tags = map (tag . case_value) cases xpTextFixed :: String -> PU () xpTextFixed text = (xpWrapMaybe (\t -> if t == text then Just () else Nothing, const text) xpText) { theSchema = scFixed text } xpConsumeAll :: PU a -> PU a xpConsumeAll pu = PU { appPickle = appPickle pu , appUnPickle = \s1 -> case appUnPickle pu s1 of r@(Nothing, _) -> r r@(Just _, s2) -> case contents s2 of [] -> r _ -> (Nothing, s2) , theSchema = theSchema pu } xpIgnoreOnUnpickle :: (PU a -> PU a) -> PU a -> PU a xpIgnoreOnUnpickle tf pu = PU { appPickle = appPickle (tf pu) , appUnPickle = appUnPickle pu , theSchema = theSchema pu } xpExhaustiveList :: PU a -> PU [a] xpExhaustiveList = xpConsumeAll . xpList xpEither :: (Show a, Show b) => PU a -> PU b -> PU (Either a b) xpEither xpL xpR = xpSwitch [ xpCase (Left, \(Left l) -> l) xpL , xpCase (Right, \(Right r) -> r) xpR ]