{-# LANGUAGE OverloadedStrings, DeriveDataTypeable #-} module Text.XML.ToJSON ( {-| This library provide a way to convert xml to json. Further more, by combining with aeson's parsing facility, it provide a way to parse xml to haskell data type. -} parseXML , xmlToJSON , JSONParseError -- * streamlined api , bsSourceToJSON , bsRSourceToJSON -- * utils , tokenToBuilder , elementToJSON , tokensToJSON ) where import Control.Monad (when, liftM) import Control.Arrow (second) import Control.Exception (Exception) import Control.Applicative ( (*>), (<|>) ) import Data.Typeable (Typeable) import Data.Maybe (fromMaybe) import qualified Data.CaseInsensitive as CI import qualified Data.Text as T import qualified Data.ByteString.Lazy as L import Data.ByteString (ByteString) import qualified Blaze.ByteString.Builder as B import qualified Data.Attoparsec as A import Data.Attoparsec.ByteString.Char8 (char, skipSpace) import Data.Conduit (Source, yield, (=$), ($$++), ($$+-), MonadThrow(monadThrow)) import Data.Conduit.Internal (ResumableSource(ResumableSource)) import qualified Data.Conduit.List as C import qualified Data.Conduit.Attoparsec as C import qualified Data.Conduit.Text as C import qualified Data.HashMap.Strict as HM import qualified Data.Vector as V import Text.HTML.TagStream import qualified Text.HTML.TagStream.Text as T import qualified Text.HTML.TagStream.ByteString as S import Text.XML.ToJSON.Builder import Data.Aeson (Value(..), Object, FromJSON, fromJSON, Result(Error, Success)) -- | Convert tagstream-conduit `Token' to xml element `Builder' tokenToBuilder :: T.Token -> Builder tokenToBuilder (TagOpen s as selfClose) = do beginElement s addAttrs as when selfClose endElement tokenToBuilder (TagClose _) = endElement -- FIXME should match tag name? tokenToBuilder (Text s) = addValue s tokenToBuilder _ = return () -- |Convert xml `Element' to aeson `Value' . -- -- xml attributes and text values are converted to special object attribute @__attributes@ and @__values@. elementToJSON :: Element -> Value elementToJSON (Element as vs cs) = if null as && null cs then String (T.concat vs) else Object $ HM.fromListWith mergeObject $ attrs ++ values ++ map (second elementToJSON) cs where attrs = if null as then [] else [("__attributes", Object (attrsToObject as))] values = if null vs then [] else [("__values", Array (V.fromList (map String vs)))] attrsToObject :: [(T.Text, T.Text)] -> Object attrsToObject = HM.fromList . map (second String) mergeObject :: Value -> Value -> Value mergeObject v (Array arr) = Array (V.cons v arr) mergeObject v1 v2 = Array (V.fromList [v1, v2]) -- |Convert list of tagstream-conduit `Token` to aeson `Value' tokensToJSON :: [T.Token] -> Value tokensToJSON tokens = elementToJSON $ runBuilder (mapM_ tokenToBuilder tokens) -- | Consume a source and convert the content to aeson `Value', it try to inspect xml encoding from first tag. -- -- e.g. @bsSourceToJSON (C.sourceFile path_to_xml_file)@ bsSourceToJSON :: MonadThrow m => Source m ByteString -> m Value bsSourceToJSON src = bsRSourceToJSON (ResumableSource src (return ())) -- | Consume a source and convert the content to aeson `Value', it try to inspect xml encoding from first tag. -- -- e.g. @xmlStreamToJSONResumable (requestBody req)@ bsRSourceToJSON :: MonadThrow m => ResumableSource m ByteString -> m Value bsRSourceToJSON src = do -- try to peek the first tag to find the xml encoding. (src', token) <- src $$++ C.sinkParser (skipBOM *> skipSpace *> char '<' *> S.tag) let (mencoding, src'') = case token of (TagOpen "?xml" as _) -> (lookup "encoding" as, src') _ -> ( Nothing , prependRSrc (yield (B.toByteString (S.showToken id token))) src' ) codec = fromMaybe C.utf8 (mencoding >>= getCodec . CI.mk) liftM tokensToJSON (src'' $$+- (C.decode codec =$ T.tokenStream =$ C.consume)) where skipBOM :: A.Parser () skipBOM = ( A.string "\xff\xfe" <|> A.string "\xef\xbb\xbf" ) *> return () <|> return () prependRSrc :: Monad m => Source m a -> ResumableSource m a -> ResumableSource m a prependRSrc src (ResumableSource src' close) = ResumableSource (src >> src') close getCodec :: CI.CI ByteString -> Maybe C.Codec getCodec c = case c of "utf-8" -> Just C.utf8 "utf8" -> Just C.utf8 "iso8859" -> Just C.iso8859_1 _ -> Nothing newtype JSONParseError = JSONParseError String deriving (Typeable, Show) instance Exception JSONParseError -- | parse xml to haskell data type by using aeson's `FromJSON'. parseXML :: (MonadThrow m, FromJSON a) => L.ByteString -> m a parseXML s = xmlToJSON s >>= convert where convert v = case fromJSON v of Error err -> monadThrow (JSONParseError err) Success a -> return a -- | convert lazy xml `ByteString' to aeson `Value'. xmlToJSON :: MonadThrow m => L.ByteString -> m Value xmlToJSON s = bsSourceToJSON (C.sourceList (L.toChunks s))