module Text.XML.ToJSON
( elementToJSON
, tokensToJSON
, xmlToJSON
) where
import Control.Monad (when)
import Control.Arrow (second)
import Control.Applicative ( (<$>), (*>) )
import Data.Maybe (fromMaybe)
import qualified Data.CaseInsensitive as CI
import qualified Data.Text as T
import Data.ByteString (ByteString)
import qualified Blaze.ByteString.Builder as B
import Data.Attoparsec.ByteString.Char8 (char)
import Data.Conduit
import Data.Conduit.Internal (ResumableSource(ResumableSource))
import qualified Data.Conduit.List as C
import qualified Data.Conduit.Attoparsec as C
import qualified Data.Conduit.Text as C
import qualified Data.HashMap.Strict as HM
import qualified Data.Vector as V
import Text.HTML.TagStream
import qualified Text.HTML.TagStream.Text as T
import qualified Text.HTML.TagStream.ByteString as S
import Text.XML.ToJSON.Builder
import Data.Aeson (Value(..), Object)
tokenToBuilder :: T.Token -> Builder
tokenToBuilder (TagOpen s as selfClose) = do
beginElement s
addAttrs as
when selfClose endElement
tokenToBuilder (TagClose _) = endElement
tokenToBuilder (Text s) = addValue s
tokenToBuilder _ = return ()
attrsToObject :: [(Str, Str)] -> Object
attrsToObject = HM.fromList . map (second String)
mergeObject :: Value -> Value -> Value
mergeObject (Array arr) v = Array (V.cons v arr)
mergeObject v1 v2 = Array (V.fromList [v1, v2])
elementToJSON :: Element -> Value
elementToJSON (Element as vs cs) =
if null as && null cs
then
String (T.concat vs)
else
Object $ HM.fromListWith mergeObject
$ attrs
++ values
++ map (second elementToJSON) cs
where
attrs = if null as
then []
else [("__attributes", Object (attrsToObject as))]
values = if null vs
then []
else [("__values", Array (V.fromList (map String vs)))]
tokensToJSON :: [T.Token] -> Value
tokensToJSON tokens =
elementToJSON $ runBuilder (mapM_ tokenToBuilder tokens)
xmlToJSON :: (Functor m, Monad m, MonadThrow m) => Source m ByteString -> m Value
xmlToJSON src = do
(src', token) <- src $$+ C.sinkParser (char '<' *> S.tag)
let (mencoding, src'') =
case token of
(TagOpen "?xml" as _) ->
(lookup "encoding" as, src')
_ ->
( Nothing
, prependRSrc
(yield (B.toByteString (S.showToken id token)))
src'
)
codec = fromMaybe C.utf8 (mencoding >>= getCodec . CI.mk)
tokensToJSON <$> (src'' $$+- (C.decode codec =$ T.tokenStream =$ C.consume))
prependRSrc :: Monad m
=> Source m a
-> ResumableSource m a
-> ResumableSource m a
prependRSrc src (ResumableSource src' close) = ResumableSource (src >> src') close
getCodec :: CI.CI ByteString -> Maybe C.Codec
getCodec c =
case c of
"utf-8" -> Just C.utf8
"utf8" -> Just C.utf8
"gbk" -> Just C.iso8859_1
_ -> Nothing