{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
module Xmlbf.Xeno
( fromXenoNode
, fromRawXml
) where
import qualified Data.Bifunctor as Bif
import qualified Data.ByteString as B
import qualified Data.HashMap.Strict as HM
import Data.Monoid ((<>))
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as TB
import qualified Data.Text.Encoding as T
import Data.Traversable (for)
import qualified HTMLEntities.Decoder
import qualified Xeno.DOM as Xeno
import qualified Xmlbf
fromXenoNode
:: Xeno.Node
-> Either String Xmlbf.Node
fromXenoNode x = do
n <- decodeUtf8 (Xeno.name x)
as <- for (Xeno.attributes x) $ \(k,v) -> do
(,) <$> decodeUtf8 k <*> unescapeXmlUtf8 v
cs <- for (Xeno.contents x) $ \case
Xeno.Element n1 -> fromXenoNode n1
Xeno.Text bs -> Xmlbf.text' =<< unescapeXmlUtf8Lazy bs
Xeno.CData bs -> Xmlbf.text' =<< decodeUtf8Lazy bs
Xmlbf.element' n (HM.fromList as) cs
fromRawXml
:: B.ByteString
-> Either String [Xmlbf.Node]
fromRawXml = \bs -> case Xeno.parse ("<x>" <> dropBomUtf8 bs <> "</x>") of
Left e -> Left ("Malformed XML: " ++ show e)
Right n -> fromXenoNode n >>= \(Xmlbf.Element "x" _ cs) -> pure cs
decodeUtf8 :: B.ByteString -> Either String T.Text
{-# INLINE decodeUtf8 #-}
decodeUtf8 bs = Bif.first show (T.decodeUtf8' bs)
decodeUtf8Lazy :: B.ByteString -> Either String TL.Text
{-# INLINE decodeUtf8Lazy #-}
decodeUtf8Lazy bs = fmap TL.fromStrict (decodeUtf8 bs)
unescapeXmlUtf8 :: B.ByteString -> Either String T.Text
{-# INLINE unescapeXmlUtf8 #-}
unescapeXmlUtf8 bs = fmap TL.toStrict (unescapeXmlUtf8Lazy bs)
unescapeXmlUtf8Lazy :: B.ByteString -> Either String TL.Text
{-# INLINE unescapeXmlUtf8Lazy #-}
unescapeXmlUtf8Lazy bs = do
t <- decodeUtf8 bs
pure (TB.toLazyText (HTMLEntities.Decoder.htmlEncodedText t))
dropBomUtf8 :: B.ByteString -> B.ByteString
{-# INLINE dropBomUtf8 #-}
dropBomUtf8 bs | B.isPrefixOf "\xEF\xBB\xBF" bs = B.drop 3 bs
| otherwise = bs