{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}

module Xmlbf.Xeno
 ( fromXenoNode
 , fromRawXml
 ) where

import qualified Data.Bifunctor as Bif
import qualified Data.ByteString as B
import qualified Data.HashMap.Strict as HM
import Data.Monoid ((<>))
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Builder as TB
import qualified Data.Text.Encoding as T
import Data.Traversable (for)
import qualified HTMLEntities.Decoder
import qualified Xeno.DOM as Xeno
import qualified Xmlbf

--------------------------------------------------------------------------------
-- Xeno support

-- | Convert a 'Xeno.Node' from "Xeno.DOM" into an 'Element' from "Xmlbf".
fromXenoNode
  :: Xeno.Node -- ^ A 'Xeno.Node' from "Xeno.DOM".
  -> Either String Xmlbf.Node -- ^ A 'Xmlbf.Node' from "Xmlbf".
fromXenoNode x = do
  n <- decodeUtf8 (Xeno.name x)
  as <- for (Xeno.attributes x) $ \(k,v) -> do
     (,) <$> decodeUtf8 k <*> unescapeXmlUtf8 v
  cs <- for (Xeno.contents x) $ \case
     Xeno.Element n1 -> fromXenoNode n1
     Xeno.Text bs -> Xmlbf.text' =<< unescapeXmlUtf8Lazy bs
     Xeno.CData bs -> Xmlbf.text' =<< decodeUtf8Lazy bs
  Xmlbf.element' n (HM.fromList as) cs

-- | Parses a given UTF8-encoded raw XML fragment into @a@, using the @xeno@
-- Haskell library, so all of @xeno@'s parsing quirks apply.
--
-- You can provide the output of this function as input to "Xmlbf"'s
-- 'Xmlbf.runParser'.
--
-- The given XML can contain more zero or more text or element nodes.
--
-- Surrounding whitespace is not stripped.
fromRawXml
  :: B.ByteString                 -- ^ Raw XML fragment.
  -> Either String [Xmlbf.Node]   -- ^ 'Xmlbf.Node's from "Xmlbf"
fromRawXml = \bs -> case Xeno.parse ("<x>" <> dropBomUtf8 bs <> "</x>") of
  Left e -> Left ("Malformed XML: " ++ show e)
  Right n -> fromXenoNode n >>= \(Xmlbf.Element "x" _ cs) -> pure cs

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
-- Miscellaneous

decodeUtf8 :: B.ByteString -> Either String T.Text
{-# INLINE decodeUtf8 #-}
decodeUtf8 bs = Bif.first show (T.decodeUtf8' bs)

decodeUtf8Lazy :: B.ByteString -> Either String TL.Text
{-# INLINE decodeUtf8Lazy #-}
decodeUtf8Lazy bs = fmap TL.fromStrict (decodeUtf8 bs)

unescapeXmlUtf8 :: B.ByteString -> Either String T.Text
{-# INLINE unescapeXmlUtf8 #-}
unescapeXmlUtf8 bs = fmap TL.toStrict (unescapeXmlUtf8Lazy bs)

unescapeXmlUtf8Lazy :: B.ByteString -> Either String TL.Text
{-# INLINE unescapeXmlUtf8Lazy #-}
unescapeXmlUtf8Lazy bs = do
   t <- decodeUtf8 bs
   pure (TB.toLazyText (HTMLEntities.Decoder.htmlEncodedText t))

dropBomUtf8 :: B.ByteString -> B.ByteString
{-# INLINE dropBomUtf8 #-}
dropBomUtf8 bs | B.isPrefixOf "\xEF\xBB\xBF" bs = B.drop 3 bs
               | otherwise = bs