{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.DOM
    ( eventConduit
    , sinkDoc
    , readFile
    , parseLBS
    , parseBSChunks
    , eventConduitText
    , sinkDocText
    , parseLT
    , parseSTChunks
    ) where

import Control.Monad.Trans.Resource
import Prelude hiding (readFile)
import qualified Data.ByteString as S
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Conduit
import qualified Data.ByteString.Lazy as L
import Data.Maybe (mapMaybe)
import qualified Data.Map.Strict as Map

-- | Converts a stream of bytes to a stream of properly balanced @Event@s.
--
-- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses
-- that case.
eventConduit :: Monad m => ConduitT S.ByteString XT.Event m ()
eventConduit = decodeUtf8LenientC .| eventConduit'

eventConduitText :: Monad m => ConduitT T.Text XT.Event m ()
eventConduitText = eventConduit'

eventConduit' :: Monad m => ConduitT T.Text XT.Event m ()
eventConduit' =
    TS.tokenStream .| go []
  where
    go stack = do
        mx <- await
        case mx of
            Nothing -> closeStack stack

            -- Ignore processing instructions (or pseudo-instructions)
            Just (TS.TagOpen local _ _) | "?" `T.isPrefixOf` local -> go stack

            Just (TS.TagOpen local attrs isClosed) -> do
                let name = toName local
                    attrs' = map (toName *** return . XT.ContentText) $ Map.toList attrs
                yield $ XT.EventBeginElement name attrs'
                if isClosed || isVoid local
                    then yield (XT.EventEndElement name) >> go stack
                    else go $ name : stack
            Just (TS.TagClose name)
                | toName name `elem` stack ->
                    let loop [] = go []
                        loop (n:ns) = do
                            yield $ XT.EventEndElement n
                            if n == toName name
                                then go ns
                                else loop ns
                     in loop stack
                | otherwise -> go stack
            Just (TS.Text t) -> do
                yield $ XT.EventContent $ XT.ContentText t
                go stack
            Just (TS.Comment t) -> do
                yield $ XT.EventComment t
                go stack
            Just TS.Special{} -> go stack
            Just TS.Incomplete{} -> go stack
    toName l = XT.Name l Nothing Nothing
    closeStack = mapM_ (yield . XT.EventEndElement)

    isVoid = flip Set.member $ Set.fromList
        [ "area"
        , "base"
        , "br"
        , "col"
        , "command"
        , "embed"
        , "hr"
        , "img"
        , "input"
        , "keygen"
        , "link"
        , "meta"
        , "param"
        , "source"
        , "track"
        , "wbr"
        ]

sinkDoc :: MonadThrow m => ConduitT S.ByteString o m X.Document
sinkDoc = sinkDoc' eventConduit

sinkDocText :: MonadThrow m => ConduitT T.Text o m X.Document
sinkDocText = sinkDoc' eventConduitText

sinkDoc'
  :: MonadThrow m
  => ConduitT a XT.Event m ()
  -> ConduitT a o m X.Document
sinkDoc' f =
    fmap stripDummy $ mapOutput ((,) Nothing) f .| addDummyWrapper .| X.fromEvents
  where
    addDummyWrapper = do
        yield (Nothing, XT.EventBeginElement "html" [])
        awaitForever yield
        yield (Nothing, XT.EventEndElement "html")

    stripDummy doc@(X.Document pro (X.Element _ _ nodes) epi) =
        case mapMaybe toElement nodes of
            [root] -> X.Document pro root epi
            _ -> doc

    toElement (X.NodeElement e) = Just e
    toElement _ = Nothing

readFile :: FilePath -> IO X.Document
readFile fp = withSourceFile fp $ \src -> runConduit $ src .| sinkDoc

parseLBS :: L.ByteString -> X.Document
parseLBS = parseBSChunks . L.toChunks

parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks tss =
  case runConduit $ CL.sourceList tss .| sinkDoc of
    Left e -> error $ "Unexpected exception in parseBSChunks: " ++ show e
    Right x -> x

parseLT :: TL.Text -> X.Document
parseLT = parseSTChunks . TL.toChunks

parseSTChunks :: [T.Text] -> X.Document
parseSTChunks tss =
  case runConduit $ CL.sourceList tss .| sinkDocText of
    Left e -> error $ "Unexpected exception in parseSTChunks: " ++ show e
    Right x -> x