{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.DOM
    ( eventConduit
    , sinkDoc
    , readFile
    , parseLBS
    , parseBSChunks
    , eventConduitText
    , sinkDocText
    , parseLT
    , parseSTChunks
    ) where

import Control.Monad.Trans.Resource
import Prelude hiding (readFile)
import qualified Data.ByteString as S
import qualified Text.HTML.TagStream as TS
import qualified Data.XML.Types as XT
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Conduit.List as CL
import Control.Arrow ((***))
import qualified Data.Set as Set
import qualified Text.XML as X
import Conduit
import qualified Data.ByteString.Lazy as L
import Data.Maybe (mapMaybe)
import qualified Data.Map.Strict as Map

-- | Converts a stream of bytes to a stream of properly balanced @Event@s.
--
-- Note that there may be multiple (or not) root elements. @sinkDoc@ addresses
-- that case.
eventConduit :: Monad m => ConduitT S.ByteString XT.Event m ()
eventConduit :: ConduitT ByteString Event m ()
eventConduit = ConduitT ByteString Text m ()
forall (m :: * -> *). Monad m => ConduitT ByteString Text m ()
decodeUtf8LenientC ConduitT ByteString Text m ()
-> ConduitM Text Event m () -> ConduitT ByteString Event m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM Text Event m ()
forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduit'

eventConduitText :: Monad m => ConduitT T.Text XT.Event m ()
eventConduitText :: ConduitT Text Event m ()
eventConduitText = ConduitT Text Event m ()
forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduit'

eventConduit' :: Monad m => ConduitT T.Text XT.Event m ()
eventConduit' :: ConduitT Text Event m ()
eventConduit' =
    ConduitT Text Token m ()
forall (m :: * -> *). Monad m => ConduitT Text Token m ()
TS.tokenStream ConduitT Text Token m ()
-> ConduitM Token Event m () -> ConduitT Text Event m ()
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| [Name] -> ConduitM Token Event m ()
go []
  where
    go :: [Name] -> ConduitM Token Event m ()
go [Name]
stack = do
        Maybe Token
mx <- ConduitT Token Event m (Maybe Token)
forall (m :: * -> *) i. Monad m => Consumer i m (Maybe i)
await
        case Maybe Token
mx of
            Maybe Token
Nothing -> [Name] -> ConduitM Token Event m ()
forall i. [Name] -> ConduitT i Event m ()
closeStack [Name]
stack

            -- Ignore processing instructions (or pseudo-instructions)
            Just (TS.TagOpen Text
local Map Text Text
_ Bool
_) | Text
"?" Text -> Text -> Bool
`T.isPrefixOf` Text
local -> [Name] -> ConduitM Token Event m ()
go [Name]
stack

            Just (TS.TagOpen Text
local Map Text Text
attrs Bool
isClosed) -> do
                let name :: Name
name = Text -> Name
toName Text
local
                    attrs' :: [(Name, [Content])]
attrs' = ((Text, Text) -> (Name, [Content]))
-> [(Text, Text)] -> [(Name, [Content])]
forall a b. (a -> b) -> [a] -> [b]
map (Text -> Name
toName (Text -> Name)
-> (Text -> [Content]) -> (Text, Text) -> (Name, [Content])
forall (a :: * -> * -> *) b c b' c'.
Arrow a =>
a b c -> a b' c' -> a (b, b') (c, c')
*** Content -> [Content]
forall (m :: * -> *) a. Monad m => a -> m a
return (Content -> [Content]) -> (Text -> Content) -> Text -> [Content]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Content
XT.ContentText) ([(Text, Text)] -> [(Name, [Content])])
-> [(Text, Text)] -> [(Name, [Content])]
forall a b. (a -> b) -> a -> b
$ Map Text Text -> [(Text, Text)]
forall k a. Map k a -> [(k, a)]
Map.toList Map Text Text
attrs
                Event -> ConduitM Token Event m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Event -> ConduitM Token Event m ())
-> Event -> ConduitM Token Event m ()
forall a b. (a -> b) -> a -> b
$ Name -> [(Name, [Content])] -> Event
XT.EventBeginElement Name
name [(Name, [Content])]
attrs'
                if Bool
isClosed Bool -> Bool -> Bool
|| Text -> Bool
isVoid Text
local
                    then Event -> ConduitM Token Event m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Name -> Event
XT.EventEndElement Name
name) ConduitM Token Event m ()
-> ConduitM Token Event m () -> ConduitM Token Event m ()
forall (m :: * -> *) a b. Monad m => m a -> m b -> m b
>> [Name] -> ConduitM Token Event m ()
go [Name]
stack
                    else [Name] -> ConduitM Token Event m ()
go ([Name] -> ConduitM Token Event m ())
-> [Name] -> ConduitM Token Event m ()
forall a b. (a -> b) -> a -> b
$ Name
name Name -> [Name] -> [Name]
forall a. a -> [a] -> [a]
: [Name]
stack
            Just (TS.TagClose Text
name)
                | Text -> Name
toName Text
name Name -> [Name] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` [Name]
stack ->
                    let loop :: [Name] -> ConduitM Token Event m ()
loop [] = [Name] -> ConduitM Token Event m ()
go []
                        loop (Name
n:[Name]
ns) = do
                            Event -> ConduitM Token Event m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Event -> ConduitM Token Event m ())
-> Event -> ConduitM Token Event m ()
forall a b. (a -> b) -> a -> b
$ Name -> Event
XT.EventEndElement Name
n
                            if Name
n Name -> Name -> Bool
forall a. Eq a => a -> a -> Bool
== Text -> Name
toName Text
name
                                then [Name] -> ConduitM Token Event m ()
go [Name]
ns
                                else [Name] -> ConduitM Token Event m ()
loop [Name]
ns
                     in [Name] -> ConduitM Token Event m ()
loop [Name]
stack
                | Bool
otherwise -> [Name] -> ConduitM Token Event m ()
go [Name]
stack
            Just (TS.Text Text
t) -> do
                Event -> ConduitM Token Event m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Event -> ConduitM Token Event m ())
-> Event -> ConduitM Token Event m ()
forall a b. (a -> b) -> a -> b
$ Content -> Event
XT.EventContent (Content -> Event) -> Content -> Event
forall a b. (a -> b) -> a -> b
$ Text -> Content
XT.ContentText Text
t
                [Name] -> ConduitM Token Event m ()
go [Name]
stack
            Just (TS.Comment Text
t) -> do
                Event -> ConduitM Token Event m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Event -> ConduitM Token Event m ())
-> Event -> ConduitM Token Event m ()
forall a b. (a -> b) -> a -> b
$ Text -> Event
XT.EventComment Text
t
                [Name] -> ConduitM Token Event m ()
go [Name]
stack
            Just TS.Special{} -> [Name] -> ConduitM Token Event m ()
go [Name]
stack
            Just TS.Incomplete{} -> [Name] -> ConduitM Token Event m ()
go [Name]
stack
    toName :: Text -> Name
toName Text
l = Text -> Maybe Text -> Maybe Text -> Name
XT.Name Text
l Maybe Text
forall a. Maybe a
Nothing Maybe Text
forall a. Maybe a
Nothing
    closeStack :: [Name] -> ConduitT i Event m ()
closeStack = (Name -> ConduitT i Event m ()) -> [Name] -> ConduitT i Event m ()
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> m b) -> t a -> m ()
mapM_ (Event -> ConduitT i Event m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Event -> ConduitT i Event m ())
-> (Name -> Event) -> Name -> ConduitT i Event m ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Name -> Event
XT.EventEndElement)

    isVoid :: Text -> Bool
isVoid Text
name = Text -> Set Text -> Bool
forall a. Ord a => a -> Set a -> Bool
Set.member (Text -> Text
T.toLower Text
name) Set Text
voidSet

voidSet :: Set.Set T.Text
voidSet :: Set Text
voidSet = [Text] -> Set Text
forall a. Ord a => [a] -> Set a
Set.fromList
        [ Text
"area"
        , Text
"base"
        , Text
"br"
        , Text
"col"
        , Text
"command"
        , Text
"embed"
        , Text
"hr"
        , Text
"img"
        , Text
"input"
        , Text
"keygen"
        , Text
"link"
        , Text
"meta"
        , Text
"param"
        , Text
"source"
        , Text
"track"
        , Text
"wbr"
        ]

sinkDoc :: MonadThrow m => ConduitT S.ByteString o m X.Document
sinkDoc :: ConduitT ByteString o m Document
sinkDoc = ConduitT ByteString Event m () -> ConduitT ByteString o m Document
forall (m :: * -> *) a o.
MonadThrow m =>
ConduitT a Event m () -> ConduitT a o m Document
sinkDoc' ConduitT ByteString Event m ()
forall (m :: * -> *). Monad m => ConduitT ByteString Event m ()
eventConduit

sinkDocText :: MonadThrow m => ConduitT T.Text o m X.Document
sinkDocText :: ConduitT Text o m Document
sinkDocText = ConduitT Text Event m () -> ConduitT Text o m Document
forall (m :: * -> *) a o.
MonadThrow m =>
ConduitT a Event m () -> ConduitT a o m Document
sinkDoc' ConduitT Text Event m ()
forall (m :: * -> *). Monad m => ConduitT Text Event m ()
eventConduitText

sinkDoc'
  :: MonadThrow m
  => ConduitT a XT.Event m ()
  -> ConduitT a o m X.Document
sinkDoc' :: ConduitT a Event m () -> ConduitT a o m Document
sinkDoc' ConduitT a Event m ()
f =
    (Document -> Document)
-> ConduitT a o m Document -> ConduitT a o m Document
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap Document -> Document
stripDummy (ConduitT a o m Document -> ConduitT a o m Document)
-> ConduitT a o m Document -> ConduitT a o m Document
forall a b. (a -> b) -> a -> b
$ (Event -> (Maybe PositionRange, Event))
-> ConduitT a Event m ()
-> ConduitT a (Maybe PositionRange, Event) m ()
forall (m :: * -> *) o1 o2 i r.
Monad m =>
(o1 -> o2) -> ConduitT i o1 m r -> ConduitT i o2 m r
mapOutput ((,) Maybe PositionRange
forall a. Maybe a
Nothing) ConduitT a Event m ()
f ConduitT a (Maybe PositionRange, Event) m ()
-> ConduitM (Maybe PositionRange, Event) o m Document
-> ConduitT a o m Document
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitT
  (Maybe PositionRange, Event) (Maybe PositionRange, Event) m ()
forall a. ConduitT (Maybe a, Event) (Maybe a, Event) m ()
addDummyWrapper ConduitT
  (Maybe PositionRange, Event) (Maybe PositionRange, Event) m ()
-> ConduitM (Maybe PositionRange, Event) o m Document
-> ConduitM (Maybe PositionRange, Event) o m Document
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM (Maybe PositionRange, Event) o m Document
forall (m :: * -> *) o.
MonadThrow m =>
ConduitT (Maybe PositionRange, Event) o m Document
X.fromEvents
  where
    addDummyWrapper :: ConduitT (Maybe a, Event) (Maybe a, Event) m ()
addDummyWrapper = do
        (Maybe a, Event) -> ConduitT (Maybe a, Event) (Maybe a, Event) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Maybe a
forall a. Maybe a
Nothing, Name -> [(Name, [Content])] -> Event
XT.EventBeginElement Name
"html" [])
        ((Maybe a, Event)
 -> ConduitT (Maybe a, Event) (Maybe a, Event) m ())
-> ConduitT (Maybe a, Event) (Maybe a, Event) m ()
forall (m :: * -> *) i o r.
Monad m =>
(i -> ConduitT i o m r) -> ConduitT i o m ()
awaitForever (Maybe a, Event) -> ConduitT (Maybe a, Event) (Maybe a, Event) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield
        (Maybe a, Event) -> ConduitT (Maybe a, Event) (Maybe a, Event) m ()
forall (m :: * -> *) o i. Monad m => o -> ConduitT i o m ()
yield (Maybe a
forall a. Maybe a
Nothing, Name -> Event
XT.EventEndElement Name
"html")

    stripDummy :: Document -> Document
stripDummy doc :: Document
doc@(X.Document Prologue
pro (X.Element Name
_ Map Name Text
_ [Node]
nodes) [Miscellaneous]
epi) =
        case (Node -> Maybe Element) -> [Node] -> [Element]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe Node -> Maybe Element
toElement [Node]
nodes of
            [Element
root] -> Prologue -> Element -> [Miscellaneous] -> Document
X.Document Prologue
pro Element
root [Miscellaneous]
epi
            [Element]
_ -> Document
doc

    toElement :: Node -> Maybe Element
toElement (X.NodeElement Element
e) = Element -> Maybe Element
forall a. a -> Maybe a
Just Element
e
    toElement Node
_ = Maybe Element
forall a. Maybe a
Nothing

readFile :: FilePath -> IO X.Document
readFile :: FilePath -> IO Document
readFile FilePath
fp = FilePath
-> (ConduitM () ByteString IO () -> IO Document) -> IO Document
forall (m :: * -> *) (n :: * -> *) i a.
(MonadUnliftIO m, MonadIO n) =>
FilePath -> (ConduitM i ByteString n () -> m a) -> m a
withSourceFile FilePath
fp ((ConduitM () ByteString IO () -> IO Document) -> IO Document)
-> (ConduitM () ByteString IO () -> IO Document) -> IO Document
forall a b. (a -> b) -> a -> b
$ \ConduitM () ByteString IO ()
src -> ConduitT () Void IO Document -> IO Document
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void IO Document -> IO Document)
-> ConduitT () Void IO Document -> IO Document
forall a b. (a -> b) -> a -> b
$ ConduitM () ByteString IO ()
src ConduitM () ByteString IO ()
-> ConduitM ByteString Void IO Document
-> ConduitT () Void IO Document
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM ByteString Void IO Document
forall (m :: * -> *) o.
MonadThrow m =>
ConduitT ByteString o m Document
sinkDoc

parseLBS :: L.ByteString -> X.Document
parseLBS :: ByteString -> Document
parseLBS = [ByteString] -> Document
parseBSChunks ([ByteString] -> Document)
-> (ByteString -> [ByteString]) -> ByteString -> Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> [ByteString]
L.toChunks

parseBSChunks :: [S.ByteString] -> X.Document
parseBSChunks :: [ByteString] -> Document
parseBSChunks [ByteString]
tss =
  case ConduitT () Void (Either SomeException) Document
-> Either SomeException Document
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (Either SomeException) Document
 -> Either SomeException Document)
-> ConduitT () Void (Either SomeException) Document
-> Either SomeException Document
forall a b. (a -> b) -> a -> b
$ [ByteString] -> ConduitT () ByteString (Either SomeException) ()
forall (m :: * -> *) a i. Monad m => [a] -> ConduitT i a m ()
CL.sourceList [ByteString]
tss ConduitT () ByteString (Either SomeException) ()
-> ConduitM ByteString Void (Either SomeException) Document
-> ConduitT () Void (Either SomeException) Document
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM ByteString Void (Either SomeException) Document
forall (m :: * -> *) o.
MonadThrow m =>
ConduitT ByteString o m Document
sinkDoc of
    Left SomeException
e -> FilePath -> Document
forall a. HasCallStack => FilePath -> a
error (FilePath -> Document) -> FilePath -> Document
forall a b. (a -> b) -> a -> b
$ FilePath
"Unexpected exception in parseBSChunks: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ SomeException -> FilePath
forall a. Show a => a -> FilePath
show SomeException
e
    Right Document
x -> Document
x

parseLT :: TL.Text -> X.Document
parseLT :: Text -> Document
parseLT = [Text] -> Document
parseSTChunks ([Text] -> Document) -> (Text -> [Text]) -> Text -> Document
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> [Text]
TL.toChunks

parseSTChunks :: [T.Text] -> X.Document
parseSTChunks :: [Text] -> Document
parseSTChunks [Text]
tss =
  case ConduitT () Void (Either SomeException) Document
-> Either SomeException Document
forall (m :: * -> *) r. Monad m => ConduitT () Void m r -> m r
runConduit (ConduitT () Void (Either SomeException) Document
 -> Either SomeException Document)
-> ConduitT () Void (Either SomeException) Document
-> Either SomeException Document
forall a b. (a -> b) -> a -> b
$ [Text] -> ConduitT () Text (Either SomeException) ()
forall (m :: * -> *) a i. Monad m => [a] -> ConduitT i a m ()
CL.sourceList [Text]
tss ConduitT () Text (Either SomeException) ()
-> ConduitM Text Void (Either SomeException) Document
-> ConduitT () Void (Either SomeException) Document
forall (m :: * -> *) a b c r.
Monad m =>
ConduitM a b m () -> ConduitM b c m r -> ConduitM a c m r
.| ConduitM Text Void (Either SomeException) Document
forall (m :: * -> *) o. MonadThrow m => ConduitT Text o m Document
sinkDocText of
    Left SomeException
e -> FilePath -> Document
forall a. HasCallStack => FilePath -> a
error (FilePath -> Document) -> FilePath -> Document
forall a b. (a -> b) -> a -> b
$ FilePath
"Unexpected exception in parseSTChunks: " FilePath -> FilePath -> FilePath
forall a. [a] -> [a] -> [a]
++ SomeException -> FilePath
forall a. Show a => a -> FilePath
show SomeException
e
    Right Document
x -> Document
x