{-# LANGUAGE OverloadedStrings #-} import Test.HUnit hiding (Test) import Test.Hspec import Test.Hspec.QuickCheck import Data.ByteString.Lazy.Char8 () import qualified Text.HTML.DOM as H import qualified Text.XML as X import qualified Data.Map as Map import qualified Data.Text as T import Control.Exception (evaluate) import Control.DeepSeq (($!!)) import Control.Monad (void) main :: IO () main = hspec $ do describe "parses" $ do it "well-formed document" $ X.parseLBS_ X.def "baz" @=? H.parseLBS "baz" it "adds missing close tags" $ X.parseLBS_ X.def "baz" @=? H.parseLBS "baz" it "void tags" $ X.parseLBS_ X.def "foo" @=? H.parseLBS "foo" it "xml entities" $ X.parseLBS_ X.def "baz>" @=? H.parseLBS "baz>" it "html entities" $ X.parseLBS_ X.def "baz " @=? H.parseLBS "baz " it "decimal entities" $ X.parseLBS_ X.def "baz " @=? H.parseLBS "baz " it "hex entities" $ X.parseLBS_ X.def "bazŠ" @=? H.parseLBS "bazŠ" it "invalid entities" $ X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;" it "multiple root elements" $ X.parseLBS_ X.def "baz&foobar;" @=? H.parseLBS "baz&foobar;" it "doesn't strip whitespace" $ X.parseLBS_ X.def " hello" @=? H.parseLBS " hello" it "split code-points" $ X.parseLBS_ X.def " " @=? H.parseBSChunks ["\xc2", "\xa0"] it "latin1 codes" $ X.parseText_ X.def "\232" @=? H.parseSTChunks ["\232"] it "latin1 codes strict vs lazy" $ H.parseLT "\232" @=? H.parseSTChunks ["\232"] describe "HTML parsing" $ do it "XHTML" $ let html = "foo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "XHTML with doctype and \n\nfoo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "HTML" $ let html = "foo

Hello World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "html" Map.empty [ X.NodeElement $ X.Element "head" Map.empty [ X.NodeElement $ X.Element "title" Map.empty [X.NodeContent "foo"] ] , X.NodeElement $ X.Element "body" Map.empty [ X.NodeElement $ X.Element "br" Map.empty [] , X.NodeElement $ X.Element "p" Map.empty [X.NodeContent "Hello World"] ] ] in H.parseLBS html @?= doc it "doesn't double unescape" $ let html = "

Hello > World

" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "p" Map.empty [ X.NodeContent "Hello > World" ] in H.parseLBS html @?= doc it "handles entities in attributes" $ let html = "
" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "br" (Map.singleton "title" "Mac & Cheese") [] in H.parseLBS html @?= doc it "doesn't double escape entities in attributes" $ let html = "
" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "br" (Map.singleton "title" "Mac & Cheese") [] in H.parseLBS html @?= doc describe "script tags" $ do it "ignores funny characters" $ let html = "" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "script" Map.empty [X.NodeContent "hello > world"] in H.parseLBS html @?= doc {- Would be nice... doesn't work with tagstream-conduit original code. Not even sure if the HTML5 parser spec discusses this case. it "ignores inside string" $ let html = "\" world" doc = X.Document (X.Prologue [] Nothing []) root [] root = X.Element "script" Map.empty [X.NodeContent "hello \"\" world"] in H.parseLBS html @?= doc -} it "unterminated" $ let html = "