{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE OverloadedStrings #-} module Main (main) where import Text.HTML.Scalpel.Core import Control.Applicative import Control.Monad (guard) import Data.List (isInfixOf) import System.Exit (ExitCode(..), exitSuccess, exitWith) import Test.HUnit (Test(..), (@=?), (~:), runTestTT, failures) import qualified Text.HTML.TagSoup as TagSoup import qualified Text.Regex.TDFA main :: IO () main = do n <- runTestTT (TestList [scrapeTests]) exit $ failures n exit :: Int -> IO () exit 0 = exitSuccess exit n = exitWith $ ExitFailure n re :: String -> Text.Regex.TDFA.Regex re = Text.Regex.TDFA.makeRegex scrapeTests = "scrapeTests" ~: TestList [ scrapeTest "htmls should extract matching tag" "foo" (Just ["foo"]) (htmls ("a" @: [])) , scrapeTest "htmls should ignore non-matching tag" "foobar" (Just ["foo", "bar"]) (htmls ("a" @: [])) , scrapeTest "htmls should extract matching tag when it is nested" "foo" (Just ["foo"]) (htmls ("a" @: [])) , scrapeTest "htmls should extract each matching tag even if it is nested" "foo" (Just ["foo", "foo"]) (htmls ("a" @: [])) , scrapeTest "htmls with no matching nodes should result in an empty list" "foo" (Just []) (htmls ("b" @: [])) , scrapeTest "unclosed tags should be treated as immediately closed" "foo" (Just [""]) (htmls ("a" @: [])) , scrapeTest "scraping should obey attribute predicates" "foobar" (Just ["bar"]) (htmls ("a" @: ["key" @= "value"])) , scrapeTest "selectors using // should match the deepest node" "foo" (Just ["foo"]) (htmls ("a" // "b" @: [] // "c")) , scrapeTest "selectors using // should skip over irrelevant nodes" "foobarbaz" (Just ["foo", "bar"]) (htmls ("a" // "b")) , scrapeTest "hasClass should match tags with multiple classes" "foo" (Just ["foo"]) (htmls ("a" @: [hasClass "a"])) , scrapeTest "hasClass should not match tags without the specified class" "foo" (Just []) (htmls ("a" @: [hasClass "c"])) , scrapeTest "notP should negate attribute predicates" "foobarbaz" (Just ["foo", "baz"]) (texts ("a" @: [notP $ hasClass "a"])) , scrapeTest "@=~ should match via regular expressions" "foo" (Just ["foo"]) (htmls ("a" @: ["key" @=~ re "va(foo|bar|lu)e"])) , scrapeTest "AnyAttribute should match any attribute key" "foobar" (Just ["foo", "bar"]) (htmls ("a" @: [AnyAttribute @= "value"])) , scrapeTest "AnyAttribute should not match any attribute value" "foobar" (Just ["bar"]) (htmls ("a" @: [AnyAttribute @= "value"])) , scrapeTest "AnyTag should match any tag with the corresponding attributes" "foobar" (Just ["foo", "bar"]) (htmls (AnyTag @: [AnyAttribute @= "value"])) , scrapeTest "AnyTag should not match tags without the corresponding attributes" "foobar" (Just ["bar"]) (htmls (AnyTag @: [AnyAttribute @= "value"])) , scrapeTest "Custom predicates" "123" (Just ["2", "3"]) (htmls (AnyTag @: [match (==)])) , scrapeTest "text should extract inner text from the first matching tag" "foo" (Just "foo") (text "a") , scrapeTest "text should extract inner text from only the first matching tag" "foobar" (Just "foo") (text "a") , scrapeTest "texts should extract inner text from all matching tags" "foobar" (Just ["foo", "bar"]) (texts "a") , scrapeTest "fmap should work as expected" "foobar" (Just [True, False]) (map (== "foo") <$> texts "a") , scrapeTest "attr extract matching attribute value" "" (Just "foo") (attr "key" "a") , scrapeTest "attr extract matching attribute value with complex predicates" "" (Just "baz") (attr "key2" $ "a" @: ["key1" @= "bar"]) , scrapeTest "chroot should limit context to just selected node" "foobar" (Just ["foo"]) (chroot "a" $ texts "b") , scrapeTest "chroots should work for all matching nodes" "foobar" (Just ["foo", "bar"]) (chroots "a" $ text "b") , scrapeTest "<|> should return first match if valid" "foobar" (Just "foo") (text ("a" // "b") <|> text ("a" // "c")) , scrapeTest "<|> should return second match if valid" "foobar" (Just "bar") (text ("a" // "d") <|> text ("a" // "c")) , scrapeTest "Unclosed tags should be treated as immediately closed" "" (Just "foobar") (attr "src" "img") , scrapeTest "scraping should work for self-closing tags" "" (Just "foobar") (attr "src" "img") , scrapeTest "lower case selectors should match any case tag" "foobar" (Just ["foo", "bar"]) (texts "a") , scrapeTest "upper case selectors should match any case tag" "foobar" (Just ["foo", "bar"]) (texts "A") , scrapeTest "attribute key matching should be case-insensitive" "foo" (Just ["foo"]) (texts $ "A" @: ["b" @= "C"]) , scrapeTest "attribute value matching should be case-sensitive" "foo" (Just []) (texts $ "A" @: ["b" @= "c"]) , scrapeTest "notP should invert attribute value matching" "foobarbaz" (Just ["foo", "baz"]) (texts ("a" @: [notP $ "b" @= "C"])) , scrapeTest "html should work when matching the root node" "foo" (Just "foo") (html "a") , scrapeTest "html should work when matching a nested node" "
" (Just "
  • 1
  • ") (html "li") , scrapeTest "html should work when matching a node with no inner text" "
    " (Just "
    ") (html "div") , scrapeTest "htmls should return html matching root nodes" "foobar" (Just ["foo","bar"]) (htmls "a") , scrapeTest "htmls should return html matching nested nodes" "
    " (Just ["
  • 1
  • ", "
  • 2
  • "]) (htmls "li") , scrapeTest "htmls should return html matching empty nested nodes" "
    " (Just ["
    "]) (htmls "div") , scrapeTest "innerHTML should exclude root tags" "123" (Just "123") (innerHTML anySelector) , scrapeTest "innerHTML of a self closed tag should be the empty string" "" (Just "") (innerHTML anySelector) , scrapeTest "innerHTML should match root nodes" "foobar" (Just ["foo","bar"]) (innerHTMLs "a") , scrapeTest "guard should stop matches" "foobarbaz" (Just "bar") (chroot "a" $ do t <- text anySelector guard ("b" `isInfixOf` t) html anySelector) , scrapeTest "// should force a decent before matching" "
    inner text
    " (Just ["inner"]) (attrs "id" ("div" // "div")) , scrapeTest "div // div should match div/div/div twice" "
    " (Just ["b", "c"]) (attrs "id" ("div" // "div")) , scrapeTest "anySelector should match the root node" "12345" (Just "12345") (text anySelector) , scrapeTest "failing a pattern match should stop a scraper" "1" Nothing $ do "Bad pattern" <- text "a" return "OK" , scrapeTest "passing a pattern match should not stop a scraper" "1" (Just "OK") $ do "1" <- text "a" return "OK" , scrapeTest "position should return the index of the match" "

    A

    B

    C

    " (Just [(0, "A"), (1, "B"), (2, "C")]) (chroots ("article" // "p") $ do index <- position content <- text anySelector return (index, content)) , scrapeTest "position should return the index of most recent match" "

    A

    B

    C

    " (Just [[(0, "A")], [(0, "B"), (1, "C")]]) (chroots "article" $ chroots "p" $ do index <- position content <- text anySelector return (index, content)) , scrapeTest "DFS regression test for #59 (1)" "

    p1

    p2

    p3

    p4

    " (Just ["p1", "p2", "p3", "p4"]) (texts "p") , scrapeTest "DFS regression test for #59 (2)" "123" (Just ["1","2","3"]) (texts "a") , scrapeTest "DFS regression test for #59 (3)" "123" (Just ["1","2","3"]) (texts $ "a" // "b") , scrapeTest "DFS regression test for #59 (4)" "123" (Just ["1","2","3"]) (texts "b") , scrapeTest "atDepth 1 should select immediate children" "12" (Just ["1"]) (texts $ "a" // "b" `atDepth` 1) , scrapeTest "atDepth 2 should select children children" "12" (Just ["2"]) (texts $ "a" // "b" `atDepth` 2) , scrapeTest "atDepth should compose with attribute predicates" "12" (Just ["1"]) (texts $ "a" // "b" @: [hasClass "foo"] `atDepth` 1) -- Depth should handle malformed HTML correctly. Below and are not -- closed in the proper order, but since is nested within both in the -- context of , is still at depth 3. , scrapeTest "atDepth should handle tags closed out of order (full context)" "1" (Just ["1"]) (texts $ "a" // "d" `atDepth` 3) -- However, from the context of , is only at depth 1 because there is -- no closing tag within the tag so the tag is assumed to be -- self-closing. , scrapeTest "atDepth should handle tags closed out of order (partial context)" "2" (Just ["2"]) (texts $ "b" // "d" `atDepth` 1) , scrapeTest "// should handle tags closed out of order" "2" (Just ["2"]) (texts $ "b" // "d") , scrapeTest "// should handle tags closed out of order for the root (1)" "2" (Just ["2"]) (texts $ "b" // "d") , scrapeTest "// should handle tags closed out of order for the root (2)" "2" (Just ["2"]) (texts $ "c" // "d") , scrapeTest "textSelector should select each text node" "1234567" (Just $ map show [1..7]) (texts textSelector) , scrapeTest "anySelector should select text nodes" "1234567" (Just ["1", "2", "3", "456", "7"]) (texts $ anySelector `atDepth` 0) , scrapeTest "atDepth should treat out of focus close tags as immediately closed" "2" (Just ["2"]) (texts $ "a" // "d" `atDepth` 2) , scrapeTest "Applicative sanity checks for SerialScraper" "123" (Just ("1", "2")) (inSerial $ (,) <$> stepNext (text "a") <*> stepNext (text "b")) , scrapeTest "Monad sanity checks for SerialScraper" "123" (Just ("1", "2")) (inSerial $ do a <- stepNext (text "a") b <- stepNext (text "b") return (a, b)) , scrapeTest "stepping off the end of the list without reading should be allowed" "123" (Just ["1", "2", "3", "2" , "1"]) (inSerial $ do a <- stepNext $ text anySelector b <- stepNext $ text anySelector c <- stepNext $ text anySelector d <- stepBack $ text anySelector e <- stepBack $ text anySelector return [a, b, c, d, e]) , scrapeTest "stepping off the end of the list and reading should fail" "123" Nothing (inSerial $ (,,,) <$> stepNext (text anySelector) <*> stepNext (text anySelector) <*> stepNext (text anySelector) <*> stepNext (text anySelector)) , scrapeTest "seeking should skip over nodes" "123" (Just ("2", "3")) (inSerial $ (,) <$> seekNext (text "b") <*> seekNext (text "a")) , scrapeTest "seeking should fail if there is not matching node" "123" Nothing (inSerial $ seekNext $ text "c") , scrapeTest "seeking off the end the zipper should be allowed without reading" "123" (Just ("3", "1")) (inSerial $ (,) <$> seekNext (text "c") <*> seekBack (text "a")) , scrapeTest "Alternative sanity check for SerialScraper" "123" (Just ["1", "bar", "3"]) (inSerial $ many $ stepNext (text $ textSelector `atDepth` 0) <|> stepNext (attr "foo" $ "a" `atDepth` 0)) , scrapeTest "MonadFail sanity check for SerialScraper (passing check)" "1" (Just "OK") (inSerial $ do "1" <- stepNext $ text textSelector return "OK") , scrapeTest "MonadFail sanity check for SerialScraper (failing check)" "1" Nothing (inSerial $ do "mismatch" <- stepNext $ text textSelector return "OK") , scrapeTest "untilNext should stop at first match" "123" (Just ["1", "2"]) (inSerial $ untilNext (matches "b") $ many $ stepNext $ text anySelector) , scrapeTest "untilNext should go till end of the zipper on no match" "123" (Just ["1", "2", "3"]) (inSerial $ untilNext (matches "c") $ many $ stepNext $ text anySelector) , scrapeTest "untilNext should leave the focus at the match" "123" (Just "3") (inSerial $ do untilNext (matches "b") $ many $ stepNext $ text anySelector stepNext $ text "b") , scrapeTest "untilNext should create valid a empty context" "12" (Just "1") (inSerial $ do untilNext (matches "a") $ return () stepNext $ text "a") , scrapeTest "scraping within an empty context should fail" "12" Nothing (inSerial $ do untilNext (matches "a") $ stepNext $ text anySelector stepNext $ text "a") , scrapeTest "untilBack should leave the focus of the new context at the end" "123" (Just ("bar", ["1", "2", "3"], ["2", "1"])) (inSerial $ do as <- many $ seekNext $ text "a" as' <- untilBack (matches "b") $ many $ stepBack $ text "a" b <- stepBack $ attr "foo" "b" return (b, as, as')) , scrapeTest "inSerial in a chroot should visit immediate children" "12" (Just ["1", "2"]) (chroot "parent" $ inSerial $ many $ stepNext $ text anySelector) , scrapeTest "Issue #41 regression test" "

    Here

    Other stuff that matters

    " (Just "Other stuff that matters") (inSerial $ do seekNext $ matches $ "p" @: [hasClass "something"] stepNext $ text "p") , scrapeTest "Issue #45 regression test" (unlines [ "" , "

    title1

    " , "

    title2 1

    " , "

    text 1

    " , "

    text 2

    " , "

    title2 2

    " , "

    text 3

    " , "

    title2 3

    " , "" ]) (Just [ ("title2 1", ["text 1", "text 2"]) , ("title2 2", ["text 3"]) , ("title2 3", []) ]) (chroot "body" $ inSerial $ many $ do title <- seekNext $ text "h2" ps <- untilNext (matches "h2") (many $ do -- New lines between tags count as text nodes, skip over -- these. optional $ stepNext $ matches textSelector stepNext $ text "p") return (title, ps)) ] scrapeTest :: (Eq a, Show a) => String -> String -> Maybe a -> Scraper String a -> Test scrapeTest label html expected scraper = label' ~: expected @=? actual where label' = label ++ ": scrape (" ++ html ++ ")" actual = scrape scraper (TagSoup.parseTags html)