module Text.HandsomeSoup (openUrl, fromUrl, parseHtml, (!), css) where
import Text.XML.HXT.Core
import Network.HTTP
import Network.URI
import Data.Tree.NTree.TypeDefs
import Control.Monad.Maybe
import Control.Monad.Trans
import Data.Maybe
import Text.Parsec
import qualified Data.Map as M
import Data.Monoid (mconcat)
import qualified Data.Functor.Identity as I
import qualified Debug.Trace as D
import Data.List
import Control.Monad
import Text.CSS.Parser hiding (css)
import Text.XML.HXT.Arrow.ReadDocument
import Text.XML.HXT.HTTP
-- | Helper function for getting page content. Example:
--
-- > contents <- runMaybeT $ openUrl "http://foo.com"
openUrl :: String -> MaybeT IO String
openUrl url = case parseURI url of
Nothing -> fail "couldn't parse url"
Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u))
-- | Given a url, returns a document. Example:
--
-- > doc = fromUrl "http://foo.com"
-- > doc = fromUrl "tests/test.html"
fromUrl :: String -> IOSArrow XmlTree (NTree XNode)
fromUrl url = readDocument [withValidate no,
withInputEncoding isoLatin1,
withParseByMimeType yes,
withHTTP [],
withWarnings no] url
-- | Given a string, parses it and returns a document. Example:
--
-- > doc = parseHtml "
hello!
"
parseHtml :: String -> IOSArrow XmlTree (NTree XNode)
parseHtml = readString [withParseHTML yes, withWarnings no]
-- | Shortcut for getting attributes. Example:
--
-- > doc >>> css "a" ! "href"
(!) :: ArrowXml cat => cat a XmlTree -> String -> cat a String
(!) a str = a >>> getAttrValue str
-- | A css selector for getting elements from a document. Example:
--
-- > doc >>> css "#menu li"
css :: ArrowXml a => [Char] -> a (NTree XNode) (NTree XNode)
css tag = case (parse selector "" tag) of
Left err -> D.trace (show err) this
Right x -> fromSelectors x
-- | Used internally. works on a selector (i.e a list of simple selectors)
fromSelectors sel@(s:selectors) = foldl (\acc selector -> acc <+> _fromSelectors selector) (_fromSelectors s) selectors
-- | Used internally. works on simple selectors and their combinators
_fromSelectors (s:selectors) = foldl (\acc selector -> make acc selector) (make this s) selectors
where
make acc sel@(Selector name attrs pseudo)
| name == "*" = acc >>> ((multi this >>> makeAttrs attrs) >>. makePseudos pseudo)
| otherwise = acc >>> ((multi $ hasName name >>> makeAttrs attrs) >>. makePseudos pseudo)
make acc Space = acc >>> getChildren
make acc ChildOf = acc >>> getChildren >>> processChildren none
makeAttrs (a:attrs) = foldl (\acc attr -> acc >>> makeAttr attr) (makeAttr a) attrs
makeAttrs [] = this
makeAttr (name, "") = hasAttr name
makeAttr (name, '~':value) = hasAttrValue name (elem value . words)
makeAttr (name, '|':value) = hasAttrValue name (headMatch value)
makeAttr (name, value) = hasAttrValue name (==value)
makePseudos (p:pseudos) = foldl (\acc pseudo -> acc >>> makePseudo pseudo) (makePseudo p) pseudos
makePseudos [] = id
makePseudo "first-child" = take 1
-- | Used internally to match attribute selectors like @ [att|=val] @.
-- From: http://www.w3.org/TR/CSS2/selector.html
-- "Represents an element with the att attribute, its value either being exactly "val" or beginning with "val" immediately followed by '-'".
headMatch value attrValue = value == attrValue || value `isPrefixOf` attrValue
where first = head . words $ attrValue