module Text.HandsomeSoup (openUrl, fromUrl, parseHtml, (!), css) where import Text.XML.HXT.Core import Network.HTTP import Network.URI import Data.Tree.NTree.TypeDefs import Control.Monad.Maybe import Control.Monad.Trans import Data.Maybe import Text.Parsec import qualified Data.Map as M import Data.Monoid (mconcat) import qualified Data.Functor.Identity as I import qualified Debug.Trace as D import Data.List import Control.Monad import Text.CSS.Parser hiding (css) import Text.XML.HXT.Arrow.ReadDocument import Text.XML.HXT.HTTP -- | Helper function for getting page content. Example: -- -- > contents <- runMaybeT $ openUrl "http://foo.com" openUrl :: String -> MaybeT IO String openUrl url = case parseURI url of Nothing -> fail "couldn't parse url" Just u -> liftIO (getResponseBody =<< simpleHTTP (mkRequest GET u)) -- | Given a url, returns a document. Example: -- -- > doc = fromUrl "http://foo.com" -- > doc = fromUrl "tests/test.html" fromUrl :: String -> IOSArrow XmlTree (NTree XNode) fromUrl url = readDocument [withValidate no, withInputEncoding isoLatin1, withParseByMimeType yes, withHTTP [], withWarnings no] url -- | Given a string, parses it and returns a document. Example: -- -- > doc = parseHtml "

hello!

" parseHtml :: String -> IOSArrow XmlTree (NTree XNode) parseHtml = readString [withParseHTML yes, withWarnings no] -- | Shortcut for getting attributes. Example: -- -- > doc >>> css "a" ! "href" (!) :: ArrowXml cat => cat a XmlTree -> String -> cat a String (!) a str = a >>> getAttrValue str -- | A css selector for getting elements from a document. Example: -- -- > doc >>> css "#menu li" css :: ArrowXml a => [Char] -> a (NTree XNode) (NTree XNode) css tag = case (parse selector "" tag) of Left err -> D.trace (show err) this Right x -> fromSelectors x -- | Used internally. works on a selector (i.e a list of simple selectors) fromSelectors sel@(s:selectors) = foldl (\acc selector -> acc <+> _fromSelectors selector) (_fromSelectors s) selectors -- | Used internally. works on simple selectors and their combinators _fromSelectors (s:selectors) = foldl (\acc selector -> make acc selector) (make this s) selectors where make acc sel@(Selector name attrs pseudo) | name == "*" = acc >>> ((multi this >>> makeAttrs attrs) >>. makePseudos pseudo) | otherwise = acc >>> ((multi $ hasName name >>> makeAttrs attrs) >>. makePseudos pseudo) make acc Space = acc >>> getChildren make acc ChildOf = acc >>> getChildren >>> processChildren none makeAttrs (a:attrs) = foldl (\acc attr -> acc >>> makeAttr attr) (makeAttr a) attrs makeAttrs [] = this makeAttr (name, "") = hasAttr name makeAttr (name, '~':value) = hasAttrValue name (elem value . words) makeAttr (name, '|':value) = hasAttrValue name (headMatch value) makeAttr (name, value) = hasAttrValue name (==value) makePseudos (p:pseudos) = foldl (\acc pseudo -> acc >>> makePseudo pseudo) (makePseudo p) pseudos makePseudos [] = id makePseudo "first-child" = take 1 -- | Used internally to match attribute selectors like @ [att|=val] @. -- From: http://www.w3.org/TR/CSS2/selector.html -- "Represents an element with the att attribute, its value either being exactly "val" or beginning with "val" immediately followed by '-'". headMatch value attrValue = value == attrValue || value `isPrefixOf` attrValue where first = head . words $ attrValue