-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/

-- | A Haskell library to scrape and crawl web-pages
--   
--   A Haskell library to scrape and crawl web-pages
@package hScraper
@version 0.1.0.0


-- | Module for tidying malformed HTML using libtidy (see README).
module HScraper.Tidy

-- | Takes malformed HTML and returns corrected HTML if it can be
--   corrected. Output is empty if it cannot be corrected.
tidy :: Text -> IO Text


-- | Module for fetching different requests over the network (a wrapper
--   around http-conduit).
module HScraper.Network

-- | A simple wrapper around simpleHttp so that it goes along with the
--   other functions.
fetchResponse :: String -> IO Text

-- | Encodes a list of (key, value) pairs into a parameter string to pass
--   with GET and POST requests.
parseParams :: [(String, String)] -> String

-- | Takes a Request and a Manager and returns the response. Useful when
--   querying with a modified request.
fetchRequestWith :: Request -> Manager -> IO Text

-- | Minimal GET request; modify it to add a proxy, cookies, user-agent,
--   etc., then fetch it using fetchRequestWith (a sketch appears at the
--   end of this file).
defaultGETRequest :: String -> [(String, String)] -> IO Request

-- | Fetches a request built with defaultGETRequest.
fetchGETRequest :: String -> [(String, String)] -> IO Text

-- | Minimal POST request; modify it to add a proxy, cookies, user-agent,
--   etc., then fetch it using fetchRequestWith.
defaultPOSTRequest :: String -> [(String, String)] -> IO Request

-- | Fetches a request built with defaultPOSTRequest.
fetchPOSTRequest :: String -> [(String, String)] -> IO Text

module HScraper.Types
data NodeType
Text :: Text -> NodeType
Element :: Text -> AttrList -> NodeType
data NTree a
NTree :: a -> [NTree a] -> NTree a
NullTree :: NTree a
type AttrList = [(Text, Text)]
type HTMLTree = NTree NodeType
toLeaf :: Text -> HTMLTree
toTree :: Text -> AttrList -> [HTMLTree] -> HTMLTree
type Name = Text
type Class = Maybe Text
type ID = Maybe Text
data NodeQuery
NodeQuery :: Name -> Class -> ID -> NodeQuery
type Query = [NodeQuery]
instance GHC.Read.Read HScraper.Types.NodeQuery
instance GHC.Show.Show HScraper.Types.NodeQuery
instance GHC.Show.Show HScraper.Types.NodeType
instance GHC.Show.Show a => GHC.Show.Show (HScraper.Types.NTree a)
instance GHC.Classes.Eq HScraper.Types.NodeType
instance GHC.Classes.Eq a => GHC.Classes.Eq (HScraper.Types.NTree a)

-- | A simple query format for querying the HTMLTree.
--   
--   The syntax is as follows:
--   "nodeName[Class(optional)]{ID(optional)} >
--   nodeName[Class(optional)]{ID(optional)}"
--   
--   e.g. "div{id1} > span[class]{id_h1} > a"
module HScraper.Query
parseQuery :: String -> Either ParseError Query

-- | Compares a NodeQuery with a NodeType.
(~=~) :: NodeQuery -> NodeType -> Bool

-- | Applies (>=>) considering each node as root and combines the
--   results.
(|>>) :: HTMLTree -> Query -> [HTMLTree]

-- | Returns the list of nodes matching the query, with the root matching
--   the first NodeQuery and subsequent children satisfying the subsequent
--   NodeQueries continuously.
(>=>) :: HTMLTree -> Query -> [HTMLTree]

-- | Get the combined text of the immediate children of the current node.
getText :: HTMLTree -> Text

-- | Get the entire text contained in the subtree.
getEntireText :: HTMLTree -> Text

-- | Get the value of an attribute of a node.
getAttribute :: String -> HTMLTree -> Maybe String


-- | Module for parsing HTML.
module HScraper.HTMLparser
parseHtml :: Text -> Either ParseError HTMLTree

module HScraper.Show

-- | Takes an HTMLTree and prints it in a neat manner.
--   
--   Expected output:
--   
--   |html
--          |head
--          |body
--                 YOLO
--   
--   
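--   A minimal usage sketch (the URL is only illustrative; parseSite comes
--   from HScraper.Main):
--   
--   showTree (parseSite "http://example.com")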
showTree :: IO HTMLTree -> IO ()


-- | Module for various convenience functions and error-free wrappers of
--   parsers so that they can be used for batches of URLs.
module HScraper.Main

-- | Tries to parse HTML from a file. Returns NullTree if parsing fails.
getFromFile :: FilePath -> IO HTMLTree

-- | Like parseHtml, but returns NullTree if parsing fails.
getParsedHTML :: Text -> HTMLTree

-- | Takes a String and tries to parse it as a Query. Returns an empty
--   query if parsing fails.
getParsedQuery :: String -> Query

-- | Takes a URL and returns the parsed HTMLTree.
parseSite :: String -> IO HTMLTree


-- | A library to parse, crawl and scrape webpages.
--   
--   An example:
--   
--   import HScraper
--   
--   main :: IO ()
--   main = do
--     html <- parseSite "https://kat.cr/leopard-raws-taimadou-gakuen-35-shiken-shoutai-05-raw-sun-1280x720-x264-aac-mp4-t11528616.html/"
--     let q1 = getParsedQuery "a[movieCover]"
--     print $ html |>> q1
--     let q2 = getParsedQuery "a"
--     let ans = html |>> q2
--     mapM_ (print . getAttribute "href" ) ans -- get all hyperlinks.
--   
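--   A second sketch, extracting text instead of attributes. The URL and
--   query are only illustrative, and it assumes HScraper re-exports
--   getEntireText like the other query helpers used above:
--   
--   main :: IO ()
--   main = do
--     html <- parseSite "http://example.com"
--     let q = getParsedQuery "div[article] > p"
--     mapM_ (print . getEntireText) (html |>> q)   -- print the text of each match.
--   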
module HScraper
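
-- A sketch of fetching with a customised request via HScraper.Network. The
-- URL, parameters and header value are illustrative; newManager,
-- tlsManagerSettings and requestHeaders come from http-conduit, which the
-- module wraps:
--
--   {-# LANGUAGE OverloadedStrings #-}
--   import HScraper.Network
--   import Network.HTTP.Conduit (newManager, tlsManagerSettings, requestHeaders)
--   
--   main :: IO ()
--   main = do
--     req <- defaultGETRequest "http://example.com/search" [("q", "haskell")]
--     mgr <- newManager tlsManagerSettings
--     -- replace the request headers before fetching (here, only a User-Agent)
--     body <- fetchRequestWith req { requestHeaders = [("User-Agent", "hScraper")] } mgr
--     print body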