{-# Language OverloadedStrings #-} -- todo [release] date? -- todo [release] fallo diventare script, asptta che cabal 2.4 vaa in -- deb stable. module Main where import System.Environment import Network.HTTP.Simple import Text.HTML.Scalpel.Core import Text.RSS.Syntax import Text.RSS.Export import qualified Data.Text as T import qualified Data.Text.Lazy.IO as TLI import qualified Data.Text.Encoding as TE import qualified Data.String as DS import qualified Text.URI as U import qualified Data.List as L main :: IO () main = -- get args map T.pack <$> getArgs >>= \as -> -- malformed input if null as || length as > 1 || head as `elem` ["--help", "-h"] then putStrLn helps else -- scrape let url = head as in scrapeGet url page >>= \p -> printRSS url p where scrapeGet :: URL -> Scraper T.Text a -> IO a scrapeGet url s = getURLBody url >>= \b -> let r = maybe (error errs) id (scrapeStringLike b s) in return r ----------- -- TYPES -- ----------- data Page = Page Title [Article] deriving (Show, Eq) type Title = T.Text data Article = Article Title URL Author deriving (Show, Eq) type Author = T.Text type URL = T.Text ------------ -- SCRAPE -- ------------ page :: Scraper T.Text Page page = Page <$> text "title" <*> chroot ("div" @: ["id" @= "gs_res_ccl_mid"]) entries entries :: Scraper T.Text [Article] entries = chroots ("div" @: [hasClass "gs_ri"]) paper where paper = Article <$> text ("h3" @: [hasClass "gs_rt"]) <*> (cleanGBooks <$> attr "href" "a") <*> text ("div" @: [hasClass "gs_a"]) buildRSS :: URL -> Page -> RSS buildRSS u (Page t as) = (nullRSS "rss title" "link") { rssChannel = (nullChannel u "link") { rssItems = map buildItem as }} where buildItem :: Article -> RSSItem buildItem (Article t u a) = (nullItem t) { rssItemLink = Just $ u, rssItemAuthor = Just $ a } ------------- -- GET/PUT -- ------------- getURLBody :: URL -> IO T.Text getURLBody url = getResponseBody <$> httpBS req >>= \b -> return (TE.decodeLatin1 b) where req = DS.fromString (T.unpack url) feedTitle :: URL -> Title feedTitle url = case lookup "q" (U.queryToPairs $ T.unpack url) of Nothing -> "Error: title not found" Just t -> "gscholar-rss: " <> T.pack t -- removes 'ots' and 'sig' from a google books url cleanGBooks :: URL -> URL cleanGBooks url | condA = let qry = U.uriQueryItems uri fqr = filter filtFun qry u' = uri { U.uriQuery = Just (U.pairsToQuery fqr)} in T.pack . show $ u' | otherwise = url where uri = maybe (error "gclean: misparse") id (U.parseURI $ T.unpack url) condA = let rn = maybe (error "regname") id (U.uriRegName uri) in L.isPrefixOf "books.google" rn filtFun :: (String, String) -> Bool filtFun (l, _) = not (elem l ["ots", "sig"]) printRSS :: URL -> Page -> IO () printRSS url s = case textRSS . buildRSS (feedTitle url) $ s of Nothing -> error errs Just s -> TLI.putStrLn s helps :: String helps = unlines ["invoke as: ", "gscholar-rss "] errs :: String errs = "Could not parse, please report to fa-ml@ariis.it"