module Network.Wikipedia ( isArticleURL , WikiArticle(..) , articleURL2Title , getArticleLinks , sanitizeArticle --, fetchPrintArticle --, fetchRawArticle , fetchArticle) where import Network.URL import Network.URI import Text.Regex.Posix import Text.Regex.Base import Network.HTTP import Text.HTML.TagSoup import Data.List (nub) import Data.Maybe (catMaybes, mapMaybe, fromJust) data WikiArticle = WikiArticleHTML { waTitle :: String, waContent :: String } | WikiArticleSRC { waTitle :: String, waContent :: String } deriving (Show, Ord, Eq) isArticleURL :: URL -> Bool isArticleURL (URL (Absolute (Host (HTTP False) xs Nothing)) ph []) = (xs =~ ".*en[.]wikipedia.org$") && (ph =~ "wiki/[^:/]+$" ) isArticleURL _ = False articleURL2Title :: URL -> String articleURL2Title x | isArticleURL x = filter (/= '%') $ urlEncode $ tail $ dropWhile (/='/') (url_path x) | otherwise = "" getArticleLinks :: WikiArticle -> [URL] getArticleLinks xs = let inTags = parseTags (waContent xs) aTags = filter (isTagOpenName "a") inTags hrefs = map (fromAttrib "href") aTags in mapMaybe importURL $ nub $ filter (=~ "^/wiki/[^:/]+$") hrefs -- http://en.wikipedia.org/w/index.php?title=Computer&printable=yes articleURL2PrintURL :: URL -> URL articleURL2PrintURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("printable","yes")] | otherwise = xs -- http://en.wikipedia.org/wiki/index.php?title=Psychology&action=raw articleURL2RawURL :: URL -> URL articleURL2RawURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("action","raw")] | otherwise = xs sanitizeArticle :: WikiArticle -> WikiArticle sanitizeArticle xs = let inTags = parseTags (waContent xs) outTags = processTags $ filterTags "img" $ filterTags "div" $ filterTags "link" $ filterTags "script" inTags in WikiArticleHTML (waTitle xs) (renderTags outTags) processTags xs = map removeEmptyAttr xs where removeEmptyAttr t@(TagOpen s xs) | null s = t | head s == '!' = t | otherwise = TagOpen s (filter (not . null . snd) xs) removeEmptyAttr t = t filterTags tn [] = [] filterTags tn (x:xs) | isTagOpenName tn x = filterTags tn $ dropWhile (not . isTagCloseName tn) xs | isTagCloseName tn x = filterTags tn xs | otherwise = x:filterTags tn xs {- fetchPrintArticle :: URL -> IO WikiArticle fetchPrintArticle xs = do contents <- getResponseBody =<< simpleHTTP (getRequest (exportURL (articleURL2PrintURL xs))) return (WikiArticleHTML (articleURL2Title xs) contents) fetchRawArticle :: URL -> IO WikiArticle fetchRawArticle xs = do contents <- getResponseBody =<< simpleHTTP (getRequest (exportURL (articleURL2RawURL xs))) return (WikiArticleSRC (articleURL2Title xs) contents) -} fetchArticle :: URL -> IO WikiArticle fetchArticle xs = do rsp <- Network.HTTP.simpleHTTP (getRequest (exportURL xs)) contents <- (getResponseBody rsp) return (WikiArticleHTML (articleURL2Title xs) contents)