module Network.Wikipedia ( isArticleURL , WikiArticle(..) , articleURL2Title , getArticleLinks , getArticleLinksAbs , getArticleImages , sanitizeArticle , sanitizeFileName --, fetchPrintArticle --, fetchRawArticle , fetchArticle) where import Network.URL import Text.Regex.Posix import Network.HTTP import Text.HTML.TagSoup import System.FilePath import Data.List (nub) import Data.Maybe (mapMaybe, fromJust) import Data.Char import qualified Data.ByteString.Lazy.Char8 as BC import qualified Codec.Compression.GZip as GZip data WikiArticle = WikiArticleHTML { waTitle :: String, waContent :: String } | WikiArticleSRC { waTitle :: String, waContent :: String } deriving (Show, Ord, Eq) sanitizeFileName :: FilePath -> FilePath sanitizeFileName cs = map removeNonAscii $ makeValid cs where removeNonAscii z | isAscii z = z | otherwise = '_' isArticleURL :: URL -> Bool isArticleURL (URL (Absolute (Host (HTTP False) xs Nothing)) ph []) = (xs =~ ".*[.]wikipedia.org") && (ph =~ "wiki/.*") isArticleURL _ = False articleURL2Title :: URL -> String articleURL2Title x = sanitizeFileName (takeFileName (url_path x)) articleRelURL2Title :: String -> String articleRelURL2Title ('#':xs) = '#':xs articleRelURL2Title x = case importURL ("http://en.wikipedia.org/"++x) of Nothing -> error $ "Invalid article relative path: "++x Just u -> articleURL2Title u isArticleImgURL :: String -> Bool isArticleImgURL cs = cs =~ "^http://upload.wikimedia.org/.*" getArticleImages :: WikiArticle -> [URL] getArticleImages x = let inTags = parseTags (waContent x) imgTags = filter (isTagOpenName "img") inTags imgSrcs = filter (isArticleImgURL) (map (fromAttrib "src") imgTags) in imgSrcs `seq` mapMaybe importURL imgSrcs getArticleLinks :: WikiArticle -> [URL] getArticleLinks xs = let inTags = parseTags (waContent xs) aTags = filter (isTagOpenName "a") inTags hrefs = map (fromAttrib "href") aTags in mapMaybe importURL $ nub $ filter (=~ "^/wiki/[^:/]+$") hrefs getArticleLinksAbs :: String -> WikiArticle -> [URL] getArticleLinksAbs domain xs = map toAbsURL ys where ys = getArticleLinks xs toAbsURL (URL _ path params) = (URL (Absolute (Host (HTTP False) domain Nothing)) path params) -- http://en.wikipedia.org/w/index.php?title=Computer&printable=yes {- articleURL2PrintURL :: URL -> URL articleURL2PrintURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("printable","yes")] | otherwise = xs -- http://en.wikipedia.org/wiki/index.php?title=Psychology&action=raw articleURL2RawURL :: URL -> URL articleURL2RawURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("action","raw")] | otherwise = xs -} sanitizeArticle :: [String] -> WikiArticle -> WikiArticle sanitizeArticle alnk xs = let inTags = parseTags (waContent xs) outTags = processTags alnk $ filterAllTags tags4Filter inTags tags4Filter = ["link", "script", "sup" ] in WikiArticleHTML (waTitle xs) (renderTags outTags) processTags alnk xs = procHrefTags alnk $ procImgTags $ map processAttrs xs where processAttrs (TagOpen "div" _) = TagText "" processAttrs (TagClose "div") = TagText "" processAttrs t@(TagOpen s ys) | null s = t | head s == '!' = t | otherwise = TagOpen s (removeStyleAttr $ removeEmptyAttr ys) processAttrs t = t removeEmptyAttr = filter (not . null . snd) removeStyleAttr = filter (\x -> (fst x) `notElem` ["style", "class"]) filterAllTags tgs xs = filter (not . isTagComment) $ foldr (filterTags) xs tgs isTagComment (TagComment _) = True isTagComment _ = False filterTags _ [] = [] filterTags tn (x:xs) | isTagOpenName tn x = filterTags tn $ dropWhile (not . isTagCloseName tn) xs | isTagCloseName tn x = filterTags tn xs | otherwise = x:filterTags tn xs procHrefTags _ [] = [] procHrefTags alnk (x:xs) | isTagOpenName "a" x = let relP = fromAttrib "href" x title = articleRelURL2Title relP imgOk = (TagOpen "a" [("href",title)]):(procHrefTags alnk xs) isInBook = elem (title) alnk || ((length title) > 0 && (head title == '#')) imgNok = let pre = takeWhile (not . isTagCloseName "a") xs post = tail $ dropWhile (not . isTagCloseName "a") xs in pre ++ (procHrefTags alnk post) in if isInBook then imgOk else imgNok | otherwise = x:(procHrefTags alnk xs) procImgTags [] = [] procImgTags (x:xs) | isTagOpenName "img" x = let absP = fromAttrib "src" x relP = "img" (sanitizeFileName $ takeFileName $ url_path $ fromJust $ importURL absP) imgOk = TagOpen "img" [("src",relP)] imgNok = tail $ dropWhile (not . isTagCloseName "img") xs in if isArticleImgURL absP then imgOk:(procImgTags xs) else procImgTags imgNok | otherwise = x:procImgTags xs {- fetchPrintArticle :: URL -> IO WikiArticle fetchPrintArticle xs = do contents <- getResponseBody =<< simpleHTTP (getRequest (exportURL (articleURL2PrintURL xs))) return (WikiArticleHTML (articleURL2Title xs) contents) fetchRawArticle :: URL -> IO WikiArticle fetchRawArticle xs = do contents <- getResponseBody =<< simpleHTTP (getRequest (exportURL (articleURL2RawURL xs))) return (WikiArticleSRC (articleURL2Title xs) contents) -} fetchArticle :: URL -> IO WikiArticle fetchArticle xs = do rsp <- Network.HTTP.simpleHTTP (insertHeader HdrAcceptEncoding "identity" $ getRequest (exportURL xs)) -- rsp <- Network.HTTP.simpleHTTP (getRequest (exportURL xs)) contents <- (getResponseBody rsp) let header = either (\_ -> []) rspHeaders rsp let fincts = case lookupHeader HdrContentEncoding header of Nothing -> contents Just mm -> if mm == "gzip" then BC.unpack (GZip.decompress (BC.pack contents)) else contents return (WikiArticleHTML (articleURL2Title xs) fincts)