module Network.Wikipedia ( isArticleURL
, WikiArticle(..)
, articleURL2Title
, getArticleLinks
, getArticleLinksAbs
, getArticleImages
, sanitizeArticle
, sanitizeFileName
, fetchArticle)
where
import Network.URL
import Text.Regex.Posix
import Network.HTTP
import Text.HTML.TagSoup
import System.FilePath
import Data.List (nub)
import Data.Maybe (mapMaybe, fromJust)
data WikiArticle = WikiArticleHTML { waTitle :: String, waContent :: String }
| WikiArticleSRC { waTitle :: String, waContent :: String } deriving (Show, Ord, Eq)
sanitizeFileName :: FilePath -> FilePath
sanitizeFileName cs = map (unPercent) $ urlEncode cs
where
unPercent c = if (c == '%' || c == ':') then 'X' else c
isArticleURL :: URL -> Bool
isArticleURL (URL (Absolute (Host (HTTP False) xs Nothing)) ph []) = (xs =~ ".*en[.]wikipedia.org$") && (ph =~ "wiki/[^/]+$" )
isArticleURL _ = False
articleURL2Title :: URL -> String
articleURL2Title x | isArticleURL x = sanitizeFileName (takeFileName (url_path x))
| otherwise = ""
articleRelURL2Title :: String -> String
articleRelURL2Title ('#':xs) = '#':xs
articleRelURL2Title x = case importURL ("http://en.wikipedia.org"++x) of
Nothing -> ""
Just u -> articleURL2Title u
isArticleImgURL :: String -> Bool
isArticleImgURL cs = cs =~ "^http://upload.wikimedia.org/.*"
getArticleImages :: WikiArticle -> [URL]
getArticleImages x = let inTags = parseTags (waContent x)
imgTags = filter (isTagOpenName "img") inTags
imgSrcs = filter (isArticleImgURL) (map (fromAttrib "src") imgTags)
in imgSrcs `seq` mapMaybe importURL imgSrcs
getArticleLinks :: WikiArticle -> [URL]
getArticleLinks xs = let inTags = parseTags (waContent xs)
aTags = filter (isTagOpenName "a") inTags
hrefs = map (fromAttrib "href") aTags
in mapMaybe importURL $ nub $ filter (=~ "^/wiki/[^:/]+$") hrefs
getArticleLinksAbs :: WikiArticle -> [URL]
getArticleLinksAbs xs = let ys = getArticleLinks xs
toAbsURL (URL _ path params) = (URL (Absolute (Host (HTTP False) "en.wikipedia.org" Nothing)) path params)
in map toAbsURL ys
sanitizeArticle :: [String] -> WikiArticle -> WikiArticle
sanitizeArticle alnk xs = let inTags = parseTags (waContent xs)
outTags = processTags alnk $ filterAllTags tags4Filter inTags
tags4Filter = ["link", "script", "sup" ]
in WikiArticleHTML (waTitle xs) (renderTags outTags)
processTags alnk xs = procHrefTags alnk $ procImgTags $ map processAttrs xs
where
processAttrs t@(TagOpen "div" _) = TagText ""
processAttrs t@(TagClose "div") = TagText ""
processAttrs t@(TagOpen s ys) | null s = t
| head s == '!' = t
| otherwise = TagOpen s (removeStyleAttr $ removeEmptyAttr ys)
processAttrs t = t
removeEmptyAttr xs = filter (not . null . snd) xs
removeStyleAttr xs = filter (\x -> (fst x) `notElem` ["style", "class"]) xs
filterAllTags tgs xs = filter (not . isTagComment) $ foldr (filterTags) xs tgs
isTagComment (TagComment _) = True
isTagComment _ = False
filterTags tn [] = []
filterTags tn (x:xs) | isTagOpenName tn x = filterTags tn $ dropWhile (not . isTagCloseName tn) xs
| isTagCloseName tn x = filterTags tn xs
| otherwise = x:filterTags tn xs
procHrefTags _ [] = []
procHrefTags alnk (x:xs) | isTagOpenName "a" x = let relP = fromAttrib "href" x
title = articleRelURL2Title relP
imgOk = (TagOpen "a" [("href",title)]):(procHrefTags alnk xs)
isInBook = elem (title) alnk || ((length title) > 0 && (head title == '#'))
imgNok = let pre = takeWhile (not . isTagCloseName "a") xs
post = tail $ dropWhile (not . isTagCloseName "a") xs
in pre ++ (procHrefTags alnk post)
in if isInBook then imgOk else imgNok
| otherwise = x:(procHrefTags alnk xs)
procImgTags [] = []
procImgTags (x:xs) | isTagOpenName "img" x = let absP = fromAttrib "src" x
relP = "img" </> (sanitizeFileName $ takeFileName $ url_path $ fromJust $ importURL absP)
imgOk = TagOpen "img" [("src",relP)]
imgNok = tail $ dropWhile (not . isTagCloseName "img") xs
in if isArticleImgURL absP then imgOk:(procImgTags xs) else procImgTags imgNok
| otherwise = x:procImgTags xs
fetchArticle :: URL -> IO WikiArticle
fetchArticle xs = do
rsp <- Network.HTTP.simpleHTTP (getRequest (exportURL xs))
contents <- (getResponseBody rsp)
return (WikiArticleHTML (articleURL2Title xs) contents)