module Network.Wikipedia ( isArticleURL
, WikiArticle(..)
, articleURL2Title
, getArticleLinks
, getArticleImages
, sanitizeArticle
, sanitizeFileName
, fetchArticle)
where
import Network.URL
import Network.URI
import Text.Regex.Posix
import Text.Regex.Base
import Network.HTTP
import Text.HTML.TagSoup
import System.FilePath
import Data.List (nub)
import Data.Maybe (catMaybes, mapMaybe, fromJust)
data WikiArticle = WikiArticleHTML { waTitle :: String, waContent :: String }
| WikiArticleSRC { waTitle :: String, waContent :: String } deriving (Show, Ord, Eq)
sanitizeFileName :: FilePath -> FilePath
sanitizeFileName cs = map (unPercent) $ urlEncode cs
where
unPercent c = if (c == '%') then 'X' else c
isArticleURL :: URL -> Bool
isArticleURL (URL (Absolute (Host (HTTP False) xs Nothing)) ph []) = (xs =~ ".*en[.]wikipedia.org$") && (ph =~ "wiki/[^:/]+$" )
isArticleURL _ = False
articleURL2Title :: URL -> String
articleURL2Title x | isArticleURL x = sanitizeFileName (takeFileName (url_path x))
| otherwise = ""
isArticleImgURL :: String -> Bool
isArticleImgURL cs = cs =~ "^http://upload.wikimedia.org/.*"
getArticleImages :: WikiArticle -> [URL]
getArticleImages x = let inTags = parseTags (waContent x)
imgTags = filter (isTagOpenName "img") inTags
imgSrcs = filter (isArticleImgURL) $ map (fromAttrib "src") imgTags
in mapMaybe importURL imgSrcs
getArticleLinks :: WikiArticle -> [URL]
getArticleLinks xs = let inTags = parseTags (waContent xs)
aTags = filter (isTagOpenName "a") inTags
hrefs = map (fromAttrib "href") aTags
in mapMaybe importURL $ nub $ filter (=~ "^/wiki/[^:/]+$") hrefs
articleURL2PrintURL :: URL -> URL
articleURL2PrintURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("printable","yes")]
| otherwise = xs
articleURL2RawURL :: URL -> URL
articleURL2RawURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("action","raw")]
| otherwise = xs
sanitizeArticle :: WikiArticle -> WikiArticle
sanitizeArticle xs = let inTags = parseTags (waContent xs)
outTags = procImgTags $ processTags $ filterAllTags tags4Filter inTags
tags4Filter = ["link", "script", "sup" ]
in WikiArticleHTML (waTitle xs) (renderTags outTags)
processTags xs = map processAttrs xs
where
processAttrs t@(TagOpen "a" _) = TagText ""
processAttrs t@(TagClose "a") = TagText ""
processAttrs t@(TagOpen "div" _) = TagText ""
processAttrs t@(TagClose "div") = TagText ""
processAttrs t@(TagOpen s xs) | null s = t
| head s == '!' = t
| otherwise = TagOpen s (removeStyleAttr $ removeEmptyAttr xs)
processAttrs t = t
removeEmptyAttr xs = filter (not . null . snd) xs
removeStyleAttr xs = filter (\x -> (fst x) `notElem` ["style", "id", "class"]) xs
filterAllTags tgs xs = filter (not . isTagComment) $ foldr (filterTags) xs tgs
isTagComment (TagComment _) = True
isTagComment _ = False
filterTags tn [] = []
filterTags tn (x:xs) | isTagOpenName tn x = filterTags tn $ dropWhile (not . isTagCloseName tn) xs
| isTagCloseName tn x = filterTags tn xs
| otherwise = x:filterTags tn xs
procImgTags [] = []
procImgTags (x:xs) | isTagOpenName "img" x = let absP = fromAttrib "src" x
relP = "img" </> (sanitizeFileName $ takeFileName $ url_path $ fromJust $ importURL absP)
imgOk = TagOpen "img" [("src",relP)]
imgNok = tail $ dropWhile (not . isTagCloseName "img") xs
in if isArticleImgURL absP then imgOk:(procImgTags xs) else procImgTags imgNok
| otherwise = x:procImgTags xs
fetchArticle :: URL -> IO WikiArticle
fetchArticle xs = do
rsp <- Network.HTTP.simpleHTTP (getRequest (exportURL xs))
contents <- (getResponseBody rsp)
return (WikiArticleHTML (articleURL2Title xs) contents)