module Network.Wikipedia ( isArticleURL
, WikiArticle(..)
, articleURL2Title
, getArticleLinks
, sanitizeArticle
, fetchArticle)
where
import Network.URL
import Network.URI
import Text.Regex.Posix
import Text.Regex.Base
import Network.HTTP
import Text.HTML.TagSoup
import Data.List (nub)
import Data.Maybe (catMaybes, mapMaybe, fromJust)
data WikiArticle = WikiArticleHTML { waTitle :: String, waContent :: String }
| WikiArticleSRC { waTitle :: String, waContent :: String } deriving (Show, Ord, Eq)
isArticleURL :: URL -> Bool
isArticleURL (URL (Absolute (Host (HTTP False) xs Nothing)) ph []) = (xs =~ ".*en[.]wikipedia.org$") && (ph =~ "wiki/[^:/]+$" )
isArticleURL _ = False
articleURL2Title :: URL -> String
articleURL2Title x | isArticleURL x = filter (/= '%') $ urlEncode $ tail $ dropWhile (/='/') (url_path x)
| otherwise = ""
getArticleLinks :: WikiArticle -> [URL]
getArticleLinks xs = let inTags = parseTags (waContent xs)
aTags = filter (isTagOpenName "a") inTags
hrefs = map (fromAttrib "href") aTags
in mapMaybe importURL $ nub $ filter (=~ "^/wiki/[^:/]+$") hrefs
articleURL2PrintURL :: URL -> URL
articleURL2PrintURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("printable","yes")]
| otherwise = xs
articleURL2RawURL :: URL -> URL
articleURL2RawURL xs | isArticleURL xs = URL (url_type xs) "w/index.php" [("title",articleURL2Title xs),("action","raw")]
| otherwise = xs
sanitizeArticle :: WikiArticle -> WikiArticle
sanitizeArticle xs = let inTags = parseTags (waContent xs)
outTags = processTags $ filterTags "img" $ filterTags "div" $ filterTags "link" $ filterTags "script" inTags
in WikiArticleHTML (waTitle xs) (renderTags outTags)
processTags xs = map removeEmptyAttr xs
where
removeEmptyAttr t@(TagOpen s xs) | null s = t
| head s == '!' = t
| otherwise = TagOpen s (filter (not . null . snd) xs)
removeEmptyAttr t = t
filterTags tn [] = []
filterTags tn (x:xs) | isTagOpenName tn x = filterTags tn $ dropWhile (not . isTagCloseName tn) xs
| isTagCloseName tn x = filterTags tn xs
| otherwise = x:filterTags tn xs
fetchArticle :: URL -> IO WikiArticle
fetchArticle xs = do
rsp <- Network.HTTP.simpleHTTP (getRequest (exportURL xs))
contents <- (getResponseBody rsp)
return (WikiArticleHTML (articleURL2Title xs) contents)