-- | Commands for fetching, sanitizing, crawling and loading wiki articles
-- that are stored as files in the fetch and sanitized directories of a
-- 'Wiki4eConfig'.
module Wiki4e.Commands.Articles
  ( wiki4e_sanitizeArticle
  , wiki4e_sanitizeArticles
  , wiki4e_fetchArticles
  , wiki4e_getArticleFiles
  , wiki4e_getArticleSanFiles
  , wiki4e_crawlArticlesLinks
  , wiki4e_listArticleImages
  , wiki4e_listArticlesImages
  , wiki4e_readArticle
  , loadArticleFile
  , articleURL2File
  , articleURL2SanFile
  ) where

import System.FilePath
import Network.URL
import Data.List (nub)
import System.IO
import Codec.EBook
import qualified Data.ByteString as STR
import Network.Wikipedia
import Wiki4e.Commands.Common
import Wiki4e.Commands.Fetching

-- | Sanitize a single article file.
--
-- Reads @inf@ as UTF-8, runs 'sanitizeArticle' on its content with the
-- given @alnk@ list, and writes the resulting content to @ouf@ as UTF-8.
wiki4e_sanitizeArticle :: [String] -> FilePath -> FilePath -> IO ()
wiki4e_sanitizeArticle alnk inf ouf =
  withFile inf ReadMode $ \hi -> do
    hSetEncoding hi utf8
    withFile ouf WriteMode $ \ho -> do
      hSetEncoding ho utf8
      -- The lazy read is fully consumed by hPutStr below, while both
      -- handles are still open.
      c <- hGetContents hi
      let a = sanitizeArticle alnk $ WikiArticleHTML "" c
      hPutStr ho $ waContent a

-- | Fetch every article in the list into its fetch-directory file.
--
-- Each URL is handed to 'wiki4e_fetch' together with its target file name,
-- the fetch action, and a @(total, index)@ pair (index starting at 1).
-- NOTE(review): the pair is presumably used for progress reporting;
-- confirm against 'wiki4e_fetch'.
wiki4e_fetchArticles :: Wiki4eConfig -> [URL] -> IO ()
wiki4e_fetchArticles config xs =
  mapM_ (\(st, x) -> wiki4e_fetch (nm x) fm st x) $ zip fsls xs
  where
    total = length xs
    fsls = [ (total, i) | i <- [1 .. total] ]
    nm x = articleURL2File config x
    fm x = do
      (WikiArticleHTML _ c) <- fetchArticle x
      return c

-- | Sanitize all fetched articles.
--
-- Every file returned by 'wiki4e_getArticleFiles' is sanitized into the
-- sanitized directory under the same file name. The titles of all given
-- articles are passed as the first argument of 'wiki4e_sanitizeArticle'.
wiki4e_sanitizeArticles :: Wiki4eConfig -> [URL] -> IO ()
wiki4e_sanitizeArticles config arts =
  mapM_ (\x -> wiki4e_sanitizeArticle alnk x (outf x)) xs
  where
    xs = wiki4e_getArticleFiles config arts
    alnk = map articleURL2Title arts
    outf x = w4confDirSanitized config </> takeFileName x

-- | List the distinct image URLs found in one article file.
--
-- Also prints the number of images found and the file name to stdout.
wiki4e_listArticleImages :: FilePath -> IO [URL]
wiki4e_listArticleImages x = do
  c <- readFileUTF8 x
  let imgs = nub (getArticleImages (WikiArticleHTML "" c))
  putStrLn (show (length imgs) ++ " images at " ++ x)
  return $! imgs

-- | List the distinct image URLs found across all given articles.
--
-- Method expects already sanitized articles.
-- NOTE(review): the files read here come from 'wiki4e_getArticleFiles'
-- (fetch directory), not 'wiki4e_getArticleSanFiles'; confirm which
-- directory is intended.
wiki4e_listArticlesImages :: Wiki4eConfig -> [URL] -> IO [URL]
wiki4e_listArticlesImages config urls = do
  images <- mapM wiki4e_listArticleImages (wiki4e_getArticleFiles config urls)
  return $! nub (concat images)

-- | Fetch the given articles and follow their links up to a given depth.
--
-- Returns the de-duplicated list of the given URLs, the links found in
-- them, and whatever the recursive crawl of those links returns. An empty
-- URL list or a depth of zero or less yields an empty list without
-- fetching anything.
wiki4e_crawlArticlesLinks :: Wiki4eConfig -> [URL] -> Int -> IO [URL]
wiki4e_crawlArticlesLinks _ [] _ = return []
wiki4e_crawlArticlesLinks config us depth
  -- A guard instead of an n+k pattern: valid Haskell 2010 and total for
  -- negative depths.
  | depth <= 0 = return []
  | otherwise = do
      wiki4e_fetchArticles config us
      -- Read back exactly the files that wiki4e_fetchArticles targets.
      arts <- mapM wiki4e_readArticle (wiki4e_getArticleFiles config us)
      let xs = nub $ concatMap (getArticleLinksAbs (w4confDomain config)) arts
      ys <- wiki4e_crawlArticlesLinks config xs (depth - 1)
      return (nub $ us ++ xs ++ ys)

-- | Fetch-directory file paths for the given article URLs.
wiki4e_getArticleFiles :: Wiki4eConfig -> [URL] -> [FilePath]
wiki4e_getArticleFiles config xs = map (articleURL2File config) xs

-- | Sanitized-directory file paths for the given article URLs.
wiki4e_getArticleSanFiles :: Wiki4eConfig -> [URL] -> [FilePath]
wiki4e_getArticleSanFiles config xs = map (articleURL2SanFile config) xs

-- | Path of an article inside the fetch directory: the directory joined
-- with the article title.
articleURL2File :: Wiki4eConfig -> URL -> FilePath
articleURL2File config x = w4confDirFetch config </> articleURL2Title x

-- | Path of an article inside the sanitized directory: the directory
-- joined with the article title.
articleURL2SanFile :: Wiki4eConfig -> URL -> FilePath
articleURL2SanFile config x = w4confDirSanitized config </> articleURL2Title x

-- | Load an article file from disk as a 'BookItem'.
--
-- The item id is the shown index @i@, the item path is @bookDir@ joined
-- with the file name of @fname@, and the chapter metadata carries that
-- same file name.
loadArticleFile :: Int -> FilePath -> FilePath -> IO BookItem
loadArticleFile i bookDir fname = do
  cs <- STR.readFile fname
  return (BookItem aid bfile (toLazy cs) opsMediatype (Just (ChapterMetadata name)))
  where
    aid = show i
    name = takeFileName $ normalise fname
    bfile = bookDir </> name

-- | Read an article file as UTF-8 and wrap its content in a
-- 'WikiArticleHTML' with an empty first field.
wiki4e_readArticle :: FilePath -> IO WikiArticle
wiki4e_readArticle inf = fmap (WikiArticleHTML "") (readFileUTF8 inf)