module Analyze ( htmlIds , shouldAnalyzeFile , analyzeFile , analyze ) where import qualified Text.HTML.Tagchup.Parser as T ( runSoup ) import qualified Text.HTML.Tagchup.Tag.Match as M import qualified Text.HTML.Tagchup.Tag as T import qualified Text.XML.Basic.Name.LowerCase as N import qualified Text.XML.Basic.Attribute as A import Data.Char ( toLower ) import Data.Maybe ( mapMaybe, listToMaybe ) import Control.Arrow ( second, first ) import Control.Monad ( forM ) import System.Directory ( getDirectoryContents ) import System.FilePath ( (), takeExtension ) import qualified Data.Text as Txt import State.Types ( ChapterId, mkChapterId, CommentId, mkCommentId ) shouldAnalyzeFile :: FilePath -> Bool shouldAnalyzeFile fn = ext `elem` goodExts where ext = map toLower $ takeExtension fn goodExts = [".html", ".htm", ".xhtml"] analyze :: FilePath -> IO [(FilePath, [(Maybe ChapterId, [CommentId])])] analyze fn = do let ignoreEntry = (`elem` [".", ".."]) processEntry n | shouldAnalyzeFile n = processFile n `catch` \_ -> processDir n | otherwise = processDir n processFile n = (\xs -> [(n, xs)]) `fmap` analyzeFile (fn n) processDir n = map (first (n )) `fmap` analyze (fn n) fs <- filter (not . ignoreEntry) `fmap` getDirectoryContents fn results <- forM fs $ \n -> processEntry n `catch` \_ -> return [] return $ concat results analyzeFile :: FilePath -> IO [(Maybe ChapterId, [CommentId])] analyzeFile = fmap htmlIds . readFile -- Extract the paragraph ids from the chapter divs htmlIds :: String -> [(Maybe ChapterId, [CommentId])] htmlIds = map (first (mkChapterId . Txt.pack =<<)) . map (second (mapMaybe (mkCommentId . Txt.pack) . findIds)) . findChapterContent . T.runSoup findIds :: [T.T N.T String] -> [String] findIds = concatMap getId . concatMap snd . mapMaybe T.maybeOpen getId :: A.T N.T String -> [String] getId (A.Cons (A.Name (N.Cons "id")) i) = [i] getId _ = [] findChapterContent :: [T.T N.T String] -> [(Maybe String, [T.T N.T String])] findChapterContent ts = case dropWhile (not . divChapter) ts of (t:ts') -> let chId = listToMaybe (chapId t) in case findClose "div" ts' of Nothing -> -- No close tag, so no content, but check -- for well-formed tags further on (chId, []):findChapterContent ts' Just (f, ts'') -> -- Well-formed tag, so take its content (chId, f []):findChapterContent ts'' _ -> [] where chapId (T.Open _ attrs) = concatMap getId attrs chapId _ = [] divChapter :: T.T N.T String -> Bool divChapter (T.Open (T.Name (N.Cons "div")) attrs) = any hasChapterClass attrs divChapter _ = False hasChapterClass :: A.T N.T String -> Bool hasChapterClass (A.Cons (A.Name (N.Cons "class")) s) = "chapter" `elem` words s hasChapterClass _ = False findClose :: String -> [T.T N.T String] -> Maybe ([T.T N.T String] -> [T.T N.T String], [T.T N.T String]) findClose _ [] = Nothing findClose n (t:ts) | M.openNameLit n t = do -- Find the close of this inner tag (inside, ts') <- findClose n ts -- Resume finiding the close of the outer tag (inside', ts'') <- findClose n ts' -- Put the content of the inner tag let inner = (t:) . inside . (T.Close (T.Name (N.Cons n)):) return $ (inner . inside', ts'') | M.closeLit n t = Just (id, ts) | otherwise = do (inside, ts') <- findClose n ts return ((t:) . inside, ts')