module Analyze
    ( htmlIds
    , shouldAnalyzeFile
    , analyzeFile
    , analyze
    )
where
import qualified Text.HTML.Tagchup.Parser as T ( runSoup )
import qualified Text.HTML.Tagchup.Tag.Match as M
import qualified Text.HTML.Tagchup.Tag as T
import qualified Text.XML.Basic.Name.LowerCase as N
import qualified Text.XML.Basic.Attribute as A
import Data.Char ( toLower )
import Data.Maybe ( mapMaybe, listToMaybe )
import Control.Arrow ( second, first )
import Control.Monad ( forM )
import System.Directory ( getDirectoryContents )
import System.FilePath ( (</>), takeExtension )
import qualified Data.Text as Txt
import State.Types ( ChapterId, mkChapterId, CommentId, mkCommentId )

shouldAnalyzeFile :: FilePath -> Bool
shouldAnalyzeFile fn = ext `elem` goodExts
    where
      ext = map toLower $ takeExtension fn
      goodExts = [".html", ".htm", ".xhtml"]

analyze :: FilePath -> IO [(FilePath, [(Maybe ChapterId, [CommentId])])]
analyze fn = do
  let ignoreEntry = (`elem` [".", ".."])

      processEntry n
          | shouldAnalyzeFile n = processFile n `catch` \_ ->
                                  processDir n
          | otherwise           = processDir n

      processFile n = (\xs -> [(n, xs)]) `fmap` analyzeFile (fn </> n)
      processDir n = map (first (n </>)) `fmap` analyze (fn </> n)

  fs <- filter (not . ignoreEntry) `fmap` getDirectoryContents fn

  results <- forM fs $ \n ->
             processEntry n `catch` \_ -> return []

  return $ concat results

analyzeFile :: FilePath -> IO [(Maybe ChapterId, [CommentId])]
analyzeFile = fmap htmlIds . readFile

-- Extract the paragraph ids from the chapter divs
htmlIds :: String -> [(Maybe ChapterId, [CommentId])]
htmlIds = map (first (mkChapterId . Txt.pack =<<)) .
          map (second (mapMaybe (mkCommentId . Txt.pack) . findIds)) .
          findChapterContent .
          T.runSoup

findIds :: [T.T N.T String] -> [String]
findIds = concatMap getId . concatMap snd . mapMaybe T.maybeOpen

getId :: A.T N.T String -> [String]
getId (A.Cons (A.Name (N.Cons "id")) i) = [i]
getId _                        = []

findChapterContent :: [T.T N.T String] -> [(Maybe String, [T.T N.T String])]
findChapterContent ts =
    case dropWhile (not . divChapter) ts of
      (t:ts') -> let chId = listToMaybe (chapId t)
                 in case findClose "div" ts' of
                      Nothing ->
                          -- No close tag, so no content, but check
                          -- for well-formed tags further on
                          (chId, []):findChapterContent ts'
                      Just (f, ts'') ->
                          -- Well-formed tag, so take its content
                          (chId, f []):findChapterContent ts''
      _ -> []
    where
      chapId (T.Open _ attrs) = concatMap getId attrs
      chapId _ = []

divChapter :: T.T N.T String -> Bool
divChapter (T.Open (T.Name (N.Cons "div")) attrs) = any hasChapterClass attrs
divChapter _ = False

hasChapterClass :: A.T N.T String -> Bool
hasChapterClass (A.Cons (A.Name (N.Cons "class")) s) = "chapter" `elem` words s
hasChapterClass _ = False

findClose :: String -> [T.T N.T String]
          -> Maybe ([T.T N.T String] -> [T.T N.T String], [T.T N.T String])
findClose _ [] = Nothing
findClose n (t:ts)
    | M.openNameLit n t =
        do -- Find the close of this inner tag
           (inside, ts') <- findClose n ts
           -- Resume finiding the close of the outer tag
           (inside', ts'') <- findClose n ts'
           -- Put the content of the inner tag
           let inner = (t:) . inside . (T.Close (T.Name (N.Cons n)):)
           return $ (inner . inside', ts'')
    | M.closeLit n t    = Just (id, ts)
    | otherwise         =
        do (inside, ts') <- findClose n ts
           return ((t:) . inside, ts')