module Text.NKJP.Tar
( readCorpus
) where

import Control.Applicative ((<$>), (<*>))
import System.FilePath (takeBaseName, takeDirectory)
import Data.List (groupBy, find)
import Data.Function (on)
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Encoding as L
import qualified Codec.Compression.GZip as GZip
import qualified Codec.Archive.Tar as Tar
import qualified Data.ByteString.Lazy as BS

-- | Read a gzip-compressed tar archive from the given path and return its
-- entries as a list, in archive order.
--
-- The file is read as a lazy 'BS.ByteString', so decompression and parsing
-- happen on demand.  A malformed archive is reported by calling 'error' with
-- the shown parse failure; because the list is built lazily, that error only
-- surfaces when the offending part of the list is forced.
readTar :: FilePath -> IO [Tar.Entry]
readTar tar = do
    compressed <- BS.readFile tar
    let entries = Tar.read (GZip.decompress compressed)
    return (Tar.foldEntries (:) [] (error . show) entries)

-- | Find the first entry whose path has the given base name (the file name
-- with directory and extension stripped, as computed by 'takeBaseName').
-- Returns 'Nothing' when no entry matches.
withBase :: String -> [Tar.Entry] -> Maybe Tar.Entry
withBase baseName entries = find hasBase entries
  where
    hasBase entry = takeBaseName (Tar.entryPath entry) == baseName

-- | Decode the content of a regular-file entry as UTF-8 and apply the given
-- function to the resulting lazy text.
--
-- The entry must hold a regular file.  For any other kind of entry
-- (directory, link, ...) the content cannot be extracted and an 'error'
-- naming the offending path is raised.  The content is extracted lazily, so
-- that error (like a UTF-8 decoding failure raised by 'L.decodeUtf8') only
-- fires if the function actually inspects its argument.
procContent :: (L.Text -> a) -> Tar.Entry -> a
procContent f entry = f (L.decodeUtf8 binary)
  where
    -- Kept as a lazily evaluated binding on purpose: the entry kind is only
    -- checked when the decoded text is demanded.
    binary = case Tar.entryContent entry of
        Tar.NormalFile bytes _ -> bytes
        _ -> error $
            "Text.NKJP.Tar.procContent: not a regular file: "
            ++ Tar.entryPath entry

-- | Walk through the directories stored in a gzip-compressed tar archive.
-- For each directory return its name paired with the processed content of
-- the entry with the given base name, or 'Nothing' if the directory contains
-- no such entry.
--
-- Entries are grouped by directory with 'groupBy', which only merges
-- /adjacent/ entries: a directory whose entries are not contiguous in the
-- archive is reported once per contiguous run.
readCorpus :: String -> (L.Text -> a) -> FilePath -> IO [(FilePath, Maybe a)]
readCorpus base f tarPath = do
    entries <- readTar tarPath
    return (map summarize (groupBy sameDir entries))
  where
    -- Directory part of an entry's path within the archive.
    dirOf = takeDirectory . Tar.entryPath
    sameDir = (==) `on` dirOf
    -- 'groupBy' never produces an empty group, so 'head' is safe here.
    summarize grp =
        ( dirOf (head grp)
        , procContent f <$> withBase base grp
        )