module Text.NKJP.Tar
( readCorpus
) where
import Control.Applicative ((<$>), (<*>))
import System.FilePath (takeBaseName, takeDirectory)
import Data.List (groupBy, find)
import Data.Function (on)
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Encoding as L
import qualified Codec.Compression.GZip as GZip
import qualified Codec.Archive.Tar as Tar
import qualified Data.ByteString.Lazy as BS
-- | Read a gzip-compressed tar archive and return its entries as a list.
--
-- The file is read through "Data.ByteString.Lazy", decompressed with
-- 'GZip.decompress' and parsed with 'Tar.read'.  A failure reported while
-- folding over the parsed entries is turned into a call to 'error' carrying
-- the 'show'n failure value.
readTar :: FilePath -> IO [Tar.Entry]
readTar tar = do
  compressed <- BS.readFile tar
  let parsed = Tar.read (GZip.decompress compressed)
  return (Tar.foldEntries (:) [] (error . show) parsed)
-- | Find the first entry whose path has the given base name, i.e. the file
-- name with directory and extension removed (as computed by 'takeBaseName').
-- Returns 'Nothing' when no entry matches.
withBase :: String -> [Tar.Entry] -> Maybe Tar.Entry
withBase baseName = find hasBaseName
  where
    hasBaseName entry = takeBaseName (Tar.entryPath entry) == baseName
-- | Decode the content of a regular-file entry as UTF-8 and apply the given
-- function to the resulting lazy text.
--
-- Only entries whose content is 'Tar.NormalFile' carry data.  For any other
-- kind of entry the decoded text is an 'error' value that names this function
-- and the offending entry path (instead of an anonymous pattern-match
-- failure).  The failure stays lazy: it is raised only if the supplied
-- function actually demands its argument.
procContent :: (L.Text -> a) -> Tar.Entry -> a
procContent f entry = f (L.decodeUtf8 binary)
  where
    -- Exhaustive match so that non-regular entries produce a useful message.
    binary = case Tar.entryContent entry of
      Tar.NormalFile bytes _ -> bytes
      _ -> error
        ( "Text.NKJP.Tar.procContent: not a regular file: "
            ++ Tar.entryPath entry
        )
-- | Read a gzip-compressed tar archive and process one file per directory.
--
-- Entries are split into runs of adjacent entries that share the same
-- directory (as computed by 'takeDirectory'); 'groupBy' only merges
-- neighbouring entries, so a directory whose entries are not contiguous in
-- the archive yields several runs.  For every run the result holds the
-- directory path paired with the given function applied to the decoded
-- content of the first entry whose base name equals @base@, or 'Nothing'
-- when the run has no such entry.
readCorpus :: String -> (L.Text -> a) -> FilePath -> IO [(FilePath, Maybe a)]
readCorpus base f tarPath = do
  entries <- readTar tarPath
  return (map summarize (groupBy sameDir entries))
  where
    entryDir = takeDirectory . Tar.entryPath
    sameDir = (==) `on` entryDir
    -- Runs produced by 'groupBy' are never empty, so 'head' is safe here.
    summarize run =
      ( entryDir (head run)
      , procContent f <$> withBase base run
      )