{-# LANGUAGE DataKinds #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StandaloneDeriving #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ExplicitNamespaces #-}

-- | Reading and writing of datasets and sampler state, plus small helpers
-- for building a vocabulary and applying a segmentation to a dataset.
--
-- Every function that takes a @Maybe Filename@ treats 'Nothing' as
-- "use standard input" (readers) or "use standard output" (writers).
module Text.HaskSeg.Utils
  ( readDataset
  , writeDataset
  , writeState
  , readState
  , datasetToVocabulary
  , applySegmentation
  ) where

import Prelude hiding (lookup, getContents, readFile, strip, lines, writeFile, words)
import System.IO (withFile, IOMode(..), stdin, stderr, openFile, stdout, hClose, Handle(..))
import Data.Text (Text, strip, lines, stripPrefix, splitOn, pack, unpack, words)
import Data.Text.IO (getContents, readFile, hGetContents, hPutStr, writeFile, hPutStrLn)
--import qualified Data.ByteString.Lazy as BS
import qualified Data.ByteString.Lazy.Char8 as BS
import Data.Text (Text)
import qualified Data.Text.Lazy as T
import qualified Data.Text.Lazy.IO as T
import qualified Data.Text.Lazy.Encoding as T
import Control.Monad (join, liftM, foldM)
import Data.Set (Set)
import qualified Data.Set as Set
import Data.Map (Map)
import qualified Data.Map as Map
import Data.List (nub, isSuffixOf)
import Data.Maybe (fromMaybe)
import Text.Read (readMaybe)
import Codec.Compression.GZip (compress, decompress)

import Text.HaskSeg.Types (Locations, Morph, Counts, Site, Location(..), Lookup, showLookup, showCounts, SamplingState(..), Params(..), Model, Token, Sentence, Dataset)
import Text.HaskSeg.Probability (Probability)

-- Token, Sentence and Dataset come from "Text.HaskSeg.Types"; the local
-- definitions below were disabled in favour of those.
--type Token = String
--type Sentence = [Token]
--type Dataset = [Sentence]

-- | Path to a file on disk.
type Filename = String

-- | The set of distinct tokens occurring in a dataset.
type Vocabulary = Set Token

-- | Maps a token to the sequence of tokens it should be replaced with.
type Segmentation = Map Token [Token]

-- | True when the path ends in the two characters @gz@; such paths are
-- treated as gzip-compressed by 'readFileOrStdin' and 'writeFileOrStdout'.
isGzipName :: FilePath -> Bool
isGzipName = isSuffixOf "gz"

-- | Read text from the given file, or from standard input for 'Nothing'.
--
-- Files whose name ends in @gz@ are read as raw bytes, gunzipped, and
-- decoded as UTF-8.  The bytes deliberately never pass through 'Text' or
-- 'String' before decompression, because that would push binary data
-- through a text decoder and corrupt it.
readFileOrStdin :: Maybe String -> IO Text
readFileOrStdin (Just f)
  | isGzipName f = liftM (T.toStrict . T.decodeUtf8 . decompress) (BS.readFile f)
  | otherwise    = readFile f
readFileOrStdin Nothing = getContents

-- | Write text to the given file, or to standard output for 'Nothing'.
--
-- Files whose name ends in @gz@ are written as gzip-compressed UTF-8.  The
-- compressed bytes are written directly as a 'BS.ByteString' so that no text
-- encoding is applied to binary data.
writeFileOrStdout :: Maybe String -> Text -> IO ()
writeFileOrStdout (Just f) s
  | isGzipName f = BS.writeFile f ((compress . T.encodeUtf8 . T.fromStrict) s)
  | otherwise    = writeFile f s
writeFileOrStdout Nothing s = hPutStr stdout s

-- | Read a dataset: one sentence per line, tokens separated by whitespace.
--
-- The first argument is the file to read ('Nothing' reads standard input).
-- The second argument optionally limits the result to the first @n@ lines.
readDataset :: Maybe Filename -> Maybe Int -> IO Dataset
readDataset source n = do
  contents <- maybe getContents readFile source
  let kept = maybe id take n (lines contents)
  return [map unpack (words l) | l <- kept]

-- | Collect every distinct token of a dataset.
--
-- 'Set.fromList' already discards duplicates, so no separate
-- de-duplication pass over the token list is needed.
datasetToVocabulary :: Dataset -> Vocabulary
datasetToVocabulary ss = Set.fromList (concat ss)

-- | Write a dataset as UTF-8, one sentence per line with tokens separated
-- by single spaces.  'Nothing' writes to standard output.
writeDataset :: Maybe Filename -> Dataset -> IO ()
writeDataset target cs = maybe BS.putStr BS.writeFile target encoded
  where
    encoded = (T.encodeUtf8 . T.pack . unlines . map unwords) cs

-- | Replace each token of each sentence by its entry in the segmentation.
--
-- A token with no entry is split into one single-character token per
-- character.
applySegmentation :: Segmentation -> Dataset -> Dataset
applySegmentation seg ds = map (concatMap segment) ds
  where
    segment w = Map.findWithDefault [[c] | c <- w] w seg

--readVocabulary :: Filename -> IO Dataset
--readVocabulary f = undefined

--writeVocabulary :: Filename -> Dataset -> IO ()
--writeVocabulary f d = undefined

-- | Serialise the parameters and locations as the gzip-compressed UTF-8
-- encoding of @show (params, locations)@.  'Nothing' writes the compressed
-- bytes to standard output.
writeState :: (Show a, Show p) => Maybe Filename -> Params p -> Locations a -> IO ()
writeState target p l = maybe BS.putStr BS.writeFile target payload
  where
    payload = (compress . T.encodeUtf8 . T.pack . show) (p, l)

-- | Load a state previously stored by 'writeState'.  'Nothing' reads the
-- compressed bytes from standard input.
--
-- Raises an 'IOError' naming the source when the decompressed text cannot
-- be parsed, instead of failing later with an uninformative @read@ error.
readState :: (Read a, Read p) => Maybe Filename -> IO (Params p, Locations a)
readState source = do
  raw <- maybe BS.getContents BS.readFile source
  case readMaybe (T.unpack (T.decodeUtf8 (decompress raw))) of
    Just st -> return st
    Nothing -> ioError (userError ("readState: could not parse saved state from " ++ origin))
  where
    origin = fromMaybe "standard input" source