module NLP.HistPL.Lexicon
(
HistPL (..)
, Code (..)
, Key
, UID
, tryOpen
, open
, lookup
, lookupMany
, nthSuffix
, withPrefix
, dictKeys
, tryLoadK
, loadK
, dictIDs
, tryLoadI
, loadI
, save
, load
, module NLP.HistPL.Types
) where
import Prelude hiding (lookup)
import Control.Applicative ((<$>))
import Control.Arrow (first, second)
import Control.Monad (unless, guard, (<=<))
import Control.Monad.Trans.Maybe (MaybeT (..))
import qualified Control.Monad.Trans.State.Strict as S
import Pipes
import qualified Pipes.Prelude as P
import System.FilePath ((</>))
import System.Directory
( createDirectoryIfMissing, createDirectory, doesDirectoryExist )
import Data.List (foldl')
import Data.Binary (Binary, put, get, encodeFile, decodeFile)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.DAWG.Dynamic as DM
import qualified NLP.HistPL.Binary as B
import NLP.HistPL.Binary.Util
import qualified NLP.HistPL.DAWG as D
import NLP.HistPL.Types
import qualified NLP.HistPL.Util as Util
entryDir :: String
entryDir = "entries"
keyDir :: String
keyDir = "keys"
formFile :: String
formFile = "forms.bin"
type Key = D.Key UID
type UID = Int
proxy :: LexEntry -> T.Text
proxy entry = case Util.allForms entry of
(x:_) -> x
[] -> error "proxy: entry with no forms"
showKey :: Key -> String
showKey D.Key{..} = (T.unpack . T.concat) [T.pack (show uid), "-", path]
parseKey :: String -> Key
parseKey x =
let (uid'S, (_:form'S)) = break (=='-') x
in D.Key (T.pack form'S) (read uid'S)
saveKey :: FilePath -> Key -> T.Text -> IO ()
saveKey path key i = T.writeFile (path </> keyDir </> showKey key) i
loadKey :: FilePath -> Key -> IO T.Text
loadKey path key = T.readFile (path </> keyDir </> showKey key)
saveEntry :: FilePath -> Key -> LexEntry -> IO ()
saveEntry path key x = do
saveKey path key (lexID x)
B.save (path </> entryDir) x
loadEntry :: FilePath -> Key -> IO LexEntry
loadEntry path = B.load (path </> entryDir) <=< loadKey path
tryLoadEntry :: FilePath -> Key -> IO (Maybe LexEntry)
tryLoadEntry path = maybeErr . loadEntry path
data HistPL = HistPL {
dictPath :: FilePath
, formMap :: D.DAWG UID () Code
}
data Code
= Orig
| Both
| Copy
deriving (Show, Eq, Ord)
instance Binary Code where
put Orig = put '1'
put Copy = put '2'
put Both = put '3'
get = get >>= \x -> return $ case x of
'1' -> Orig
'2' -> Copy
'3' -> Both
c -> error $ "get: invalid Code value '" ++ [c] ++ "'"
tryOpen :: FilePath -> IO (Maybe HistPL)
tryOpen path = runMaybeT $ do
formMap <- maybeErrT $ decodeFile (path </> formFile)
doesExist <- lift $ doesDirectoryExist (path </> entryDir)
guard doesExist
return $ HistPL path formMap
open :: FilePath -> IO HistPL
open path = tryOpen path >>=
maybe (fail "Failed to open the dictionary") return
dictKeys :: HistPL -> Producer Key IO ()
dictKeys hpl = do
let getPaths = getUsefulContents $ dictPath hpl </> keyDir
xs <- map parseKey <$> lift getPaths
each xs
loadK :: HistPL -> Key -> IO LexEntry
loadK hpl = loadEntry (dictPath hpl)
tryLoadK :: HistPL -> Key -> IO (Maybe LexEntry)
tryLoadK hpl = tryLoadEntry (dictPath hpl)
dictIDs :: HistPL -> Producer T.Text IO ()
dictIDs hpl = do
let getPaths = getUsefulContents $ dictPath hpl </> entryDir
xs <- map T.pack <$> lift getPaths
each xs
loadI :: HistPL -> T.Text -> IO LexEntry
loadI hpl i = B.load (dictPath hpl </> entryDir) i
tryLoadI :: HistPL -> T.Text -> IO (Maybe LexEntry)
tryLoadI hpl i = B.tryLoad (dictPath hpl </> entryDir) i
lookup :: HistPL -> T.Text -> IO [(LexEntry, Code)]
lookup hpl x = do
let lexSet = D.lookup x (formMap hpl)
sequence
[ ( , code) <$> loadK hpl key
| (key, code) <- getCode =<< M.assocs lexSet ]
where
getCode (key, val) =
[ (key { D.path = base }, code)
| (base, code) <- M.toList (D.forms val) ]
lookupMany :: HistPL -> [T.Text] -> IO [(LexEntry, Code)]
lookupMany hpl xs = do
let keyMap = M.fromListWith min $
getCode =<< M.assocs =<<
(flip D.lookup (formMap hpl) <$> xs)
sequence
[ ( , code) <$> loadK hpl key
| (key, code) <- M.toList keyMap ]
where
getCode (key, val) =
[ (key { D.path = base }, code)
| (base, code) <- M.toList (D.forms val) ]
nthSuffix :: HistPL -> T.Text -> Int -> Maybe T.Text
nthSuffix HistPL{..} x i = D.byIndex i (D.submap x formMap)
withPrefix :: HistPL -> T.Text -> Int
withPrefix HistPL{..} x = D.size (D.submap x formMap)
save :: FilePath -> Consumer (Maybe (LexEntry, S.Set T.Text)) IO ()
save binPath = do
lift $ do
createDirectoryIfMissing True binPath
emptyDirectory binPath >>= \empty -> unless empty $ do
error $ "save: directory " ++ binPath ++ " is not empty"
createDirectory $ binPath </> entryDir
createDirectory $ binPath </> keyDir
formMap <- S.evalStateT loop s0
lift $ encodeFile (binPath </> formFile) (D.weigh formMap)
where
loop = lift await >>= \x -> case x of
Nothing -> D.freeze . fst <$> S.get
Just (entry, forms) -> do
key <- getKey entry
saveBin key entry forms
loop
s0 = (D.empty, DM.empty)
getKey entry = do
km <- snd <$> S.get
let main = proxy entry
path = T.unpack main
num = maybe 0 id (DM.lookup path km) + (1 :: Int)
key = D.Key main num
S.modify $ second $ DM.insert path num
return key
saveBin key entry otherForms = do
lift $ lift $ saveEntry binPath key entry
let D.Key{..} = key
histForms = S.fromList (Util.allForms entry)
onlyHist = S.difference histForms otherForms
onlyOther = S.difference otherForms histForms
both = S.intersection histForms otherForms
list c s = [(y, uid, (), path, c) | y <- S.toList s]
xs = list Orig onlyHist
++ list Copy onlyOther
++ list Both both
S.modify $ first $ flip (foldl' (flip D.insert)) xs
load :: HistPL -> Producer (Key, LexEntry) IO ()
load hpl = dictKeys hpl >-> P.mapM (\x -> (x, ) <$> loadK hpl x)