module NLP.HistPL.Lexicon
(
HistPL
, Code (..)
, Key
, UID
, tryOpen
, open
, lookup
, lookupMany
, dictKeys
, tryLoad
, load
, dictIDs
, tryLoad'
, load'
, build
, loadAll
, module NLP.HistPL.Types
) where
import Prelude hiding (lookup)
import Control.Applicative ((<$>))
import Control.Monad (unless, guard)
import Control.Monad.IO.Class (liftIO, MonadIO)
import Control.Monad.Trans.Maybe (MaybeT (..))
import qualified Control.Monad.LazyIO as LazyIO
import System.IO.Unsafe (unsafeInterleaveIO)
import System.FilePath ((</>))
import System.Directory
( createDirectoryIfMissing, createDirectory, doesDirectoryExist )
import Data.List (mapAccumL)
import Data.Binary (Binary, put, get, encodeFile, decodeFile)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.DAWG.Dynamic as DD
import qualified NLP.HistPL.Binary as B
import NLP.HistPL.Binary.Util
import qualified NLP.HistPL.DAWG as D
import NLP.HistPL.Types
import qualified NLP.HistPL.Util as Util
entryDir :: String
entryDir = "entries"
keyDir :: String
keyDir = "keys"
formFile :: String
formFile = "forms.bin"
type Key = D.Key UID
type UID = Int
proxy :: LexEntry -> T.Text
proxy entry = case Util.allForms entry of
(x:_) -> x
[] -> error "proxy: entry with no forms"
showKey :: Key -> String
showKey D.Key{..} = (T.unpack . T.concat) [T.pack (show uid), "-", path]
parseKey :: String -> Key
parseKey x =
let (uid'S, (_:form'S)) = break (=='-') x
in D.Key (T.pack form'S) (read uid'S)
getKey :: DD.DAWG Char Int -> LexEntry -> (DD.DAWG Char Int, Key)
getKey m x =
let main = proxy x
path = T.unpack main
num = maybe 0 id (DD.lookup path m) + 1
key = D.Key main num
in (DD.insert path num m, key)
getKeys :: [LexEntry] -> [Key]
getKeys = snd . mapAccumL getKey DD.empty
saveKey :: FilePath -> Key -> T.Text -> IO ()
saveKey path key i = T.writeFile (path </> keyDir </> showKey key) i
loadKey :: FilePath -> Key -> IO T.Text
loadKey path key = T.readFile (path </> keyDir </> showKey key)
saveEntry :: FilePath -> Key -> LexEntry -> IO ()
saveEntry path key x = do
saveKey path key (lexID x)
B.save (path </> entryDir) x
tryLoadEntry :: FilePath -> Key -> IO (Maybe LexEntry)
tryLoadEntry path key = maybeErr $ do
B.load (path </> entryDir) =<< loadKey path key
data HistPL = HistPL {
dictPath :: FilePath
, formMap :: D.DAWG UID () Code
}
data Code
= Orig
| Both
| Copy
deriving (Show, Eq, Ord)
instance Binary Code where
put Orig = put '1'
put Copy = put '2'
put Both = put '3'
get = get >>= \x -> return $ case x of
'1' -> Orig
'2' -> Copy
'3' -> Both
c -> error $ "get: invalid Code value '" ++ [c] ++ "'"
tryOpen :: FilePath -> IO (Maybe HistPL)
tryOpen path = runMaybeT $ do
formMap' <- maybeErrT $ decodeFile (path </> formFile)
doesExist <- liftIO $ doesDirectoryExist (path </> entryDir)
guard doesExist
return $ HistPL path formMap'
open :: FilePath -> IO HistPL
open path = tryOpen path >>=
maybe (fail "Failed to open the dictionary") return
dictKeys :: HistPL -> IO [Key]
dictKeys hpl = map parseKey <$> loadContents (dictPath hpl </> keyDir)
tryLoad :: HistPL -> Key -> IO (Maybe LexEntry)
tryLoad hpl key = unsafeInterleaveIO $ tryLoadEntry (dictPath hpl) key
load :: HistPL -> Key -> IO LexEntry
load hpl key = tryLoad hpl key >>= maybe
(fail $ "load: failed to open entry with the " ++ show key ++ " key")
return
dictIDs :: HistPL -> IO [T.Text]
dictIDs hpl = map T.pack <$> loadContents (dictPath hpl </> entryDir)
tryLoad' :: HistPL -> T.Text -> IO (Maybe LexEntry)
tryLoad' hpl i = unsafeInterleaveIO $ B.tryLoad (dictPath hpl </> entryDir) i
load' :: HistPL -> T.Text -> IO LexEntry
load' hpl i = tryLoad' hpl i >>= maybe
(fail $ "load': failed to load entry with the " ++ T.unpack i ++ " ID")
return
lookup :: HistPL -> T.Text -> IO [(LexEntry, Code)]
lookup hpl x = do
let lexSet = D.lookup x (formMap hpl)
sequence
[ ( , code) <$> load hpl key
| (key, code) <- getCode =<< M.assocs lexSet ]
where
getCode (key, val) =
[ (key { D.path = base }, code)
| (base, code) <- M.toList (D.forms val) ]
lookupMany :: HistPL -> [T.Text] -> IO [(LexEntry, Code)]
lookupMany hpl xs = do
let keyMap = M.fromListWith min $
getCode =<< M.assocs =<<
(flip D.lookup (formMap hpl) <$> xs)
sequence
[ ( , code) <$> load hpl key
| (key, code) <- M.toList keyMap ]
where
getCode (key, val) =
[ (key { D.path = base }, code)
| (base, code) <- M.toList (D.forms val) ]
build :: FilePath -> [(LexEntry, S.Set T.Text)] -> IO (HistPL)
build binPath xs = do
createDirectoryIfMissing True binPath
emptyDirectory binPath >>= \empty -> unless empty $ do
error $ "build: directory " ++ binPath ++ " is not empty"
createDirectory $ binPath </> entryDir
createDirectory $ binPath </> keyDir
formMap' <- D.fromList . concat <$>
LazyIO.mapM saveBin (zip3 keys entries forms)
encodeFile (binPath </> formFile) formMap'
return $ HistPL binPath formMap'
where
(entries, forms) = unzip xs
keys = getKeys entries
saveBin (key, lexEntry, otherForms) = do
saveEntry binPath key lexEntry
let D.Key{..} = key
histForms = S.fromList (Util.allForms lexEntry)
onlyHist = S.difference histForms otherForms
onlyOther = S.difference otherForms histForms
both = S.intersection histForms otherForms
list c s = [(y, uid, (), path, c) | y <- S.toList s]
return $ list Orig onlyHist ++ list Copy onlyOther ++ list Both both
loadAll :: HistPL -> IO [(Key, LexEntry)]
loadAll hpl = do
keys <- dictKeys hpl
LazyIO.forM keys $ \key -> do
entry <- load hpl key
return (key, entry)