module NLP.HistPL.Lexicon
(
HistPL
, Code (..)
, Key
, UID
, tryOpen
, open
, lookup
, lookupMany
, getIndex
, tryWithKey
, withKey
, save
, load
, module NLP.HistPL.Types
) where
import Prelude hiding (lookup)
import Control.Exception (try, SomeException)
import Control.Applicative (Applicative, (<$>), (<*>))
import Control.Monad (when, guard)
import Control.Monad.IO.Class (liftIO, MonadIO)
import Control.Monad.Trans.Maybe (MaybeT (..))
import System.IO.Unsafe (unsafeInterleaveIO)
import System.FilePath ((</>))
import System.Directory ( getDirectoryContents, createDirectoryIfMissing
, createDirectory, doesDirectoryExist )
import Data.List (mapAccumL)
import Data.Binary (Binary, put, get, encodeFile, decodeFile)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.DAWG.Dynamic as DD
import qualified NLP.HistPL.Dict as D
import NLP.HistPL.Types
import qualified NLP.HistPL.Util as Util
entryDir :: String
entryDir = "entries"
formMapFile :: String
formMapFile = "forms.bin"
type Key = D.Key UID
type UID = Int
proxy :: LexEntry -> T.Text
proxy entry = case Util.allForms entry of
(x:_) -> x
[] -> error "proxy: entry with no forms"
showKey :: Key -> String
showKey D.Key{..} = (T.unpack . T.concat) [T.pack (show uid), "-", path]
parseKey :: String -> Key
parseKey x =
let (uid'S, (_:form'S)) = break (=='-') x
in D.Key (T.pack form'S) (read uid'S)
loadContents :: FilePath -> IO [FilePath]
loadContents path = do
xs <- getDirectoryContents path
return [x | x <- xs, x /= ".", x /= ".."]
emptyDirectory :: FilePath -> IO Bool
emptyDirectory path = null <$> loadContents path
saveEntry :: FilePath -> Key -> LexEntry -> IO ()
saveEntry path x y = encodeFile (path </> showKey x) y
getKey :: DD.DAWG Char Int -> LexEntry -> (DD.DAWG Char Int, Key)
getKey m x =
let main = proxy x
path = T.unpack main
num = maybe 0 id (DD.lookup path m) + 1
key = D.Key main num
in (DD.insert path num m, key)
getKeys :: [LexEntry] -> [Key]
getKeys = snd . mapAccumL getKey DD.empty
mapIO'Lazy :: (a -> IO b) -> [a] -> IO [b]
mapIO'Lazy f (x:xs) = (:) <$> f x <*> unsafeInterleaveIO (mapIO'Lazy f xs)
mapIO'Lazy _ [] = return []
forIO'Lazy :: [a] -> (a -> IO b) -> IO [b]
forIO'Lazy = flip mapIO'Lazy
maybeErr :: MonadIO m => IO a -> m (Maybe a)
maybeErr io = do
r <- liftIO (try io)
case r of
Left (_e :: SomeException) -> return Nothing
Right x -> return (Just x)
maybeT :: Monad m => Maybe a -> MaybeT m a
maybeT = MaybeT . return
maybeErrT :: MonadIO m => IO a -> MaybeT m a
maybeErrT io = do
r <- liftIO (maybeErr io)
maybeT r
loadEntry :: FilePath -> Key -> IO (Maybe LexEntry)
loadEntry path key = do
maybeErr $ decodeFile (path </> showKey key)
data HistPL = HistPL {
dictPath :: FilePath
, formMap :: D.Dict UID () Code
}
data Code
= Orig
| Both
| Copy
deriving (Show, Eq, Ord)
instance Binary Code where
put Orig = put '1'
put Copy = put '2'
put Both = put '3'
get = get >>= \x -> return $ case x of
'1' -> Orig
'2' -> Copy
'3' -> Both
c -> error $ "get: invalid Code value '" ++ [c] ++ "'"
entryPath :: HistPL -> FilePath
entryPath = (</> entryDir) . dictPath
tryOpen :: FilePath -> IO (Maybe HistPL)
tryOpen path = runMaybeT $ do
formMap' <- maybeErrT $ decodeFile (path </> formMapFile)
doesExist <- liftIO $ doesDirectoryExist (path </> entryDir)
guard doesExist
return $ HistPL path formMap'
open :: FilePath -> IO HistPL
open path = tryOpen path >>=
maybe (fail "Failed to open the dictionary") return
getIndex :: HistPL -> IO [Key]
getIndex hpl = map parseKey <$> loadContents (entryPath hpl)
tryWithKey :: HistPL -> Key -> IO (Maybe LexEntry)
tryWithKey hpl key = unsafeInterleaveIO $ loadEntry (entryPath hpl) key
withKey :: HistPL -> Key -> IO LexEntry
withKey hpl key = tryWithKey hpl key >>= maybe
(fail $ "Failed to open entry with the " ++ show key ++ " key") return
lookup :: HistPL -> T.Text -> IO [(LexEntry, Code)]
lookup hpl x = do
let lexSet = D.lookup x (formMap hpl)
sequence
[ ( , code) <$> withKey hpl key
| (key, code) <- getCode =<< M.assocs lexSet ]
where
getCode (key, val) =
[ (key { D.path = base }, code)
| (base, code) <- M.toList (D.forms val) ]
lookupMany :: HistPL -> [T.Text] -> IO [(LexEntry, Code)]
lookupMany hpl xs = do
let keyMap = M.fromListWith min $
getCode =<< M.assocs =<<
(flip D.lookup (formMap hpl) <$> xs)
sequence
[ ( , code) <$> withKey hpl key
| (key, code) <- M.toList keyMap ]
where
getCode (key, val) =
[ (key { D.path = base }, code)
| (base, code) <- M.toList (D.forms val) ]
save :: FilePath -> [(LexEntry, S.Set T.Text)] -> IO (HistPL)
save binPath xs = do
createDirectoryIfMissing True binPath
isEmpty <- emptyDirectory binPath
when (not isEmpty) $ do
error $ "save: directory " ++ binPath ++ " is not empty"
let lexPath = binPath </> entryDir
createDirectory lexPath
formMap' <- D.fromList . concat <$>
mapIO'Lazy (saveBin lexPath) (zip3 keys entries forms)
encodeFile (binPath </> formMapFile) formMap'
return $ HistPL binPath formMap'
where
(entries, forms) = unzip xs
keys = getKeys entries
saveBin lexPath (key, lexEntry, otherForms) = do
saveEntry lexPath key lexEntry
let D.Key{..} = key
histForms = S.fromList (Util.allForms lexEntry)
onlyHist = S.difference histForms otherForms
onlyOther = S.difference otherForms histForms
both = S.intersection histForms otherForms
list c s = [(y, uid, (), path, c) | y <- S.toList s]
return $ list Orig onlyHist ++ list Copy onlyOther ++ list Both both
load :: HistPL -> IO [(Key, LexEntry)]
load hpl = do
keys <- getIndex hpl
forIO'Lazy keys $ \key -> do
entry <- withKey hpl key
return (key, entry)