module NLP.HistPL
(
BinEntry (..)
, Key (..)
, proxyForm
, binKey
, Rule (..)
, between
, apply
, HistPL
, tryOpen
, open
, lookup
, lookupBin
, getIndex
, withKey
, save
, load
, module NLP.HistPL.Types
) where
import Prelude hiding (lookup)
import Control.Exception (try, SomeException)
import Control.Applicative (Applicative, (<$>), (<*>))
import Control.Monad (when, guard)
import Control.Monad.IO.Class (liftIO, MonadIO)
import Control.Monad.Reader (lift)
import Control.Monad.Trans.Maybe (MaybeT (..))
import System.IO.Unsafe (unsafeInterleaveIO)
import System.FilePath ((</>))
import System.Directory ( getDirectoryContents, createDirectoryIfMissing
, createDirectory, doesDirectoryExist )
import Data.Maybe (catMaybes)
import Data.List (mapAccumL)
import Data.Binary (Binary, get, put, encodeFile, decodeFile)
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.DAWG.Dynamic as DD
import qualified Data.DAWG.Static as D
import NLP.HistPL.Types
import qualified NLP.HistPL.Util as Util
type DAWG a = D.DAWG Char () a
entryDir :: String
entryDir = "entries"
formMapFile :: String
formMapFile = "forms.bin"
data BinEntry = BinEntry {
lexEntry :: LexEntry
, uid :: Int }
deriving (Show, Eq, Ord)
instance Binary BinEntry where
put BinEntry{..} = put lexEntry >> put uid
get = BinEntry <$> get <*> get
data Key = Key {
keyForm :: T.Text
, keyUid :: Int }
deriving (Show, Eq, Ord)
proxyForm :: LexEntry -> T.Text
proxyForm entry = case Util.allForms entry of
(x:_) -> x
[] -> error "proxyForm: entry with no forms"
binKey :: BinEntry -> Key
binKey BinEntry{..} = Key (proxyForm lexEntry) uid
showKey :: Key -> String
showKey Key{..} = (T.unpack . T.concat) [T.pack (show keyUid), "-", keyForm]
parseKey :: String -> Key
parseKey x =
let (uid'S, (_:form'S)) = break (=='-') x
in Key (T.pack form'S) (read uid'S)
loadContents :: FilePath -> IO [FilePath]
loadContents path = do
xs <- getDirectoryContents path
return [x | x <- xs, x /= ".", x /= ".."]
emptyDirectory :: FilePath -> IO Bool
emptyDirectory path = null <$> loadContents path
saveLexEntry :: FilePath -> BinEntry -> IO ()
saveLexEntry path x =
let binPath = showKey . binKey
in encodeFile (path </> binPath x) x
withUid :: DD.DAWG Char Int -> LexEntry -> (DD.DAWG Char Int, BinEntry)
withUid m x =
let path = T.unpack (proxyForm x)
num = maybe 0 id (DD.lookup path m) + 1
in (DD.insert path num m, BinEntry x num)
withUids :: [LexEntry] -> [BinEntry]
withUids = snd . mapAccumL withUid DD.empty
mapIO'Lazy :: (a -> IO b) -> [a] -> IO [b]
mapIO'Lazy f (x:xs) = (:) <$> f x <*> unsafeInterleaveIO (mapIO'Lazy f xs)
mapIO'Lazy _ [] = return []
save :: FilePath -> [LexEntry] -> IO ()
save path xs = do
createDirectoryIfMissing True path
isEmpty <- emptyDirectory path
when (not isEmpty) $ do
error $ "save: directory " ++ path ++ " is not empty"
let lexPath = path </> entryDir
createDirectory lexPath
formMap' <- D.fromListWith S.union . concat
<$> mapIO'Lazy (saveLex lexPath) (withUids xs)
encodeFile (path </> formMapFile) formMap'
where
saveLex lexPath x = do
saveLexEntry lexPath x
return $ rules x
rules binEntry =
[ ( T.unpack x
, S.singleton (between x key) )
| x <- Util.allForms (lexEntry binEntry) ]
where
key = binKey binEntry
load :: FilePath -> IO (Maybe [BinEntry])
load path = runMaybeT $ do
hpl <- MaybeT $ tryOpen path
lift $ do
keys <- getIndex hpl
catMaybes <$> mapM (withKey hpl) keys
maybeErr :: MonadIO m => IO a -> m (Maybe a)
maybeErr io = do
r <- liftIO (try io)
case r of
Left (_e :: SomeException) -> return Nothing
Right x -> return (Just x)
maybeT :: Monad m => Maybe a -> MaybeT m a
maybeT = MaybeT . return
maybeErrT :: MonadIO m => IO a -> MaybeT m a
maybeErrT io = do
r <- liftIO (maybeErr io)
maybeT r
loadLexEntry :: FilePath -> Key -> IO (Maybe BinEntry)
loadLexEntry path key = do
maybeErr $ decodeFile (path </> showKey key)
data Rule = Rule {
cut :: !Int
, suffix :: !T.Text
, ruleUid :: !Int }
deriving (Show, Eq, Ord)
instance Binary Rule where
put Rule{..} = put cut >> put suffix >> put ruleUid
get = Rule <$> get <*> get <*> get
apply :: Rule -> T.Text -> Key
apply r x =
let y = T.take (T.length x cut r) x `T.append` suffix r
in Key y (ruleUid r)
between :: T.Text -> Key -> Rule
between source dest =
let k = lcp source (keyForm dest)
in Rule (T.length source k) (T.drop k (keyForm dest)) (keyUid dest)
where
lcp a b = case T.commonPrefixes a b of
Just (c, _, _) -> T.length c
Nothing -> 0
data HistPL = HistPL {
dictPath :: FilePath
, formMap :: DAWG (S.Set Rule)
}
entryPath :: HistPL -> FilePath
entryPath = (</> entryDir) . dictPath
tryOpen :: FilePath -> IO (Maybe HistPL)
tryOpen path = runMaybeT $ do
formMap' <- maybeErrT $ decodeFile (path </> formMapFile)
doesExist <- liftIO $ doesDirectoryExist (path </> entryDir)
guard doesExist
return $ HistPL path formMap'
open :: FilePath -> IO HistPL
open path = tryOpen path >>=
maybe (fail "Failed to open the dictionary") return
getIndex :: HistPL -> IO [Key]
getIndex hpl = map parseKey <$> loadContents (entryPath hpl)
withKey :: HistPL -> Key -> IO (Maybe BinEntry)
withKey hpl key = unsafeInterleaveIO $ loadLexEntry (entryPath hpl) key
lookup :: HistPL -> T.Text -> IO [LexEntry]
lookup hpl = fmap (map lexEntry) . lookupBin hpl
lookupBin :: HistPL -> T.Text -> IO [BinEntry]
lookupBin hpl x = do
let keys = case D.lookup (T.unpack x) (formMap hpl) of
Nothing -> []
Just xs -> map (flip apply x) (S.toList xs)
catMaybes <$> mapM (withKey hpl) keys