module NLP.WordNet.Prims
(
initializeWordNet,
initializeWordNetWithOptions,
closeWordNet,
getIndexString,
getSynsetForSense,
readSynset,
indexToSenseKey,
indexLookup
)
where
import System.IO
import System.Environment
import Numeric (readHex, readDec)
import Data.Char (toLower, isSpace)
import Data.Array
import Data.Foldable (forM_)
import Control.Exception
import Control.Monad (when, liftM, mplus, unless)
import Data.List (findIndex, find, elemIndex)
import Data.Maybe (isNothing, fromJust, isJust, fromMaybe)
import NLP.WordNet.PrimTypes
import NLP.WordNet.Util
import NLP.WordNet.Consts
initializeWordNet :: IO WordNetEnv
initializeWordNet =
initializeWordNetWithOptions Nothing Nothing
initializeWordNetWithOptions ::
Maybe FilePath ->
Maybe (String -> SomeException -> IO ()) ->
IO WordNetEnv
initializeWordNetWithOptions mSearchdir mWarn = do
searchdir <- case mSearchdir of { Nothing -> getDefaultSearchDir ; Just d -> return d }
let warn = fromMaybe (\s e -> hPutStrLn stderr (s ++ "\n" ++ show e)) mWarn
version <- tryMaybe $ getEnv "WNDBVERSION"
dHands <- mapM (\pos' -> do
idxH <- openFileEx
(makePath [searchdir, "index." ++ partName pos'])
(BinaryMode ReadMode)
dataH <- openFileEx
(makePath [searchdir, "data." ++ partName pos'])
(BinaryMode ReadMode)
return (idxH, dataH)
) allPOS
sense <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.sense")
$ openFileEx (makePath [searchdir, "index.sense"]) (BinaryMode ReadMode)
cntlst <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file cntlist.rev")
$ openFileEx (makePath [searchdir, "cntlist.rev"]) (BinaryMode ReadMode)
keyidx <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key")
$ openFileEx (makePath [searchdir, "index.key" ]) (BinaryMode ReadMode)
rkeyidx <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key.rev")
$ openFileEx (makePath [searchdir, "index.key.rev"]) (BinaryMode ReadMode)
vsent <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open sentence files (sentidx.vrb and sents.vrb)")
$ do
idx <- openFileEx (makePath [searchdir, "sentidx.vrb"]) (BinaryMode ReadMode)
snt <- openFileEx (makePath [searchdir, "sents.vrb" ]) (BinaryMode ReadMode)
return (idx, snt)
mHands <- mapM (\pos' -> openFileEx
(makePath [searchdir, partName pos' ++ ".exc"])
(BinaryMode ReadMode)) allPOS
return WordNetEnv
{ dataHandles = listArray (Noun, Adv) dHands,
excHandles = listArray (Noun, Adv) mHands,
senseHandle = sense,
countListHandle = cntlst,
keyIndexHandle = keyidx,
revKeyIndexHandle = rkeyidx,
vSentHandle = vsent,
wnReleaseVersion = version,
dataDirectory = searchdir,
warnAbout = warn
}
where
getDefaultSearchDir = do
Just searchdir <- tryMaybe (getEnv "WNSEARCHDIR") >>= \m1 ->
tryMaybe (getEnv "WNHOME") >>= \m2 ->
return (m1 `mplus`
liftM (++dictDir) m2 `mplus`
Just defaultPath)
return searchdir
closeWordNet :: WordNetEnv -> IO ()
closeWordNet wne = do
mapM_ (\ (h1,h2) -> hClose h1 >> hClose h2) (elems (dataHandles wne))
mapM_ hClose (elems (excHandles wne))
mapM_ (`forM_` hClose)
[senseHandle wne, countListHandle wne, keyIndexHandle wne,
revKeyIndexHandle wne, liftM fst (vSentHandle wne), liftM snd (vSentHandle wne)]
getIndexString :: WordNetEnv -> String -> POS -> IO (Maybe String)
getIndexString wne str partOfSpeech = getIndexString' . cannonWNString $ str
where
getIndexString' [] = return Nothing
getIndexString' (s:ss) = do
i <- binarySearch (fst (dataHandles wne ! partOfSpeech)) s
if isJust i
then return (Just s)
else getIndexString' ss
getSynsetForSense :: WordNetEnv -> SenseKey -> IO (Maybe Synset)
getSynsetForSense wne _ | isNothing (senseHandle wne) = ioError $ userError "no sense dictionary"
getSynsetForSense wne key' = do
l <- binarySearch
(fromJust $ senseHandle wne)
(senseKeyString key')
case l of
Nothing -> return Nothing
Just l' -> do offset <- maybeRead $ takeWhile (not . isSpace) $
drop 1 $ dropWhile (not . isSpace) l'
ss <- readSynset wne (senseKeyPOS key') offset (senseKeyWord key')
return (Just ss)
readSynset :: WordNetEnv -> POS -> Offset -> String -> IO Synset
readSynset wne searchPos offset w = do
let h = snd (dataHandles wne ! searchPos)
hSeek h AbsoluteSeek offset
toks <- liftM words $ hGetLine h
(ptrTokS:fnumS:posS:numWordsS:rest1) <- matchN 4 toks
hiam <- maybeRead ptrTokS
fn <- maybeRead fnumS
let ss1 = synset0 { hereIAm = hiam,
pos = readEPOS posS,
fnum = fn,
ssType = if readEPOS posS == Satellite then IndirectAnt else UnknownEPos
}
let numWords = case readHex numWordsS of
(n,_):_ -> n
_ -> 0
let (wrds,ptrCountS:rest2) = splitAt (numWords*2) rest1
let ptrCount =
case readDec ptrCountS of
(n,_):_ -> n
_ -> 0
wrds' <- readWords ss1 wrds
let ss2 = ss1 { ssWords = wrds',
whichWord = elemIndex w wrds }
let (ptrs,rest3) = splitAt (ptrCount*4) rest2
let (fp,ss3) = readPtrs (False,ss2) ptrs
let ss4 = if fp && searchPos == Adj && ssType ss3 == UnknownEPos
then ss3 { ssType = Pertainym }
else ss3
let (ss5,rest4) =
if searchPos /= Verb
then (ss4, rest3)
else let (fcountS:_) = rest3
(_ , rest5) = splitAt (read fcountS * 3) rest4
in (ss4, rest5)
let ss6 = ss5 { defn = unwords $ drop 1 rest4 }
return ss6
where
readWords ss (w':lid:xs) = do
let s = map toLower $ replaceChar ' ' '_' w'
idx <- indexLookup wne s (fromEPOS $ pos ss)
let posn = case idx of
Nothing -> Nothing
Just ix -> elemIndex (hereIAm ss) (indexOffsets ix)
rest <- readWords ss xs
return ((w', fst $ head $ readHex lid, maybe AllSenses SenseNumber posn) : rest)
readWords _ _ = return []
readPtrs (fp,ss) (typ:off:ppos:lexp:xs) =
let (fp',ss') = readPtrs (fp,ss) xs
this = (getPointerType typ,
read off,
readEPOS ppos,
fst $ head $ readHex (take 2 lexp),
fst $ head $ readHex (drop 2 lexp))
in if searchPos == Adj && ssType ss' == UnknownEPos
then if getPointerType typ == Antonym
then (fp' , ss' { forms = this : forms ss',
ssType = DirectAnt })
else (True, ss' { forms = this : forms ss' })
else (fp', ss' { forms = this : forms ss' })
readPtrs (fp,ss) _ = (fp,ss)
indexToSenseKey :: WordNetEnv -> Index -> Int -> IO (Maybe SenseKey)
indexToSenseKey wne idx sense = do
let cpos = fromEPOS $ indexPOS idx
ss1 <- readSynset wne cpos (indexOffsets idx !! (sense1)) ""
ss2 <- followSatellites ss1
case findIndex ((==indexWord idx) . map toLower) (map (\ (w,_,_) -> w) $ ssWords ss2) of
Nothing -> return Nothing
Just j -> do
let skey = ((indexWord idx ++ "%") ++) (if ssType ss2 == Satellite
then show (fromEnum Satellite) ++ ":" ++
padTo 2 (show $ fnum ss2) ++ ":" ++ headWord ss2 ++ ":" ++
padTo 2 (show $ headSense ss2)
else show (fromEnum $ pos ss2) ++ ":" ++
padTo 2 (show $ fnum ss2) ++ ":" ++
padTo 2 (show $ lexId ss2 j) ++ "::")
return (Just $ SenseKey cpos skey (indexWord idx))
where
followSatellites ss
| ssType ss == Satellite =
case find (\ (f,_,_,_,_) -> f == Similar) (forms ss) of
Nothing -> return ss
Just (_,offset,p,_,_) -> do
adjss <- readSynset wne (fromEPOS p) offset ""
case ssWords adjss of
(hw,_,hs):_ -> return (ss { headWord = map toLower hw,
headSense = hs })
_ -> return ss
| otherwise = return ss
indexLookup :: WordNetEnv -> String -> POS -> IO (Maybe Index)
indexLookup wne w pos' = do
ml <- binarySearch (fst (dataHandles wne ! pos')) w
case ml of
Nothing -> return Nothing
Just l -> do
(wdS:posS:ccntS:pcntS:rest1) <- matchN 4 (words l)
isc <- maybeRead ccntS
pc <- maybeRead pcntS
let idx1 = index0 { indexWord = wdS,
indexPOS = readEPOS posS,
indexSenseCount = isc
}
let (ptrs,rest2) = splitAt pc rest1
let idx2 = idx1 { indexForms = map getPointerType ptrs }
(ocntS:tcntS:rest3) <- matchN 2 rest2
itc <- maybeRead tcntS
otc <- maybeRead ocntS
let idx3 = idx2 { indexTaggedCount = itc }
let (offsets,_) = splitAt otc rest3
io <- mapM maybeRead offsets
return (Just $ idx3 { indexOffsets = io })
binarySearch :: Handle -> String -> IO (Maybe String)
binarySearch h s = do
hSeek h SeekFromEnd 0
bot <- hTell h
binarySearch' 0 bot (bot `div` 2)
where
binarySearch' :: Integer -> Integer -> Integer -> IO (Maybe String)
binarySearch' top bot mid = do
hSeek h AbsoluteSeek (mid1)
when (mid /= 1) readUntilNL
eof <- hIsEOF h
if eof
then if top >= bot1
then return Nothing
else binarySearch' top (bot1) ((top+bot1) `div` 2)
else do
l <- hGetLine h
let key' = takeWhile (/=' ') l
if key' == s
then return (Just l)
else case (bot top) `div` 2 of
0 -> return Nothing
d -> case key' `compare` s of
LT -> binarySearch' mid bot (mid + d)
GT -> binarySearch' top mid (top + d)
EQ -> undefined
readUntilNL = do
eof <- hIsEOF h
unless eof $ do
hGetLine h
return ()