module NLP.WordNet.Prims
(
initializeWordNet,
initializeWordNetWithOptions,
closeWordNet,
getIndexString,
getSynsetForSense,
readSynset,
indexToSenseKey,
indexLookup
)
where
import System.IO
import System.Environment
import Numeric (readHex, readDec)
import Data.Char (toLower, isSpace)
import Data.Array
import Control.Exception
import Control.Monad (when, liftM, mplus)
import Data.List (findIndex, find)
import Data.Maybe (isNothing, fromJust, isJust, fromMaybe)
import GHC.Handle
import NLP.WordNet.PrimTypes
import NLP.WordNet.Util
import NLP.WordNet.Consts
initializeWordNet :: IO WordNetEnv
initializeWordNet =
initializeWordNetWithOptions Nothing Nothing
initializeWordNetWithOptions ::
Maybe FilePath ->
Maybe (String -> Exception -> IO ()) ->
IO WordNetEnv
initializeWordNetWithOptions mSearchdir mWarn = do
searchdir <- case mSearchdir of { Nothing -> getDefaultSearchDir ; Just d -> return d }
let warn = fromMaybe (\s e -> hPutStrLn stderr (s ++ "\n" ++ show e)) mWarn
version <- tryMaybe $ getEnv "WNDBVERSION"
dHands <- mapM (\pos -> do
idxH <- openFileEx
(makePath [searchdir, "index." ++ partName pos])
(BinaryMode ReadMode)
dataH <- openFileEx
(makePath [searchdir, "data." ++ partName pos])
(BinaryMode ReadMode)
return (idxH, dataH)
) allPOS
sense <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.sense")
$ openFileEx (makePath [searchdir, "index.sense"]) (BinaryMode ReadMode)
cntlst <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file cntlist.rev")
$ openFileEx (makePath [searchdir, "cntlist.rev"]) (BinaryMode ReadMode)
keyidx <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key")
$ openFileEx (makePath [searchdir, "index.key" ]) (BinaryMode ReadMode)
rkeyidx <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key.rev")
$ openFileEx (makePath [searchdir, "index.key.rev"]) (BinaryMode ReadMode)
vsent <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open sentence files (sentidx.vrb and sents.vrb)")
$ do
idx <- openFileEx (makePath [searchdir, "sentidx.vrb"]) (BinaryMode ReadMode)
snt <- openFileEx (makePath [searchdir, "sents.vrb" ]) (BinaryMode ReadMode)
return (idx, snt)
mHands <- mapM (\pos -> openFileEx
(makePath [searchdir, partName pos ++ ".exc"])
(BinaryMode ReadMode)) allPOS
return $ WordNetEnv
{ dataHandles = listArray (Noun, Adv) dHands,
excHandles = listArray (Noun, Adv) mHands,
senseHandle = sense,
countListHandle = cntlst,
keyIndexHandle = keyidx,
revKeyIndexHandle = rkeyidx,
vSentHandle = vsent,
wnReleaseVersion = version,
dataDirectory = searchdir,
warnAbout = warn
}
where
getDefaultSearchDir = do
Just searchdir <- tryMaybe (getEnv "WNSEARCHDIR") >>= \m1 ->
tryMaybe (getEnv "WNHOME") >>= \m2 ->
return (m1 `mplus`
liftM (++dictDir) m2 `mplus`
Just defaultPath)
return searchdir
closeWordNet :: WordNetEnv -> IO ()
closeWordNet wne = do
mapM_ (\ (h1,h2) -> hClose h1 >> hClose h2) (elems (dataHandles wne))
mapM_ hClose (elems (excHandles wne))
mapM_ (\x -> when (isJust x) $ hClose (fromJust x))
[senseHandle wne, countListHandle wne, keyIndexHandle wne,
revKeyIndexHandle wne, liftM fst (vSentHandle wne), liftM snd (vSentHandle wne)]
getIndexString :: WordNetEnv -> String -> POS -> IO (Maybe String)
getIndexString wne str partOfSpeech = getIndexString' . cannonWNString $ str
where
getIndexString' [] = return Nothing
getIndexString' (s:ss) = do
i <- binarySearch (fst (dataHandles wne ! partOfSpeech)) s
if isJust i
then return (Just s)
else getIndexString' ss
getSynsetForSense :: WordNetEnv -> SenseKey -> IO (Maybe Synset)
getSynsetForSense wne _ | isNothing (senseHandle wne) = ioError $ userError "no sense dictionary"
getSynsetForSense wne key = do
l <- binarySearch
(fromJust $ senseHandle wne)
(senseKeyString key)
case l of
Nothing -> return Nothing
Just l -> do offset <- maybeRead $ takeWhile (not . isSpace) $
drop 1 $ dropWhile (not . isSpace) l
ss <- readSynset wne (senseKeyPOS key) offset (senseKeyWord key)
return (Just ss)
readSynset :: WordNetEnv -> POS -> Offset -> String -> IO Synset
readSynset wne searchPos offset w = do
let h = snd (dataHandles wne ! searchPos)
hSeek h AbsoluteSeek offset
toks <- liftM words $ hGetLine h
(ptrTokS:fnumS:posS:numWordsS:rest1) <- matchN 4 toks
hiam <- maybeRead ptrTokS
fn <- maybeRead fnumS
let ss1 = synset0 { hereIAm = hiam,
pos = readEPOS posS,
fnum = fn,
ssType = if readEPOS posS == Satellite then IndirectAnt else UnknownEPos
}
let numWords = case readHex numWordsS of
(n,_):_ -> n
_ -> 0
let (wrds,ptrCountS:rest2) = splitAt (numWords*2) rest1
let ptrCount =
case readDec ptrCountS of
(n,_):_ -> n
_ -> 0
wrds' <- readWords ss1 wrds
let ss2 = ss1 { ssWords = wrds',
whichWord = findIndex (==w) wrds }
let (ptrs,rest3) = splitAt (ptrCount*4) rest2
let (fp,ss3) = readPtrs (False,ss2) ptrs
let ss4 = if fp && searchPos == Adj && ssType ss3 == UnknownEPos
then ss3 { ssType = Pertainym }
else ss3
let (ss5,rest4) =
if searchPos /= Verb
then (ss4, rest3)
else let (fcountS:rest4) = rest3
(synPtrs, rest5) = splitAt (read fcountS * 3) rest4
in (ss4, rest5)
let ss6 = ss5 { defn = unwords $ drop 1 rest4 }
return ss6
where
readWords ss (w:lid:xs) = do
let s = map toLower $ replaceChar ' ' '_' w
idx <- indexLookup wne s (fromEPOS $ pos ss)
let posn = case idx of
Nothing -> Nothing
Just ix -> findIndex (==hereIAm ss) (indexOffsets ix)
rest <- readWords ss xs
return ((w, fst $ head $ readHex lid, maybe AllSenses SenseNumber posn) : rest)
readWords _ _ = return []
readPtrs (fp,ss) (typ:off:ppos:lexp:xs) =
let (fp',ss') = readPtrs (fp,ss) xs
this = (getPointerType typ,
read off,
readEPOS ppos,
fst $ head $ readHex (take 2 lexp),
fst $ head $ readHex (drop 2 lexp))
in if searchPos == Adj && ssType ss' == UnknownEPos
then if getPointerType typ == Antonym
then (fp' , ss' { forms = this : forms ss',
ssType = DirectAnt })
else (True, ss' { forms = this : forms ss' })
else (fp', ss' { forms = this : forms ss' })
readPtrs (fp,ss) _ = (fp,ss)
indexToSenseKey :: WordNetEnv -> Index -> Int -> IO (Maybe SenseKey)
indexToSenseKey wne idx sense = do
let cpos = fromEPOS $ indexPOS idx
ss1 <- readSynset wne cpos (indexOffsets idx !! (sense1)) ""
ss2 <- followSatellites ss1
case findIndex ((==indexWord idx) . map toLower) (map (\ (w,_,_) -> w) $ ssWords ss2) of
Nothing -> return Nothing
Just j -> do
let skey = if ssType ss2 == Satellite
then indexWord idx ++ "%" ++ show (fromEnum Satellite) ++ ":" ++
padTo 2 (show $ fnum ss2) ++ ":" ++ headWord ss2 ++ ":" ++
padTo 2 (show $ headSense ss2)
else indexWord idx ++ "%" ++ show (fromEnum $ pos ss2) ++ ":" ++
padTo 2 (show $ fnum ss2) ++ ":" ++
padTo 2 (show $ lexId ss2 j) ++ "::"
return (Just $ SenseKey cpos skey (indexWord idx))
where
followSatellites ss
| ssType ss == Satellite =
case find (\ (f,_,_,_,_) -> f == Similar) (forms ss) of
Nothing -> return ss
Just (f,offset,p,j,k) -> do
adjss <- readSynset wne (fromEPOS p) offset ""
case ssWords adjss of
(hw,_,hs):_ -> return (ss { headWord = map toLower hw,
headSense = hs })
_ -> return ss
| otherwise = return ss
indexLookup :: WordNetEnv -> String -> POS -> IO (Maybe Index)
indexLookup wne w pos = do
ml <- binarySearch (fst (dataHandles wne ! pos)) w
case ml of
Nothing -> return Nothing
Just l -> do
(wdS:posS:ccntS:pcntS:rest1) <- matchN 4 (words l)
isc <- maybeRead ccntS
pc <- maybeRead pcntS
let idx1 = index0 { indexWord = wdS,
indexPOS = readEPOS posS,
indexSenseCount = isc
}
let (ptrs,rest2) = splitAt pc rest1
let idx2 = idx1 { indexForms = map getPointerType ptrs }
(ocntS:tcntS:rest3) <- matchN 2 rest2
itc <- maybeRead tcntS
otc <- maybeRead ocntS
let idx3 = idx2 { indexTaggedCount = itc }
let (offsets,_) = splitAt otc rest3
io <- mapM maybeRead offsets
return (Just $ idx3 { indexOffsets = io })
binarySearch :: Handle -> String -> IO (Maybe String)
binarySearch h s = do
hSeek h SeekFromEnd 0
bot <- hTell h
binarySearch' 0 bot (bot `div` 2)
where
binarySearch' :: Integer -> Integer -> Integer -> IO (Maybe String)
binarySearch' top bot mid = do
hSeek h AbsoluteSeek (mid1)
when (mid /= 1) readUntilNL
eof <- hIsEOF h
if eof
then if top >= bot1
then return Nothing
else binarySearch' top (bot1) ((top+bot1) `div` 2)
else do
l <- hGetLine h
let key = takeWhile (/=' ') l
if key == s
then return (Just l)
else case (bot top) `div` 2 of
0 -> return Nothing
d -> case key `compare` s of
LT -> binarySearch' mid bot (mid + d)
GT -> binarySearch' top mid (top + d)
readUntilNL = do
eof <- hIsEOF h
if eof
then return ()
else do hGetLine h; return ()