-- | NLP.WordNet.Prims provides primitive operations over the word net database.
-- The general scheme of things is to call 'initializeWordNet' to get a 'WordNetEnv'.
-- Once you have this, you can start querying.  A query usually looks like (suppose
-- we want "Dog" as a Noun:
--
-- 'getIndexString' on "Dog".  This will give us back a cannonicalized string, in this
-- case, still "dog".  We then use 'indexLookup' to get back an index for this string.
-- Then, we call 'indexToSenseKey' to with the index and the sense number (the Index
-- contains the number of senses) to get back a SenseKey.  We finally call
-- 'getSynsetForSense' on the sense key to get back a Synset.
--
-- We can continue to query like this or we can use the offsets provided in the
-- various fields of the Synset to query directly on an offset.  Given an offset
-- and a part of speech, we can use 'readSynset' directly to get a synset (instead
-- of going through all this business with indices and sensekeys.
module NLP.WordNet.Prims
    (
     initializeWordNet,
     initializeWordNetWithOptions,
     closeWordNet,
     getIndexString,
     getSynsetForSense,
     readSynset,
     indexToSenseKey,
     indexLookup
    )
    where

import System.IO -- hiding (try, catch)
import System.Environment
import Numeric (readHex, readDec)
import Data.Char (toLower, isSpace)
import Data.Array
import Data.Foldable (forM_)
import Control.Exception
import Control.Monad (when, liftM, mplus, unless)
import Data.List (findIndex, find, elemIndex)
import Data.Maybe (isNothing, fromJust, isJust, fromMaybe)

import NLP.WordNet.PrimTypes
import NLP.WordNet.Util
import NLP.WordNet.Consts

-- | initializeWordNet looks for the word net data files in the
-- default directories, starting with environment variables WNSEARCHDIR
-- and WNHOME, and then falling back to 'defaultPath' as defined in
-- NLP.WordNet.Consts.
initializeWordNet :: IO WordNetEnv
initializeWordNet =
  initializeWordNetWithOptions Nothing Nothing


-- | initializeWordNetWithOptions looks for the word net data files in the
-- specified directory.  Use this if wordnet is installed in a non-standard
-- place on your machine and you don't have the appropriate env vars set up.
initializeWordNetWithOptions :: 
    Maybe FilePath ->                          -- word net data directory
    Maybe (String -> SomeException -> IO ()) ->    -- "warning" function (by default, warnings go to stderr)
      IO WordNetEnv
initializeWordNetWithOptions mSearchdir mWarn = do
  searchdir <- case mSearchdir of { Nothing -> getDefaultSearchDir ; Just d -> return d }
  let warn = fromMaybe (\s e -> hPutStrLn stderr (s ++ "\n" ++ show e)) mWarn
  version <- tryMaybe $ getEnv "WNDBVERSION"
  dHands <- mapM (\pos' -> do
                  idxH  <- openFileEx
                             (makePath [searchdir, "index." ++ partName pos'])
                             (BinaryMode ReadMode)
                  dataH <- openFileEx 
                             (makePath [searchdir, "data." ++ partName pos'])
                             (BinaryMode ReadMode)
                  return (idxH, dataH)
                 ) allPOS
  -- the following are unnecessary
  sense   <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.sense")
                        $ openFileEx (makePath [searchdir, "index.sense"]) (BinaryMode ReadMode)
  cntlst  <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file cntlist.rev")
                        $ openFileEx (makePath [searchdir, "cntlist.rev"]) (BinaryMode ReadMode)
  keyidx  <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key")
                        $ openFileEx (makePath [searchdir, "index.key"  ]) (BinaryMode ReadMode)
  rkeyidx <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key.rev")
                        $ openFileEx (makePath [searchdir, "index.key.rev"]) (BinaryMode ReadMode)
  vsent   <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open sentence files (sentidx.vrb and sents.vrb)")
                        $ do
               idx <- openFileEx (makePath [searchdir, "sentidx.vrb"]) (BinaryMode ReadMode)
               snt <- openFileEx (makePath [searchdir, "sents.vrb"  ]) (BinaryMode ReadMode)
               return (idx, snt)
  mHands <- mapM (\pos' -> openFileEx 
                            (makePath [searchdir, partName pos' ++ ".exc"])
                            (BinaryMode ReadMode)) allPOS
  return WordNetEnv 
              { dataHandles = listArray (Noun, Adv) dHands,
                excHandles  = listArray (Noun, Adv) mHands,
                senseHandle = sense,
                countListHandle = cntlst,
                keyIndexHandle = keyidx,
                revKeyIndexHandle = rkeyidx,
                vSentHandle = vsent,
                wnReleaseVersion = version,
                dataDirectory = searchdir,
                warnAbout = warn
              }
  where
    getDefaultSearchDir = do
      Just searchdir <- tryMaybe (getEnv "WNSEARCHDIR") >>= \m1 ->
                        tryMaybe (getEnv "WNHOME") >>= \m2 ->
                        return (m1 `mplus` 
                                liftM (++dictDir) m2 `mplus` 
                                Just defaultPath)
      return searchdir

-- | closeWordNet is not strictly necessary.  However, a 'WordNetEnv' tends to
-- hog a few Handles, so if you run out of Handles and won't be using
-- your WordNetEnv for a while, you can close it and always create a new
-- one later.
closeWordNet :: WordNetEnv -> IO ()
closeWordNet wne = do
  mapM_ (\ (h1,h2) -> hClose h1 >> hClose h2) (elems (dataHandles wne))
  mapM_ hClose (elems (excHandles wne))
  mapM_ (`forM_` hClose)
        [senseHandle wne, countListHandle wne, keyIndexHandle wne,
         revKeyIndexHandle wne, liftM fst (vSentHandle wne), liftM snd (vSentHandle wne)]

-- | getIndexString takes a string and a part of speech and tries to find
-- that string (or something like it) in the database.  It is essentially
-- a cannonicalization routine and should be used before querying the
-- database, to ensure that your string is in the right form.
getIndexString :: WordNetEnv -> String -> POS -> IO (Maybe String)
getIndexString wne str partOfSpeech = getIndexString' . cannonWNString $ str
  where
    getIndexString' [] = return Nothing
    getIndexString' (s:ss) = do
      i <- binarySearch (fst (dataHandles wne ! partOfSpeech)) s
      if isJust i
        then return (Just s)
        else getIndexString' ss

-- | getSynsetForSense takes a sensekey and finds the appropriate Synset.  SenseKeys can
-- be built using indexToSenseKey.
getSynsetForSense :: WordNetEnv -> SenseKey -> IO (Maybe Synset)
getSynsetForSense wne _ | isNothing (senseHandle wne) = ioError $ userError "no sense dictionary"
getSynsetForSense wne key' = do
  l <- binarySearch
         (fromJust $ senseHandle wne)
         (senseKeyString key') -- ++ " " ++ charForPOS (senseKeyPOS key))
  case l of
    Nothing -> return Nothing
    Just l' -> do offset <- maybeRead $ takeWhile (not . isSpace) $
                               drop 1 $ dropWhile (not . isSpace) l'
                  ss <- readSynset wne (senseKeyPOS key') offset (senseKeyWord key')
                  return (Just ss)

-- | readSynset takes a part of speech, and an offset (the offset can be found
-- in another Synset) and (perhaps) a word we're looking for (this is optional)
-- and will return its Synset.
readSynset :: WordNetEnv -> POS -> Offset -> String -> IO Synset
readSynset wne searchPos offset w = do
  let h = snd (dataHandles wne ! searchPos)
  hSeek h AbsoluteSeek offset
  toks <- liftM words $ hGetLine h
  --print toks
  (ptrTokS:fnumS:posS:numWordsS:rest1) <- matchN 4 toks
  hiam <- maybeRead ptrTokS
  fn   <- maybeRead fnumS
  let ss1 = synset0 { hereIAm = hiam,
                      pos = readEPOS posS,
                      fnum = fn,
                      ssType = if readEPOS posS == Satellite then IndirectAnt else UnknownEPos
                   }
  let numWords = case readHex numWordsS of
                   (n,_):_ -> n
                   _       -> 0
--read numWordsS
  let (wrds,ptrCountS:rest2) = splitAt (numWords*2) rest1  -- words and lexids
  let ptrCount = 
        case readDec ptrCountS of
          (n,_):_ -> n
          _       -> 0
--  print (toks, ptrCountS, ptrCount)
  wrds' <- readWords ss1 wrds
  let ss2 = ss1 { ssWords = wrds',
                  whichWord = elemIndex w wrds }
  let (ptrs,rest3) = splitAt (ptrCount*4) rest2
  let (fp,ss3) = readPtrs (False,ss2) ptrs
  let ss4 = if fp && searchPos == Adj && ssType ss3 == UnknownEPos
              then ss3 { ssType = Pertainym }
              else ss3
  let (ss5,rest4) = 
        if searchPos /= Verb 
          then (ss4, rest3) 
          else let (fcountS:_) = rest3
                   (_ , rest5) = splitAt (read fcountS * 3) rest4
               in  (ss4, rest5)

  let ss6 = ss5 { defn = unwords $ drop 1 rest4 }

  return ss6
  where
    readWords ss (w':lid:xs) = do
      let s = map toLower $ replaceChar ' ' '_' w'
      idx  <- indexLookup wne s (fromEPOS $ pos ss)
--      print (w,st,idx)
      let posn = case idx of
                   Nothing -> Nothing
                   Just ix -> elemIndex (hereIAm ss) (indexOffsets ix)
      rest <- readWords ss xs
      return ((w', fst $ head $ readHex lid, maybe AllSenses SenseNumber posn) : rest)
    readWords _ _ = return []
    readPtrs (fp,ss) (typ:off:ppos:lexp:xs) = 
      let (fp',ss') = readPtrs (fp,ss) xs
          this = (getPointerType typ,
                  read off,
                  readEPOS ppos,
                  fst $ head $ readHex (take 2 lexp),
                  fst $ head $ readHex (drop 2 lexp))
      in  if searchPos == Adj && ssType ss' == UnknownEPos
            then if getPointerType typ == Antonym
                   then (fp' , ss' { forms = this : forms ss',
                                     ssType   = DirectAnt })
                   else (True, ss' { forms = this : forms ss' })
            else (fp', ss' { forms = this : forms ss' })
    readPtrs (fp,ss) _ = (fp,ss)

-- | indexToSenseKey takes an Index (as returned by, ex., indexLookup) and a sense
-- number and returns a SenseKey for that sense.
indexToSenseKey :: WordNetEnv -> Index -> Int -> IO (Maybe SenseKey)
indexToSenseKey wne idx sense = do
  let cpos = fromEPOS $ indexPOS idx
  ss1 <- readSynset wne cpos (indexOffsets idx !! (sense-1)) ""
  ss2 <- followSatellites ss1
  --print ss2
  case findIndex ((==indexWord idx) . map toLower) (map (\ (w,_,_) -> w) $ ssWords ss2) of
    Nothing -> return Nothing
    Just  j -> do
      let skey = ((indexWord idx ++ "%") ++) (if ssType ss2 == Satellite
                   then show (fromEnum Satellite) ++ ":" ++
                        padTo 2 (show $ fnum ss2) ++ ":" ++ headWord ss2 ++ ":" ++
                        padTo 2 (show $ headSense ss2)
                   else show (fromEnum $ pos ss2) ++ ":" ++
                        padTo 2 (show $ fnum ss2) ++ ":" ++ 
                        padTo 2 (show $ lexId ss2 j) ++ "::")
      return (Just $ SenseKey cpos skey (indexWord idx))
  where
    followSatellites ss 
        | ssType ss == Satellite =
            case find (\ (f,_,_,_,_) -> f == Similar) (forms ss) of
              Nothing -> return ss
              Just (_,offset,p,_,_) -> do
                adjss <- readSynset wne (fromEPOS p) offset ""
                case ssWords adjss of
                  (hw,_,hs):_ -> return (ss { headWord  = map toLower hw,
                                              headSense = hs })
                  _ -> return ss
        | otherwise = return ss

-- indexLookup takes a word and part of speech and gives back its index.
indexLookup :: WordNetEnv -> String -> POS -> IO (Maybe Index)
indexLookup wne w pos' = do
  ml <- binarySearch (fst (dataHandles wne ! pos')) w
  case ml of
    Nothing -> return Nothing
    Just  l -> do
      (wdS:posS:ccntS:pcntS:rest1) <- matchN 4 (words l)
      isc <- maybeRead ccntS
      pc  <- maybeRead pcntS
      let idx1 = index0 { indexWord = wdS,
                          indexPOS  = readEPOS posS,
                          indexSenseCount = isc
                        }
      let (ptrs,rest2) = splitAt pc rest1
      let idx2 = idx1 { indexForms = map getPointerType ptrs }
      (ocntS:tcntS:rest3) <- matchN 2 rest2
      itc <- maybeRead tcntS
      otc <- maybeRead ocntS
      let idx3 = idx2 { indexTaggedCount = itc }
      let (offsets,_) = splitAt otc rest3
      io <- mapM maybeRead offsets
      return (Just $ idx3 { indexOffsets = io })

-- do binary search on an index file
binarySearch :: Handle -> String -> IO (Maybe String)
binarySearch h s = do
  hSeek h SeekFromEnd 0
  bot <- hTell h
  binarySearch' 0 bot (bot `div` 2)
  where
    binarySearch' :: Integer -> Integer -> Integer -> IO (Maybe String)
    binarySearch' top bot mid = do
      hSeek h AbsoluteSeek (mid-1)
      when (mid /= 1) readUntilNL
      eof <- hIsEOF h
      if eof 
        then if top >= bot-1 
               then return Nothing
               else binarySearch' top (bot-1) ((top+bot-1) `div` 2)
        else do
          l <- hGetLine h
          let key' = takeWhile (/=' ') l
          if key' == s 
            then return (Just l)
            else case (bot - top) `div` 2 of
                   0 -> return Nothing
                   d -> case key' `compare` s of
                          LT -> binarySearch' mid bot (mid + d)
                          GT -> binarySearch' top mid (top + d)
                          EQ -> undefined
    readUntilNL = do
      eof <- hIsEOF h
      unless eof $ do
        hGetLine h
        return ()