-- | NLP.WordNet.Prims provides primitive operations over the word net database.
-- The general scheme of things is to call 'initializeWordNet' to get a 'WordNetEnv'.
-- Once you have this, you can start querying.  A query usually looks like (suppose
-- we want "Dog" as a Noun:
--
-- 'getIndexString' on "Dog".  This will give us back a cannonicalized string, in this
-- case, still "dog".  We then use 'indexLookup' to get back an index for this string.
-- Then, we call 'indexToSenseKey' to with the index and the sense number (the Index
-- contains the number of senses) to get back a SenseKey.  We finally call
-- 'getSynsetForSense' on the sense key to get back a Synset.
--
-- We can continue to query like this or we can use the offsets provided in the
-- various fields of the Synset to query directly on an offset.  Given an offset
-- and a part of speech, we can use 'readSynset' directly to get a synset (instead
-- of going through all this business with indices and sensekeys.
module NLP.WordNet.Prims
    (
     initializeWordNet,
     initializeWordNetWithOptions,
     closeWordNet,
     getIndexString,
     getSynsetForSense,
     readSynset,
     indexToSenseKey,
     indexLookup
    )
    where

import System.IO -- hiding (try, catch)
import System.Environment
import Numeric (readHex, readDec)
import Data.Char (toLower, isSpace)
import Data.Array
import Control.Exception
import Control.Monad (when, liftM, mplus)
import Data.List (findIndex, find)
import Data.Maybe (isNothing, fromJust, isJust, fromMaybe)
import GHC.Handle -- (openFileEx, BinaryMode(..))

import NLP.WordNet.PrimTypes
import NLP.WordNet.Util
import NLP.WordNet.Consts

-- | initializeWordNet looks for the word net data files in the
-- default directories, starting with environment variables WNSEARCHDIR
-- and WNHOME, and then falling back to 'defaultPath' as defined in
-- NLP.WordNet.Consts.
initializeWordNet :: IO WordNetEnv
initializeWordNet =
  initializeWordNetWithOptions Nothing Nothing


-- | initializeWordNetWithOptions looks for the word net data files in the
-- specified directory.  Use this if wordnet is installed in a non-standard
-- place on your machine and you don't have the appropriate env vars set up.
initializeWordNetWithOptions :: 
    Maybe FilePath ->                          -- word net data directory
    Maybe (String -> Exception -> IO ()) ->    -- "warning" function (by default, warnings go to stderr)
      IO WordNetEnv
initializeWordNetWithOptions mSearchdir mWarn = do
  searchdir <- case mSearchdir of { Nothing -> getDefaultSearchDir ; Just d -> return d }
  let warn = fromMaybe (\s e -> hPutStrLn stderr (s ++ "\n" ++ show e)) mWarn
  version <- tryMaybe $ getEnv "WNDBVERSION"
  dHands <- mapM (\pos -> do
                  idxH  <- openFileEx
                             (makePath [searchdir, "index." ++ partName pos])
                             (BinaryMode ReadMode)
                  dataH <- openFileEx 
                             (makePath [searchdir, "data." ++ partName pos])
                             (BinaryMode ReadMode)
                  return (idxH, dataH)
                 ) allPOS
  -- the following are unnecessary
  sense   <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.sense")
                        $ openFileEx (makePath [searchdir, "index.sense"]) (BinaryMode ReadMode)
  cntlst  <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file cntlist.rev")
                        $ openFileEx (makePath [searchdir, "cntlist.rev"]) (BinaryMode ReadMode)
  keyidx  <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key")
                        $ openFileEx (makePath [searchdir, "index.key"  ]) (BinaryMode ReadMode)
  rkeyidx <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open file index.key.rev")
                        $ openFileEx (makePath [searchdir, "index.key.rev"]) (BinaryMode ReadMode)
  vsent   <- tryMaybeWarn (warn "Warning: initializeWordNet: cannot open sentence files (sentidx.vrb and sents.vrb)")
                        $ do
               idx <- openFileEx (makePath [searchdir, "sentidx.vrb"]) (BinaryMode ReadMode)
               snt <- openFileEx (makePath [searchdir, "sents.vrb"  ]) (BinaryMode ReadMode)
               return (idx, snt)
  mHands <- mapM (\pos -> openFileEx 
                            (makePath [searchdir, partName pos ++ ".exc"])
                            (BinaryMode ReadMode)) allPOS
  return $ WordNetEnv 
              { dataHandles = listArray (Noun, Adv) dHands,
                excHandles  = listArray (Noun, Adv) mHands,
                senseHandle = sense,
                countListHandle = cntlst,
                keyIndexHandle = keyidx,
                revKeyIndexHandle = rkeyidx,
                vSentHandle = vsent,
                wnReleaseVersion = version,
                dataDirectory = searchdir,
                warnAbout = warn
              }
  where
    getDefaultSearchDir = do
      Just searchdir <- tryMaybe (getEnv "WNSEARCHDIR") >>= \m1 ->
                        tryMaybe (getEnv "WNHOME") >>= \m2 ->
                        return (m1 `mplus` 
                                liftM (++dictDir) m2 `mplus` 
                                Just defaultPath)
      return searchdir

-- | closeWordNet is not strictly necessary.  However, a 'WordNetEnv' tends to
-- hog a few Handles, so if you run out of Handles and won't be using
-- your WordNetEnv for a while, you can close it and always create a new
-- one later.
closeWordNet :: WordNetEnv -> IO ()
closeWordNet wne = do
  mapM_ (\ (h1,h2) -> hClose h1 >> hClose h2) (elems (dataHandles wne))
  mapM_ hClose (elems (excHandles wne))
  mapM_ (\x -> when (isJust x) $ hClose (fromJust x))
        [senseHandle wne, countListHandle wne, keyIndexHandle wne,
         revKeyIndexHandle wne, liftM fst (vSentHandle wne), liftM snd (vSentHandle wne)]

-- | getIndexString takes a string and a part of speech and tries to find
-- that string (or something like it) in the database.  It is essentially
-- a cannonicalization routine and should be used before querying the
-- database, to ensure that your string is in the right form.
getIndexString :: WordNetEnv -> String -> POS -> IO (Maybe String)
getIndexString wne str partOfSpeech = getIndexString' . cannonWNString $ str
  where
    getIndexString' [] = return Nothing
    getIndexString' (s:ss) = do
      i <- binarySearch (fst (dataHandles wne ! partOfSpeech)) s
      if isJust i
        then return (Just s)
        else getIndexString' ss

-- | getSynsetForSense takes a sensekey and finds the appropriate Synset.  SenseKeys can
-- be built using indexToSenseKey.
getSynsetForSense :: WordNetEnv -> SenseKey -> IO (Maybe Synset)
getSynsetForSense wne _ | isNothing (senseHandle wne) = ioError $ userError "no sense dictionary"
getSynsetForSense wne key = do
  l <- binarySearch
         (fromJust $ senseHandle wne)
         (senseKeyString key) -- ++ " " ++ charForPOS (senseKeyPOS key))
  case l of
    Nothing -> return Nothing
    Just l  -> do offset <- maybeRead $ takeWhile (not . isSpace) $
                               drop 1 $ dropWhile (not . isSpace) l
                  ss <- readSynset wne (senseKeyPOS key) offset (senseKeyWord key)
                  return (Just ss)

-- | readSynset takes a part of speech, and an offset (the offset can be found
-- in another Synset) and (perhaps) a word we're looking for (this is optional)
-- and will return its Synset.
readSynset :: WordNetEnv -> POS -> Offset -> String -> IO Synset
readSynset wne searchPos offset w = do
  let h = snd (dataHandles wne ! searchPos)
  hSeek h AbsoluteSeek offset
  toks <- liftM words $ hGetLine h
  --print toks
  (ptrTokS:fnumS:posS:numWordsS:rest1) <- matchN 4 toks
  hiam <- maybeRead ptrTokS
  fn   <- maybeRead fnumS
  let ss1 = synset0 { hereIAm = hiam,
                      pos = readEPOS posS,
                      fnum = fn,
                      ssType = if readEPOS posS == Satellite then IndirectAnt else UnknownEPos
                   }
  let numWords = case readHex numWordsS of
                   (n,_):_ -> n
                   _       -> 0
--read numWordsS
  let (wrds,ptrCountS:rest2) = splitAt (numWords*2) rest1  -- words and lexids
  let ptrCount = 
        case readDec ptrCountS of
          (n,_):_ -> n
          _       -> 0
--  print (toks, ptrCountS, ptrCount)
  wrds' <- readWords ss1 wrds
  let ss2 = ss1 { ssWords = wrds',
                  whichWord = findIndex (==w) wrds }
  let (ptrs,rest3) = splitAt (ptrCount*4) rest2
  let (fp,ss3) = readPtrs (False,ss2) ptrs
  let ss4 = if fp && searchPos == Adj && ssType ss3 == UnknownEPos
              then ss3 { ssType = Pertainym }
              else ss3
  let (ss5,rest4) = 
        if searchPos /= Verb 
          then (ss4, rest3) 
          else let (fcountS:rest4) = rest3
                   (synPtrs, rest5) = splitAt (read fcountS * 3) rest4
               in  (ss4, rest5)

  let ss6 = ss5 { defn = unwords $ drop 1 rest4 }

  return ss6
  where
    readWords ss (w:lid:xs) = do
      let s = map toLower $ replaceChar ' ' '_' w
      idx  <- indexLookup wne s (fromEPOS $ pos ss)
--      print (w,st,idx)
      let posn = case idx of
                   Nothing -> Nothing
                   Just ix -> findIndex (==hereIAm ss) (indexOffsets ix)
      rest <- readWords ss xs
      return ((w, fst $ head $ readHex lid, maybe AllSenses SenseNumber posn) : rest)
    readWords _ _ = return []
    readPtrs (fp,ss) (typ:off:ppos:lexp:xs) = 
      let (fp',ss') = readPtrs (fp,ss) xs
          this = (getPointerType typ,
                  read off,
                  readEPOS ppos,
                  fst $ head $ readHex (take 2 lexp),
                  fst $ head $ readHex (drop 2 lexp))
      in  if searchPos == Adj && ssType ss' == UnknownEPos
            then if getPointerType typ == Antonym
                   then (fp' , ss' { forms = this : forms ss',
                                     ssType   = DirectAnt })
                   else (True, ss' { forms = this : forms ss' })
            else (fp', ss' { forms = this : forms ss' })
    readPtrs (fp,ss) _ = (fp,ss)

-- | indexToSenseKey takes an Index (as returned by, ex., indexLookup) and a sense
-- number and returns a SenseKey for that sense.
indexToSenseKey :: WordNetEnv -> Index -> Int -> IO (Maybe SenseKey)
indexToSenseKey wne idx sense = do
  let cpos = fromEPOS $ indexPOS idx
  ss1 <- readSynset wne cpos (indexOffsets idx !! (sense-1)) ""
  ss2 <- followSatellites ss1
  --print ss2
  case findIndex ((==indexWord idx) . map toLower) (map (\ (w,_,_) -> w) $ ssWords ss2) of
    Nothing -> return Nothing
    Just  j -> do
      let skey = if ssType ss2 == Satellite
                   then indexWord idx ++ "%" ++ show (fromEnum Satellite) ++ ":" ++
                        padTo 2 (show $ fnum ss2) ++ ":" ++ headWord ss2 ++ ":" ++
                        padTo 2 (show $ headSense ss2)
                   else indexWord idx ++ "%" ++ show (fromEnum $ pos ss2) ++ ":" ++
                        padTo 2 (show $ fnum ss2) ++ ":" ++ 
                        padTo 2 (show $ lexId ss2 j) ++ "::"
      return (Just $ SenseKey cpos skey (indexWord idx))
  where
    followSatellites ss 
        | ssType ss == Satellite =
            case find (\ (f,_,_,_,_) -> f == Similar) (forms ss) of
              Nothing -> return ss
              Just (f,offset,p,j,k) -> do
                adjss <- readSynset wne (fromEPOS p) offset ""
                case ssWords adjss of
                  (hw,_,hs):_ -> return (ss { headWord  = map toLower hw,
                                              headSense = hs })
                  _ -> return ss
        | otherwise = return ss

-- indexLookup takes a word and part of speech and gives back its index.
indexLookup :: WordNetEnv -> String -> POS -> IO (Maybe Index)
indexLookup wne w pos = do
  ml <- binarySearch (fst (dataHandles wne ! pos)) w
  case ml of
    Nothing -> return Nothing
    Just  l -> do
      (wdS:posS:ccntS:pcntS:rest1) <- matchN 4 (words l)
      isc <- maybeRead ccntS
      pc  <- maybeRead pcntS
      let idx1 = index0 { indexWord = wdS,
                          indexPOS  = readEPOS posS,
                          indexSenseCount = isc
                        }
      let (ptrs,rest2) = splitAt pc rest1
      let idx2 = idx1 { indexForms = map getPointerType ptrs }
      (ocntS:tcntS:rest3) <- matchN 2 rest2
      itc <- maybeRead tcntS
      otc <- maybeRead ocntS
      let idx3 = idx2 { indexTaggedCount = itc }
      let (offsets,_) = splitAt otc rest3
      io <- mapM maybeRead offsets
      return (Just $ idx3 { indexOffsets = io })

-- do binary search on an index file
binarySearch :: Handle -> String -> IO (Maybe String)
binarySearch h s = do
  hSeek h SeekFromEnd 0
  bot <- hTell h
  binarySearch' 0 bot (bot `div` 2)
  where
    binarySearch' :: Integer -> Integer -> Integer -> IO (Maybe String)
    binarySearch' top bot mid = do
      hSeek h AbsoluteSeek (mid-1)
      when (mid /= 1) readUntilNL
      eof <- hIsEOF h
      if eof 
        then if top >= bot-1 
               then return Nothing
               else binarySearch' top (bot-1) ((top+bot-1) `div` 2)
        else do
          l <- hGetLine h
          let key = takeWhile (/=' ') l
          if key == s 
            then return (Just l)
            else case (bot - top) `div` 2 of
                   0 -> return Nothing
                   d -> case key `compare` s of
                          LT -> binarySearch' mid bot (mid + d)
                          GT -> binarySearch' top mid (top + d)
    readUntilNL = do
      eof <- hIsEOF h
      if eof 
        then return ()
        else do hGetLine h; return ()