-- | Basic types for dictionary handling. 

module NLP.Nerf.Dict.Base
(
-- * Lexicon entry
  Form
, isMultiWord
, NeType
, Entry (..)

-- * Dictionary
, NeDict
, mkDict
, siftDict
, saveDict
, loadDict

-- * Merging dictionaries
, merge
, diff
) where

import Control.Applicative ((<$>))
import Data.Binary (encodeFile, decodeFile)
import Data.Text.Binary ()
import qualified Data.Text as T
import qualified Data.Set as S
import qualified Data.Map as M

-- | A orthographic form.
type Form = T.Text

-- | Is the form a multiword one?
isMultiWord :: Form -> Bool
isMultiWord = (>1) . length . T.words

-- | A type of named entity.
type NeType = T.Text

-- | A Named Entity entry from the LMF dictionary.
data Entry = Entry
    { neOrth :: !Form	-- ^ Orthographic form of the NE
    , neType :: !NeType -- ^ Type of the NE
    } deriving (Show, Read, Eq, Ord)

-- | A NeDict is a map from forms to NE types.  Each NE may be annotated
-- with multiple types.
type NeDict = M.Map Form (S.Set NeType)

-- | Construct the dictionary from the list of entries.
mkDict :: [Entry] -> NeDict
mkDict xs = M.fromListWith S.union
    [(neOrth x, S.singleton $ neType x) | x <- xs]

-- | Remove dictionary entries which do not satisfy the predicate.
siftDict :: (Form -> S.Set NeType -> Bool) -> NeDict -> NeDict
siftDict f dict = M.fromList [(k, v) | (k, v) <- M.assocs dict, f k v]

-- | Save the dictionary in the file.
saveDict :: FilePath -> NeDict -> IO ()
saveDict = encodeFile

-- | Load the dictionary from the file.
loadDict :: FilePath -> IO NeDict
loadDict = decodeFile

-- | Merge dictionary resources.
merge :: [NeDict] -> NeDict
merge = M.unionsWith S.union

-- | Differentiate labels from separate dictionaries using
-- dictionary-unique prefixes.
diff :: [NeDict] -> [NeDict]
diff ds =
    [ mapS (addPrefix i) <$> dict
    | (i, dict) <- zip [0..] ds ]

-- | Map function over the set.
mapS :: Ord a => (a -> a) -> S.Set a -> S.Set a
mapS f s = S.fromList [f x | x <- S.toList s]
{-# INLINE mapS #-}

-- | Add integer prefix.
addPrefix :: Int -> T.Text -> T.Text
addPrefix = T.append . T.pack . show