{-# LANGUAGE TupleSections #-}

-- | Basic types for dictionary handling. 

module NLP.Nerf.Dict.Base
(
-- * Lexicon entry
  NeType
, Form
, isMultiWord
, Entry (..)

-- * Dictionary
, Label
, DAWG
, Dict
, fromPairs
, fromEntries
, siftDict
, saveDict
, loadDict

-- * Merging dictionaries
, merge
, diff
) where

import Control.Applicative ((<$>), (<*>))
import Data.Binary (encodeFile, decodeFile)
import Data.Text.Binary ()
import qualified Data.Text as T
import qualified Data.Set as S
import qualified Data.DAWG.Static as D
import qualified Data.DAWG.Trans.Vector as D

-- | A type of named entity.
type NeType = T.Text

-- | A orthographic form.
type Form = T.Text

-- | Is the form a multiword one?
isMultiWord :: Form -> Bool
isMultiWord = (>1) . length . T.words

-- | A Named Entity entry from the LMF dictionary.
data Entry = Entry
    { neOrth :: !Form	-- ^ Orthographic form of the NE
    , neType :: !NeType -- ^ Type of the NE
    } deriving (Show, Read, Eq, Ord)

-- | Dictionary label.
type Label = T.Text

-- | A 'Dict' is a map from forms to labels.  Each form may be annotated
-- with multiple labels.  The map is represented using the directed acyclic
-- word graph.
-- type Dict = D.DAWG (S.Set Label)
type DAWG = D.DAWG D.Trans Char ()
type Dict = DAWG (S.Set Label)

-- | Construct dictionary from the list of form/label pairs.
fromPairs :: [(Form, Label)] -> Dict
fromPairs xs = D.fromListWith S.union
    [ ( T.unpack x
      , S.singleton y)
    | (x, y) <- xs ]

-- | Construct dictionary from the list of entries.
fromEntries :: [Entry] -> Dict
fromEntries = fromPairs . map ((,) <$> neOrth <*> neType)

-- | Remove dictionary entries which do not satisfy the predicate.
siftDict :: (Form -> S.Set Label -> Bool) -> Dict -> Dict
siftDict f dict = D.fromList [(k, v) | (k, v) <- D.assocs dict, f (T.pack k) v]

-- | Save the dictionary in the file.
saveDict :: FilePath -> Dict -> IO ()
saveDict = encodeFile

-- | Load the dictionary from the file.
loadDict :: FilePath -> IO Dict
loadDict = decodeFile

-- | Merge dictionary resources.
merge :: [Dict] -> Dict
merge = unionsWith S.union

-- | Replacement implementation of unionsWith while there is
-- no such function in dawg library. 
unionsWith :: Ord a => (a -> a -> a) -> [DAWG a] -> DAWG a
unionsWith f = foldl (unionWith f) D.empty

-- | Replacement implementation of unionWith while there is
-- no such function in dawg library. 
unionWith :: Ord a => (a -> a -> a) -> DAWG a -> DAWG a -> DAWG a
unionWith f d d' = D.fromListWith f (D.assocs d ++ D.assocs d')

-- | Differentiate labels from separate dictionaries using
-- dictionary-unique prefixes.
diff :: [Dict] -> [Dict]
diff ds =
    [ mapS (addPrefix i) `mapD` dict
    | (i, dict) <- zip [0..] ds ]

-- | Map function over the DAWG elements.
mapD :: Ord b => (a -> b) -> DAWG a -> DAWG b
mapD f d = D.fromList [(x, f y) | (x, y) <- D.assocs d]

-- | Map function over the set.
mapS :: Ord b => (a -> b) -> S.Set a -> S.Set b
mapS f s = S.fromList [f x | x <- S.toList s]

-- | Add integer prefix.
addPrefix :: Int -> T.Text -> T.Text
addPrefix = T.append . T.pack . show