{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}


-- | A `D.DAWG`-based dictionary.


module NLP.HistPL.Dict
(
-- * Rule
  Rule (..)
, apply
, between

-- * Dictionary
, Dict
-- ** Entry
, Lex (..)
, Key (..)
, Val (..)
, Node
-- ** Entry set
, LexSet
, mkLexSet
, unLexSet
-- , encode
, decode
-- ** Query
, lookup
-- ** Conversion
, fromList
, toList
, entries
, revDict
) where


import Prelude hiding (lookup)
import Control.Applicative ((<$>), (<*>))
import Control.Arrow (first)
import Data.Binary (Binary, get, put)
import Data.Text.Binary ()
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.DAWG.Static as D


------------------------------------------------------------------------
-- Rule
------------------------------------------------------------------------


-- | A rule for translating a form into another form.
data Rule = Rule {
    -- | Number of characters to cut from the end of the form.
      cut       :: !Int
    -- | A suffix to paste.
    , suffix    :: !T.Text
    } deriving (Show, Eq, Ord)


instance Binary Rule where
    put Rule{..} = put cut >> put suffix
    get = Rule <$> get <*> get


-- | Apply the rule.
apply :: Rule -> T.Text -> T.Text
apply r x = T.take (T.length x - cut r) x `T.append` suffix r


-- | Determine a rule which translates between two strings.
between :: T.Text -> T.Text -> Rule
between source dest =
    let k = lcp source dest
    in  Rule (T.length source - k) (T.drop k dest)
  where
    lcp a b = case T.commonPrefixes a b of
        Just (c, _, _)  -> T.length c
        Nothing         -> 0


------------------------------------------------------------------------
-- Entry componenets (key and value)
------------------------------------------------------------------------


-- | A key of a dictionary entry.
data Key i = Key {
    -- | A path of the entry, i.e. DAWG key.
      path  :: T.Text
    -- | Unique identifier among entries with the same `path`.
    , uid   :: i }
    deriving (Show, Eq, Ord)


-- | A value of the entry.
data Val a w b = Val {
    -- | Additional information assigned to the entry.
      info  :: a
    -- | A map of forms with additional info of type @b@ assigned.
    -- Invariant: in case of a reverse dictionary (from word forms
    -- to base forms) this map should contain exactly one element
    -- (a base form and a corresonding info).
    , forms :: M.Map w b }
    deriving (Show, Eq, Ord)


instance (Ord w, Binary a, Binary w, Binary b) => Binary (Val a w b) where
    put Val{..} = put info >> put forms
    get = Val <$> get <*> get


-- | A dictionary entry consists of a `Key` and a `Val`ue.
data Lex i a b = Lex {
    -- | Entry key.
      lexKey :: Key i
    -- | Entry value.
    , lexVal :: Val a T.Text b }
    deriving (Show, Eq, Ord)


-- | A set of dictionary entries.
type LexSet i a b = M.Map (Key i) (Val a T.Text b)


-- | Make lexical set from a list of entries.
mkLexSet :: Ord i => [Lex i a b] -> LexSet i a b
mkLexSet = M.fromList . map ((,) <$> lexKey <*> lexVal)


-- | List lexical entries.
unLexSet :: LexSet i a b -> [Lex i a b]
unLexSet = map (uncurry Lex) . M.toList


-- | Actual values stored in automaton states contain
-- all entry information but `path`.
type Node i a b = M.Map i (Val a Rule b)


-- | Map function over entry word forms.
mapW :: Ord w' => (w -> w') -> Val a w b -> Val a w' b
mapW f v =
    let g = M.fromList . map (first f) . M.toList
    in  v { forms = g (forms v) }


-- | Encode dictionary value given `path`.


-- | Decode dictionary value given `path`.
decode :: Ord i => T.Text -> Node i a b -> LexSet i a b
decode x n = M.fromList
    [ (Key x i, mapW (flip apply x) val)
    | (i, val) <- M.toList n ]


-- | Transform entry into a list.
toListE :: Lex i a b -> [(T.Text, i, a, T.Text, b)]
toListE (Lex Key{..} Val{..}) =
    [ (path, uid, info, form, y)
    | (form, y) <- M.assocs forms ]


------------------------------------------------------------------------


-- | A dictionary parametrized over ID @i@, with info @a@ for every
-- (key, i) pair and info @b@ for every (key, i, apply rule key) triple.
type Dict i a b = D.DAWG Char () (Node i a b)


-- | Lookup the key in the dictionary.
lookup :: Ord i => T.Text -> Dict i a b -> LexSet i a b
lookup x dict = decode x $ case D.lookup (T.unpack x) dict of
    Just m  -> m
    Nothing -> M.empty


-- | List dictionary lexical entries.
entries :: Ord i => Dict i a b -> [Lex i a b]
entries = concatMap f . D.assocs where
    f (key, val) = unLexSet $ decode (T.pack key) val


-- | Make dictionary from a list of (key, ID, entry info, form,
-- entry\/form info) tuples.
fromList :: (Ord i, Ord a, Ord b) => [(T.Text, i, a, T.Text, b)] -> Dict i a b
fromList xs = D.fromListWith union $
    [ ( T.unpack x
      , M.singleton i (Val a (M.singleton (between x y) b)) )
    | (x, i, a, y, b) <- xs ]
  where
    union = M.unionWith $ both const M.union
    both f g (Val x y) (Val x' y') = Val (f x x') (g y y')


-- | Transform dictionary back into the list of (key, ID, key\/ID info, elem,
-- key\/ID\/elem info) tuples.
toList :: (Ord i, Ord a, Ord b) => Dict i a b -> [(T.Text, i, a, T.Text, b)]
toList = concatMap toListE . entries


-- | Reverse the dictionary.
revDict :: (Ord i, Ord a, Ord b) => Dict i a b -> Dict i a b
revDict = 
    let swap (base, i, x, form, y) = (form, i, x, base, y)
    in  fromList . map swap . toList