{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}


-- | A `DS.DAWG`-based dictionary with additional information
-- assigned to lexical entries and word forms.


module NLP.HistPL.DAWG
(
-- * Rule
  Rule (..)
, apply
, between

-- * Entry
, Lex (..)
, Key (..)
, Val (..)

-- * Entry set
, LexSet
, mkLexSet
, unLexSet
, Node
, decode


-- * DAWG
, DAWG

-- ** Initialization
, DAWG'Init
, DM.empty
, insert
, DS.freeze

-- ** Query
, lookup
, submap

-- ** Weight
, DS.Weight
, DS.weigh
, DS.size
, index
, byIndex

-- ** Conversion
, fromList
, toList
, entries
, revDAWG
) where


import           Prelude hiding (lookup)
import           Control.Applicative ((<$>), (<*>))
import           Control.Arrow (first)
import           Data.Binary (Binary, get, put)
import           Data.Text.Binary ()
import           Data.List (foldl')
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.DAWG.Static as DS
import qualified Data.DAWG.Dynamic as DM


------------------------------------------------------------------------
-- Rule
------------------------------------------------------------------------


-- | A rule for translating a form into another form.
data Rule = Rule {
    -- | Number of characters to cut from the end of the form.
      cut       :: !Int
    -- | A suffix to paste.
    , suffix    :: !T.Text
    } deriving (Show, Eq, Ord)


instance Binary Rule where
    put Rule{..} = put cut >> put suffix
    get = Rule <$> get <*> get


-- | Apply the rule.
apply :: Rule -> T.Text -> T.Text
apply r x = T.take (T.length x - cut r) x `T.append` suffix r


-- | Determine a rule which translates between two strings.
between :: T.Text -> T.Text -> Rule
between source dest =
    let k = lcp source dest
    in  Rule (T.length source - k) (T.drop k dest)
  where
    lcp a b = case T.commonPrefixes a b of
        Just (c, _, _)  -> T.length c
        Nothing         -> 0


------------------------------------------------------------------------
-- Entry componenets (key and value)
------------------------------------------------------------------------


-- | A key of a dictionary entry.
data Key i = Key {
    -- | A path of the entry, i.e. DAWG key.
      path  :: T.Text
    -- | Unique identifier among entries with the same `path`.
    , uid   :: i }
    deriving (Show, Eq, Ord)


-- | A value of the entry.
data Val a w b = Val {
    -- | Additional information assigned to the entry.
      info  :: a
    -- | A map of forms with additional info of type @b@ assigned.
    -- Invariant: in case of a reverse dictionary (from word forms
    -- to base forms) this map should contain exactly one element
    -- (a base form and a corresonding information).
    , forms :: M.Map w b }
    deriving (Show, Eq, Ord)


instance (Ord w, Binary a, Binary w, Binary b) => Binary (Val a w b) where
    put Val{..} = put info >> put forms
    get = Val <$> get <*> get


-- | A dictionary entry consists of a `Key` and a `Val`ue.
data Lex i a b = Lex {
    -- | Entry key.
      lexKey :: Key i
    -- | Entry value.
    , lexVal :: Val a T.Text b }
    deriving (Show, Eq, Ord)


------------------------------------------------------------------------
-- Set of entries 
------------------------------------------------------------------------


-- | A set of dictionary entries.
type LexSet i a b = M.Map (Key i) (Val a T.Text b)


-- | Make lexical set from a list of entries.
mkLexSet :: Ord i => [Lex i a b] -> LexSet i a b
mkLexSet = M.fromList . map ((,) <$> lexKey <*> lexVal)


-- | List lexical entries.
unLexSet :: LexSet i a b -> [Lex i a b]
unLexSet = map (uncurry Lex) . M.toList


-- | Map function over entry word forms.
mapW :: Ord w' => (w -> w') -> Val a w b -> Val a w' b
mapW f v =
    let g = M.fromList . map (first f) . M.toList
    in  v { forms = g (forms v) }


-- | Actual values stored in automaton states contain
-- all entry information but `path`.
type Node i a b = M.Map i (Val a Rule b)


-- | Decode dictionary value given `path`.
decode :: Ord i => T.Text -> Node i a b -> LexSet i a b
decode x n = M.fromList
    [ (Key x i, mapW (flip apply x) val)
    | (i, val) <- M.toList n ]


-- | Transform entry into a list.
toListE :: Lex i a b -> [(T.Text, i, a, T.Text, b)]
toListE (Lex Key{..} Val{..}) =
    [ (path, uid, info, form, y)
    | (form, y) <- M.assocs forms ]


------------------------------------------------------------------------
-- DAWG 
------------------------------------------------------------------------


-- | A dictionary parametrized over ID @i@, with info @a@ for every
-- (key, i) pair and info @b@ for every (key, i, apply rule key) triple.
type DAWG i a b = DS.DAWG Char DS.Weight (Node i a b)


------------------------------------------------------------------------
-- Initialization 
------------------------------------------------------------------------


-- | A `DAWG` initialization structure (a dynamic DAWG).
type DAWG'Init i a b = DM.DAWG Char (Node i a b)


-- | Insert a (key, ID, entry info, form, entry\/form info) into a
-- `DAWG'Init` structure.
insert
    :: (Ord i, Ord a, Ord b)
    => (T.Text, i, a, T.Text, b)
    -> DAWG'Init i a b
    -> DAWG'Init i a b
insert (x, i, a, y, b) = DM.insertWith union
    (T.unpack x)
    (M.singleton i (Val a (M.singleton (between x y) b)))
  where
    union = M.unionWith $ both const M.union
    both f g (Val x0 y0) (Val x1 y1) = Val (f x0 x1) (g y0 y1)


------------------------------------------------------------------------
-- Query 
------------------------------------------------------------------------


-- | Lookup the key in the dictionary.
lookup :: Ord i => T.Text -> DAWG i a b -> LexSet i a b
lookup x dict = decode x $ case DS.lookup (T.unpack x) dict of
    Just m  -> m
    Nothing -> M.empty


-- | Return the sub-dictionary containing all keys beginning with a prefix.
submap :: Ord i => T.Text -> DAWG i a b -> DAWG i a b
submap x dict = DS.submap (T.unpack x) dict


-- | Position in a set of all dictionary entries with respect
-- to the lexicographic order.
index :: T.Text -> DAWG i a b -> Maybe Int
index x = DS.index (T.unpack x)


-- | Find dictionary entry given its index with respect to the
-- lexicographic order.
byIndex :: Int -> DAWG i a b -> Maybe T.Text
byIndex ix = fmap T.pack . DS.byIndex ix


------------------------------------------------------------------------
-- Conversion
------------------------------------------------------------------------


-- | List dictionary lexical entries.
entries :: Ord i => DAWG i a b -> [Lex i a b]
entries = concatMap f . DS.assocs where
    f (key, val) = unLexSet $ decode (T.pack key) val


-- | Make dictionary from a list of (key, ID, entry info, form,
-- entry\/form info) tuples.
fromList :: (Ord i, Ord a, Ord b) => [(T.Text, i, a, T.Text, b)] -> DAWG i a b
fromList = DS.weigh . DS.freeze . foldl' (flip insert) DM.empty


-- | Transform dictionary back into the list of (key, ID, key\/ID info, elem,
-- key\/ID\/elem info) tuples.
toList :: (Ord i, Ord a, Ord b) => DAWG i a b -> [(T.Text, i, a, T.Text, b)]
toList = concatMap toListE . entries


-- | Reverse the dictionary.
revDAWG :: (Ord i, Ord a, Ord b) => DAWG i a b -> DAWG i a b
revDAWG = 
    let swap (base, i, x, form, y) = (form, i, x, base, y)
    in  fromList . map swap . toList