{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE BangPatterns #-}

-- | The module provides functionality for manipulating PoliMorf, the
-- morphological dictionary for Polish. Apart from IO utilities there
-- is a 'merge' function which can be used to merge the PoliMorf with
-- another dictionary resources.

module Data.PoliMorf
( 
-- * Types
  Form
, Base
, Tag
, Entry (..)

-- * Parsing
, readPoliMorf
, parsePoliMorf

-- * Merging
, BaseMap
, mkBaseMap
, RelCode (..)
, merge
) where

import Control.Applicative ((<$>), (<*>))
import Data.Maybe (catMaybes)
import Data.Monoid (mappend)
import Data.List (foldl')
import Data.Binary (Binary, get, put)
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L

-- | A form.
type Form = T.Text

-- | A base form.
type Base = T.Text

-- | A morphosyntactic tag.
type Tag  = T.Text

-- | A category.
type Cat  = T.Text

-- | An entry from the PoliMorf dictionary.
data Entry = Entry
    { form :: !Form
    , base :: !Base
    , tag  :: !Tag
    , cat  :: !Cat }
    deriving (Eq, Ord, Show, Read)

-- | Read the PoliMorf from the file.
readPoliMorf :: FilePath -> IO [Entry]
readPoliMorf path = parsePoliMorf <$> L.readFile path

-- | Parse the PoliMorf into a list of entries.
parsePoliMorf :: L.Text -> [Entry]
parsePoliMorf = map parsePoliRow . L.lines 

-- | Get an entry pair from a PoliMorf row.
parsePoliRow :: L.Text -> Entry
parsePoliRow row = case map L.toStrict (L.split (=='\t') row) of
    [_form, _base, _tag, _cat] -> Entry _form _base _tag _cat
    _   -> error $ "parsePoliRow: invalid row \"" ++ L.unpack row ++ "\""

-- | A map from forms to their possible base forms (there may be many since
-- the form may be a member of multiple lexemes).
type BaseMap = M.Map Form (S.Set Base)

-- | Make the base map from the list of entries.
mkBaseMap :: [Entry] -> BaseMap
mkBaseMap = M.fromListWith S.union . map ((,) <$> form <*> S.singleton . base)

-- | Reliability information: how did we assign a particular label to
-- a particular word form.
data RelCode
    = Exact     -- ^ Label assigned in a direct manner
    | ByBase    -- ^ Label assigned based on a lemma label  
    | ByForm    -- ^ Based on labels of other forms within the same lexeme
    deriving (Eq, Ord, Show, Read)

instance Binary RelCode where
    put Exact   = put '1'
    put ByBase = put '2'
    put ByForm  = put '3'
    get = get >>= \x -> return $ case x of
        '1' -> Exact
        '2' -> ByBase
        '3' -> ByForm
        c   -> error $ "get: invalid RelCode code '" ++ [c] ++ "'"

-- | Merge the 'BaseMap' with the dictionary resource which maps forms to
-- sets of labels.  Every label is assigned a 'RelCode' which tells what
-- is the relation between the label and the form.  There are three
-- kinds of labels:
-- 'Exact' labels assigned in a direct manner, 'ByBase' labels assigned
-- to all forms which have a base form with a label in the input dictionary,
-- and 'ByForm' labels assigned to all forms which have a related form from the
-- same lexeme with a label in the input dictionary.
--
-- This function is far from being memory efficient right now.  If you plan to
-- run it with respect to the entire PoliMorf dictionary you should do it
-- on a machine with an abundance of available memory.
merge
    :: Ord a => BaseMap
    -> M.Map Form (S.Set a)
    -> M.Map Form (M.Map a RelCode)
merge poli dict0 = M.fromList
    [ (x, combine x)
    | x <- keys ]
  where
    -- Keys in the output dictionary.
    keys = S.toList (M.keysSet poli `S.union` M.keysSet dict0)

    -- Combining function.
    combine x = (M.unionsWith min . catMaybes)
        [ label Exact  <$> M.lookup x dict0 
        , label ByBase <$> M.lookup x dict1
        , label ByForm <$> M.lookup x dict2 ]

    label :: Ord a => RelCode -> S.Set a -> M.Map a RelCode
    label code s = M.fromList [(x, code) | x <- S.toList s]

    -- Extended to all base forms of dict0 keys.
    dict1 = fromListWith mappend
        [ (lemma, x)
        | (_form, x) <- M.assocs dict0
        , lemma <- elemsOn poli _form ]

    -- Extended to all forms of dict0 keys.
    dict2 = fromListWith mappend
        [ (form', x)
        | (_form, x) <- M.assocs dict0
        , lemma <- elemsOn poli _form
        , form' <- elemsOn ilop lemma ]

    -- Inverse poli dictionary.
    ilop = fromListWith mappend
        [ (lemma, S.singleton _form)
        | (_form, lemmas) <- M.assocs poli
        , lemma <- S.toList lemmas ]

elemsOn :: (Ord a, Ord b) => M.Map a (S.Set b) -> a -> [b]
elemsOn m x = case x `M.lookup` m of
    Just s  -> S.toList s
    Nothing -> []

fromListWith :: Ord k => (a -> a -> a) -> [(k, a)] -> M.Map k a
fromListWith f xs =
    let update m (!k, !x) = M.insertWith' f k x m
    in  foldl' update M.empty xs