{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE RecordWildCards #-}

-- | The module provides functionality for manipulating PoliMorf, the
-- morphological dictionary for Polish. Apart from IO utilities there
-- is a 'merge' function which can be used to merge the PoliMorf with
-- another dictionary resources.

module Data.PoliMorf
( 
-- * Core types
  Form
, Base
, Tag
, Cat
, Entry (..)
, atomic

-- * Parsing
, readPoliMorf
, parsePoliMorf

-- * Utilities
, Rule (..)
, apply
, DAWG
, AnaMap
, mkAnaMap
, anaWord
, mkRuleMap
, BaseMap
, mkBaseMap
, FormMap
, mkFormMap

-- -- * Merging
-- , RelCode (..)
-- , mergeWith
-- , merge
) where

import Control.Applicative ((<$>), (<*>))
-- import Data.Maybe (catMaybes)
-- import Data.Monoid (mappend)
import Data.Binary (Binary, get, put)
import Data.Text.Binary ()
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.IO as L
import qualified Data.DAWG.Static as D
import qualified Data.DAWG.Trans.Vector as D

-- | A form.
type Form = T.Text

-- | A base form.
type Base = T.Text

-- | A morphosyntactic tag.
type Tag  = T.Text

-- | A category.
type Cat  = T.Text

-- | An entry from the PoliMorf dictionary.
data Entry = Entry
    { form :: !Form
    , base :: !Base
    , tag  :: !Tag
    , cat  :: !Cat }
    deriving (Eq, Ord, Show, Read)

-- | Is the entry an atomic one?  More precisely, we treat all negative
-- forms starting with ''nie'' and all superlatives starting with ''naj''
-- as non-atomic entries.
atomic :: Entry -> Bool
atomic x
    | "sup" `T.isInfixOf` tag x && "naj" `T.isPrefixOf` form x = False
    | "neg" `T.isInfixOf` tag x && "nie" `T.isPrefixOf` form x = False
    | otherwise = True

-- | Read the PoliMorf from the file.
readPoliMorf :: FilePath -> IO [Entry]
readPoliMorf path = parsePoliMorf <$> L.readFile path

-- | Parse the PoliMorf into a list of entries.
parsePoliMorf :: L.Text -> [Entry]
parsePoliMorf = map parsePoliRow . L.lines 

-- | Get an entry pair from a PoliMorf row.
parsePoliRow :: L.Text -> Entry
parsePoliRow row = case map L.toStrict (L.split (=='\t') row) of
    [_form, _base, _tag, _cat] -> Entry _form _base _tag _cat
    _   -> error $ "parsePoliRow: invalid row \"" ++ L.unpack row ++ "\""

-- | A rule for translating a form into another one.
data Rule = Rule {
    -- | Number of characters to cut from the end of the form.
      cut       :: !Int
    -- | A suffix to paste.
    , suffix    :: !T.Text }
    deriving (Show, Eq, Ord)

instance Binary Rule where
    put Rule{..} = put cut >> put suffix
    get = Rule <$> get <*> get

-- | Apply the rule.
apply :: Rule -> T.Text -> T.Text
apply r x = T.take (T.length x - cut r) x `T.append` suffix r

-- | Make a rule to translate between two strings.
between :: T.Text -> T.Text -> Rule
between source dest =
    let k = lcp source dest
    in  Rule (T.length source - k) (T.drop k dest)
  where
    lcp a b = case T.commonPrefixes a b of
        Just (c, _, _)  -> T.length c
        Nothing         -> 0

type DAWG a = D.DAWG D.Trans Char () a

-- | A map from forms to their potential interpretations.  It can be used
-- directly to determine all potential dictionary interpretations of a
-- given form.
type AnaMap = DAWG (M.Map Rule (S.Set Tag))

-- | Construct an 'AnaMap' from a list of entries.
mkAnaMap :: [Entry] -> AnaMap
mkAnaMap xs = D.fromListWith (M.unionWith S.union) $
    [ ( T.unpack (form x)
      , M.singleton
            (between (form x) (base x))
            (S.singleton (tag x)) )
    | x <- xs ]

-- | Analyse word.
anaWord :: AnaMap -> Form -> M.Map Base (S.Set Tag)
anaWord anaMap x = case D.lookup (T.unpack x) anaMap of
    Just m  -> M.fromListWith S.union
        [ (apply rule x, tags)
        | (rule, tags) <- M.toList m ]
    Nothing -> M.empty

-- | A map from forms to their possible base forms (there may be many since
-- the form may be a member of multiple lexemes).
type BaseMap = DAWG (S.Set Rule)

-- | A map from base forms to all their potential forms.
type FormMap = DAWG (S.Set Rule)

-- | Make a rule map from a list of entries.
mkRuleMap :: [(T.Text, T.Text)] -> DAWG (S.Set Rule)
mkRuleMap xs = D.fromListWith S.union $
    [ ( T.unpack x
      , S.singleton (between x y) )
    | (x, y) <- xs ]

-- | Make a 'BaseMap' from a list of entries.
mkBaseMap :: [Entry] -> BaseMap
mkBaseMap = mkRuleMap . map ((,) <$> form <*> base)

-- | Make a 'FormMap' from a list of entries.
mkFormMap :: [Entry] -> FormMap
mkFormMap = mkRuleMap . map ((,) <$> base <*> form)

-- -- | Reliability information: how did we assign a particular label to
-- -- a particular word form.
-- data RelCode
--     = ByForm    -- ^ Based on labels of other forms within the same lexeme
--     | ByBase    -- ^ Label assigned based on a lemma label  
--     | Exact     -- ^ Label assigned in a direct manner
--     deriving (Eq, Ord, Show, Read)
-- 
-- instance Binary RelCode where
--     put Exact   = put '1'
--     put ByBase  = put '2'
--     put ByForm  = put '3'
--     get = get >>= \x -> return $ case x of
--         '1' -> Exact
--         '2' -> ByBase
--         '3' -> ByForm
--         c   -> error $ "get: invalid RelCode code '" ++ [c] ++ "'"
-- 
-- -- | Merge the 'BaseMap' with the dictionary resource which maps forms to sets
-- -- of labels.  Every label is assigned a 'RelCode' which tells what is the
-- -- relation between the label and the form. It is a generalized version
-- -- of the 'merge' function with additional function @f x y y'label@ which
-- -- can be used to determine the resultant set of labels for the form @x@
-- -- given ,,similar'' form @y@ and its original label @y'label@.
-- -- There are three kinds of labels:
-- -- 'Exact' labels assigned in a direct manner, 'ByBase' labels assigned
-- -- to all forms which have a base form with a label in the input dictionary,
-- -- and 'ByForm' labels assigned to all forms which have a related form from the
-- -- same lexeme with a label in the input dictionary.
-- mergeWith
--     :: Ord a
--     => (String -> String -> a -> a)
--     -> BaseMap
--     -> DAWG (S.Set a)
--     -> DAWG (M.Map a RelCode)
-- mergeWith f poli dict0 = D.fromList
--     [ (x, combine x)
--     | x <- keys ]
--   where
--     -- Keys in the output dictionary.
--     keys = join (D.keys poli) (D.keys dict0)
-- 
--     -- Combining function.
--     combine x = (M.unionsWith max . catMaybes)
--         [ label Exact  <$> D.lookup x dict0 
--         , label ByBase <$> D.lookup x dict1
--         , label ByForm <$> D.lookup x dict2 ]
-- 
--     label :: Ord a => RelCode -> S.Set a -> M.Map a RelCode
--     label code s = M.fromList [(x, code) | x <- S.toList s]
-- 
--     -- Extended to all base forms of dict0 keys.
--     dict1 = D.fromListWith mappend
--         [ (lemma, f'Set lemma _form x)
--         | (_form, x) <- D.assocs dict0
--         , lemma <- elemsOn poli _form ]
-- 
--     -- Extended to all forms of dict0 keys.
--     dict2 = D.fromListWith mappend
--         [ (form', f'Set form' _form x)
--         | (_form, x) <- D.assocs dict0
--         , lemma <- elemsOn poli _form
--         , form' <- elemsOn ilop lemma ]
-- 
--     -- Inverse poli dictionary.
--     ilop = mkRuleMap
--         [ (base'Text, form'Text)
--         | (form'String, rules) <- D.assocs poli
--         , rule <- S.toList rules
--         , let form'Text = T.pack form'String
--         , let base'Text = apply rule form'Text ]
--     
--     -- Merge to ascending lists.
--     join (x:xs) (y:ys)
--         | x < y     = x : join xs (y:ys)
--         | x > y     = y : join (x:xs) ys
--         | otherwise = x : join xs ys
--     join xs []  = xs
--     join [] ys  = ys
-- 
--     -- Version of f function working on label sets.
--     f'Set v w = S.fromList . map (f v w) . S.toList
-- 
-- -- | A specialized version of the 'mergeWith' function which doesn't
-- -- change labels in the resultant 'DAWG'.
-- merge
--     :: Ord a => BaseMap
--     -> DAWG (S.Set a)
--     -> DAWG (M.Map a RelCode)
-- merge = mergeWith $ \_ _ x -> x
-- 
-- elemsOn :: DAWG (S.Set Rule) -> String -> [String]
-- elemsOn m x = case x `D.lookup` m of
--     Just s  ->
--         [ T.unpack . apply rule . T.pack $ x
--         | rule <- S.toList s ]
--     Nothing -> []