{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} module NLP.HistPL.Fusion ( -- * Basic types UID , POS , Word , Base , IsBase -- * Dictionary , BaseDAWG , FormDAWG -- ** Bilateral , Bila (..) , mkBila , withForm -- ** Contemporary , Poli , PLex , PLexSet , mkPoli -- * Correspondence , Corresp , buildCorresp -- ** Components , Core , Filter , Choice , byForms , posFilter , sumChoice ) where import Prelude hiding (lookup) import Control.Applicative ((<$>), (<*>)) import Data.Text.Binary () import qualified Data.Map as M import qualified Data.Text as T import qualified Data.PoliMorf as P import NLP.HistPL.Lexicon (UID) import qualified NLP.HistPL.Lexicon as H import qualified NLP.HistPL.Util as H import NLP.HistPL.DAWG ------------------------------------------------------------------------ -- | Part of speech. type POS = T.Text -- | Base form. type Base = T.Text -- | Word form. type Word = T.Text -- | Is the word form a base form? type IsBase = Bool -- | Dictionary keys represent base forms and rules transform base forms to -- their corresponding word forms. Info @a@ is assigned to every lexeme -- and info @b@ to every word form. type BaseDAWG i a b = DAWG i a b -- | Dictionary keys represent word forms and rules transform word forms to -- their corresponding base forms. Info @a@ is assigned to every lexeme -- and info @b@ to every word form. type FormDAWG i a b = DAWG i a b ------------------------------------------------------------------------ -- | Bilateral dictionary. data Bila i a b = Bila { baseDAWG :: BaseDAWG i a b , formDAWG :: FormDAWG i a b } deriving (Show, Eq, Ord) -- | Make bilateral dictionary from a list of (base form, ID, additional -- lexeme info, word form, additional word form info) tuples. mkBila :: (Ord i, Ord a, Ord b) => [(Base, i, a, Word, b)] -> Bila i a b mkBila xs = Bila { baseDAWG = baseDAWG' , formDAWG = formDAWG' } where baseDAWG' = fromList xs formDAWG' = revDAWG baseDAWG' -- | Identify entries which contain given word form. withForm :: Ord i => Bila i a b -> Word -> LexSet i a b withForm Bila{..} word = M.unions [ lookup base baseDAWG | let lexSet = lookup word formDAWG , (_, val) <- M.assocs lexSet , base <- M.keys (forms val) ] ------------------------------------------------------------------------ -- | PoliMorf dictionary in a bilateral form. type Poli = Bila POS () () -- | PoliMorf dictionary entry. type PLex = Lex POS () () -- | Set of PoliMorf dictionary entries. type PLexSet = LexSet POS () () -- | Make bilateral dictionary from PoliMorf. mkPoli :: [P.Entry] -> Poli mkPoli = mkBila . map ((,,(),,()) <$> P.base <*> P.pos <*> P.form) ------------------------------------------------------------------------ -- | A function which determines entries from a bilateral -- dictionary corresponing to a given historical lexeme. type Corresp = Poli -> H.LexEntry -> PLexSet -- | We provide three component types, `Core`, `Filter` and `Choice`, which -- can be combined together using the `buildCorresp` function to construct -- a `Corresp` function. The first one, `Core`, is used to identify a list -- of potential sets of lexemes. It is natural to define the core function -- in such a way because the task of determining corresponding lexemes can -- be usually divided into a set of smaller tasks of the same purpose. -- For example, we may want to identify `LexSet`s corresponding to individual -- word forms of the historical lexeme. type Core = Poli -> H.LexEntry -> [PLexSet] -- | Function which can be used to filter out lexemes which do not -- satisfy a particular predicate. For example, we may want to filter -- out lexemes with incompatible POS value. type Filter = H.LexEntry -> PLex -> Bool -- | The final choice of lexemes. Many different strategies can be used -- here sum of the sets, intersection, or voting. type Choice = [PLexSet] -> PLexSet -- | Identify `LexSet`s corresponding to individual word forms of the -- historical lexeme using the `withForm` function. byForms :: Core byForms bila lexEntry = [ withForm bila word | word <- H.allForms lexEntry ] -- | Filter out lexemes with POS value incompatible with the -- set of POS values assigned to the historical lexeme. posFilter :: Filter posFilter h p = uid (lexKey p) `elem` H.pos h -- | Sum of sets of lexemes. sumChoice :: Choice sumChoice = M.unions -- | Build `Corresp` function form individual components. buildCorresp :: Core -> Filter -> Choice -> Corresp buildCorresp core filt choice bila hLex = let filterSet = mkLexSet . filter (filt hLex) . unLexSet in choice . map filterSet . core bila $ hLex