{-# LANGUAGE RecordWildCards #-} {-# LANGUAGE OverloadedStrings #-} -- | Observation schema blocks for Nerf. module NLP.Nerf.Schema ( -- * Types Ox , Schema , void , sequenceS_ -- * Usage , schematize -- ** Configuration , Body (..) , Entry , entry , entryWith , SchemaConf (..) , nullConf , defaultConf , fromConf -- ** Schema blocks , Block , fromBlock , orthB , splitOrthB , lowPrefixesB , lowSuffixesB , lemmaB , shapeB , packedB , shapePairB , packedPairB , dictB ) where import Control.Applicative ((<$>), (<*>)) import Control.Monad (forM_, join) import Data.Maybe (maybeToList) import Data.Binary (Binary, put, get) import qualified Data.Char as C import qualified Data.Set as S import qualified Data.Vector as V import qualified Data.Text as T import qualified Data.DAWG.Static as D import qualified Data.CRF.Chain1 as CRF import qualified Control.Monad.Ox as Ox import qualified Control.Monad.Ox.Text as Ox import NLP.Nerf.Types import NLP.Nerf.Dict (Dict) -- | The Ox monad specialized to word token type and text observations. type Ox a = Ox.Ox Word T.Text a -- | A schema is a block of the Ox computation performed within the -- context of the sentence and the absolute sentence position. type Schema a = V.Vector Word -> Int -> Ox a -- | A dummy schema block. void :: a -> Schema a void x _ _ = return x -- | Sequence the list of schemas (or blocks) and discard individual values. sequenceS_ :: [V.Vector Word -> a -> Ox b] -> V.Vector Word -> a -> Ox () sequenceS_ xs sent = let ys = map ($sent) xs in \k -> sequence_ (map ($k) ys) -- | Record structure of the basic observation types. data BaseOb = BaseOb { orth :: Int -> Maybe T.Text , lowOrth :: Int -> Maybe T.Text } -- | Construct the 'BaseOb' structure given the sentence. mkBaseOb :: V.Vector Word -> BaseOb mkBaseOb sent = BaseOb { orth = _orth , lowOrth = _lowOrth } where at = Ox.atWith sent _orth = (id `at`) _lowOrth i = T.toLower <$> _orth i -- | A block is a chunk of the Ox computation performed within the -- context of the sentence and the list of absolute sentence positions. type Block a = V.Vector Word -> [Int] -> Ox a -- | Transform the block to the schema depending on the list of -- relative sentence positions. fromBlock :: Block a -> [Int] -> Schema a fromBlock blk xs sent = let blkSent = blk sent in \k -> blkSent [x + k | x <- xs] -- | Orthographic form at the current position. orthB :: Block () orthB sent = \ks -> let orthOb = Ox.atWith sent id in mapM_ (Ox.save . orthOb) ks -- | Orthographic form split into two observations: the lowercased form and -- the original form (only when different than the lowercased one). splitOrthB :: Block () splitOrthB sent = \ks -> do mapM_ (Ox.save . lowOrth) ks mapM_ (Ox.save . upOnlyOrth) ks where BaseOb{..} = mkBaseOb sent upOnlyOrth i = orth i >>= \x -> case T.any C.isUpper x of True -> Just x False -> Nothing -- | List of lowercased prefixes of given lengths. lowPrefixesB :: [Int] -> Block () lowPrefixesB ns sent = \ks -> forM_ ks $ \i -> mapM_ (Ox.save . lowPrefix i) ns where BaseOb{..} = mkBaseOb sent lowPrefix i j = Ox.prefix j =<< lowOrth i -- | List of lowercased suffixes of given lengths. lowSuffixesB :: [Int] -> Block () lowSuffixesB ns sent = \ks -> forM_ ks $ \i -> mapM_ (Ox.save . lowSuffix i) ns where BaseOb{..} = mkBaseOb sent lowSuffix i j = Ox.suffix j =<< lowOrth i -- | Lemma substitute parametrized by the number specifying the span -- over which lowercased prefixes and suffixes will be 'Ox.save'd. -- For example, @lemmaB 2@ will take affixes of lengths @0, -1@ and @-2@ -- on account. lemmaB :: Int -> Block () lemmaB n sent = \ks -> do mapM_ lowLemma ks where BaseOb{..} = mkBaseOb sent lowPrefix i j = Ox.prefix j =<< lowOrth i lowSuffix i j = Ox.suffix j =<< lowOrth i lowLemma i = Ox.group $ do mapM_ (Ox.save . lowPrefix i) [0, -1 .. -n] mapM_ (Ox.save . lowSuffix i) [0, -1 .. -n] -- | Shape of the word. shapeB :: Block () shapeB sent = \ks -> do mapM_ (Ox.save . shape) ks where BaseOb{..} = mkBaseOb sent shape i = Ox.shape <$> orth i -- | Packed shape of the word. packedB :: Block () packedB sent = \ks -> do mapM_ (Ox.save . shapeP) ks where BaseOb{..} = mkBaseOb sent shape i = Ox.shape <$> orth i shapeP i = Ox.pack <$> shape i -- -- | Shape and packed shape of the word. -- shapeAndPackedB :: Block () -- shapeAndPackedB sent = \ks -> do -- mapM_ (Ox.save . shape) ks -- mapM_ (Ox.save . shapeP) ks -- where -- BaseOb{..} = mkBaseOb sent -- shape i = Ox.shape <$> orth i -- shapeP i = Ox.pack <$> shape i -- | Combined shapes of two consecutive (at @k-1@ and @k@ positions) words. shapePairB :: Block () shapePairB sent = \ks -> forM_ ks $ \i -> do Ox.save $ link <$> shape i <*> shape (i - 1) where BaseOb{..} = mkBaseOb sent shape i = Ox.shape <$> orth i link x y = T.concat [x, "-", y] -- | Combined packed shapes of two consecutive (at @k-1@ and @k@ positions) -- words. packedPairB :: Block () packedPairB sent = \ks -> forM_ ks $ \i -> do Ox.save $ link <$> shapeP i <*> shapeP (i - 1) where BaseOb{..} = mkBaseOb sent shape i = Ox.shape <$> orth i shapeP i = Ox.pack <$> shape i link x y = T.concat [x, "-", y] -- | Plain dictionary search determined with respect to the list of -- relative positions. dictB :: Dict -> Block () dictB dict sent = \ks -> do mapM_ (Ox.saves . searchDict) ks where BaseOb{..} = mkBaseOb sent searchDict i = join . maybeToList $ S.toList <$> (orth i >>= flip D.lookup dict . T.unpack) -- | Body of configuration entry. data Body a = Body { -- | Range argument for the schema block. range :: [Int] -- | Additional arguments for the schema block. , args :: a } deriving (Show) instance Binary a => Binary (Body a) where put Body{..} = put range >> put args get = Body <$> get <*> get -- | Maybe entry. type Entry a = Maybe (Body a) -- | Entry with additional arguemnts. entryWith :: a -> [Int] -> Entry a entryWith v xs = Just (Body xs v) -- | Maybe entry with additional arguemnts. entryWith'Mb :: Maybe a -> [Int] -> Entry a entryWith'Mb (Just v) xs = Just (Body xs v) entryWith'Mb Nothing _ = Nothing -- | Plain entry with no additional arugments. entry :: [Int] -> Entry () entry = entryWith () -- | Configuration of the schema. All configuration elements specify the -- range over which a particular observation type should be taken on account. -- For example, the @[-1, 0, 2]@ range means that observations of particular -- type will be extracted with respect to previous (@k - 1@), current (@k@) -- and after the next (@k + 2@) positions when identifying the observation -- set for position @k@ in the input sentence. data SchemaConf = SchemaConf { -- | The 'orthB' schema block. orthC :: Entry () -- | The 'splitOrthB' schema block. , splitOrthC :: Entry () -- | The 'lowPrefixesB' schema block. The first list of ints -- represents lengths of prefixes. , lowPrefixesC :: Entry [Int] -- | The 'lowSuffixesB' schema block. The first list of ints -- represents lengths of suffixes. , lowSuffixesC :: Entry [Int] -- | The 'lemmaB' schema block. , lemmaC :: Entry Int -- | The 'shapeB' schema block. , shapeC :: Entry () -- | The 'packedB' schema block. , packedC :: Entry () -- | The 'shapePairB' schema block. , shapePairC :: Entry () -- | The 'packedPairB' schema block. , packedPairC :: Entry () -- | Dictionaries of NEs ('dictB' schema block). , dictC :: Entry [Dict] -- | Dictionary of internal triggers. , intTrigsC :: Entry Dict -- | Dictionary of external triggers. , extTrigsC :: Entry Dict } deriving (Show) instance Binary SchemaConf where put SchemaConf{..} = do put orthC put splitOrthC put lowPrefixesC put lowSuffixesC put lemmaC put shapeC put packedC put shapePairC put packedPairC put dictC put intTrigsC put extTrigsC get = SchemaConf <$> get <*> get <*> get <*> get <*> get <*> get <*> get <*> get <*> get <*> get <*> get <*> get -- | Null configuration of the observation schema. nullConf :: SchemaConf nullConf = SchemaConf Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing Nothing -- | Default configuration of the observation schema. defaultConf :: [Dict] -- ^ Named Entity dictionaries -> Maybe Dict -- ^ Dictionary of internal triggers -> Maybe Dict -- ^ Dictionary of external triggers -> IO SchemaConf defaultConf neDicts intDict extDict = do return $ SchemaConf { orthC = Nothing , splitOrthC = entry [-1, 0] , lowPrefixesC = Nothing , lowSuffixesC = entryWith [2, 3, 4] [0] , lemmaC = entryWith 3 [-1, 0] , shapeC = entry [-1, 0] , packedC = entry [-1, 0] , shapePairC = entry [0] , packedPairC = entry [0] , dictC = entryWith neDicts [-1, 0] , intTrigsC = entryWith'Mb intDict [0] , extTrigsC = entryWith'Mb extDict [-1] } mkArg0 :: Block () -> Entry () -> Schema () mkArg0 blk (Just x) = fromBlock blk (range x) mkArg0 _ Nothing = void () mkArg1 :: (a -> Block ()) -> Entry a -> Schema () mkArg1 blk (Just x) = fromBlock (blk (args x)) (range x) mkArg1 _ Nothing = void () mkArgs1 :: (a -> Block ()) -> Entry [a] -> Schema () mkArgs1 blk (Just x) = sequenceS_ [ fromBlock (blk dict) (range x) | dict <- args x ] mkArgs1 _ Nothing = void () -- | Build the schema based on the configuration. fromConf :: SchemaConf -> Schema () fromConf SchemaConf{..} = sequenceS_ [ mkArg0 orthB orthC , mkArg0 splitOrthB splitOrthC , mkArg1 lowPrefixesB lowPrefixesC , mkArg1 lowSuffixesB lowSuffixesC , mkArg1 lemmaB lemmaC , mkArg0 shapeB shapeC , mkArg0 packedB packedC , mkArg0 shapePairB shapePairC , mkArg0 packedPairB packedPairC , mkArgs1 dictB dictC , mkArg1 dictB intTrigsC , mkArg1 dictB extTrigsC ] -- | Use the schema to extract observations from the sentence. schematize :: Schema a -> [Word] -> CRF.Sent Ob schematize schema xs = map (S.fromList . Ox.execOx . schema v) [0 .. n - 1] where v = V.fromList xs n = V.length v