{-# LANGUAGE RecordWildCards #-} -- | TAG conversion into flat production rules. -- Due to subtree sharing provided by `flattenWithSharing`, a single -- rule can be responsible for recognizing a subtree common to many -- different elementary trees. -- module NLP.Partage.SubtreeSharing ( -- * Grammar flattening flattenWithSharing ) where import Control.Applicative ((<$>)) import Control.Arrow (second) import Control.Monad (guard, forever) import qualified Control.Monad.State.Strict as E import Control.Monad.Trans.Class (lift) import Control.Monad.Identity (Identity(..)) import Control.Monad.Morph (generalize) import Data.Function (on) import Data.Monoid (mappend, mconcat) import Data.Maybe (isJust) import qualified Data.Set as S import qualified Data.Partition as Part import qualified Pipes as P import Pipes (hoist, (>->)) import NLP.Partage.FactGram.Internal ( FactGram, Lab(..), Rule(..), SymID ) import qualified NLP.Partage.FactGram.Internal as Rule import qualified NLP.Partage.Tree as G -------------------------------------------------- -- Compilation -------------------------------------------------- -- | Compile the given grammar into the list of rules. -- Common subtrees are shared. flattenWithSharing :: (Functor m, Monad m, Ord n, Ord t) => [ Either (G.Tree n t) (G.AuxTree n t) ] -> m (FactGram n t) flattenWithSharing ts = fmap snd $ runDupT $ Rule.runRM $ P.runEffect $ P.for shared (const $ return ()) where shared = hoist (hoist (hoist generalize)) ( hoist (hoist lift) rules >-> hoist lift rmDups ) rules = mapM_ getRules ts getRules (Left t) = Rule.treeRules t getRules (Right t) = Rule.auxRules t -------------------------------------------------- -- Eq/Ord Instances for RuleP -------------------------------------------------- -- | We define a newtype in order to define a custom Eq/Ord instances -- take the symbol of the head into account in a different manner. newtype RuleP n t = RuleP { unRuleP :: Rule n t } deriving (Show) -- | Ordinary label equality. labEq :: (Eq n, Eq t) => Lab n t -> Lab n t -> Bool labEq p q = p == q -- | Label equality. Concerning the `SymID` values, it is only -- checkes if either both are `Nothing` or both are `Just`. labEq' :: (Eq n, Eq t) => Lab n t -> Lab n t -> Bool labEq' p q = eq p q where eq x@NonT{} y@NonT{} = eqOn nonTerm x y && eqOn (isJust . labID) x y eq x@AuxVert{} y@AuxVert{} = eqOn nonTerm x y eq _ _ = p == q eqOn f x y = f x == f y -- | Ordinary label comparison. labCmp :: (Ord n, Ord t) => Lab n t -> Lab n t -> Ordering labCmp p q = compare p q -- | Label comparison. Concerning the `SymID` values, it is only -- checked if either both are `Nothing` or both are `Just`. labCmp' :: (Ord n, Ord t) => Lab n t -> Lab n t -> Ordering labCmp' p q = cmp p q where cmp x@NonT{} y@NonT{} = cmpOn nonTerm x y `mappend` cmpOn (isJust . labID) x y cmp x@AuxVert{} y@AuxVert{} = cmpOn nonTerm x y cmp _ _ = compare p q cmpOn f x y = compare (f x) (f y) instance (Eq n, Eq t) => Eq (RuleP n t) where r == s = (hdEq `on` headP) r s && ((==) `on` length.bodyP) r s && and [eq x y | (x, y) <- zip (bodyP r) (bodyP s)] where eq x y = labEq x y hdEq x y = labEq' x y headP = headR . unRuleP bodyP = bodyR . unRuleP instance (Ord n, Ord t) => Ord (RuleP n t) where r `compare` s = (hdCmp `on` headP) r s `mappend` (compare `on` length.bodyP) r s `mappend` mconcat [cmp x y | (x, y) <- zip (bodyP r) (bodyP s)] where cmp x y = labCmp x y hdCmp x y = labCmp' x y headP = headR . unRuleP bodyP = bodyR . unRuleP -------------------------------------------------- -- Substructure Sharing -------------------------------------------------- -- | Duplication-removal state serves to share common -- substructures. -- -- The idea is to remove redundant rules equivalent to other -- rules already present in the set of processed rules -- `rulDepo`(sit). -- -- Note that rules have to be processed in an appropriate order -- so that lower-level rules are processed before the -- higher-level rules from which they are referenced. data DupS n t = DupS { -- | A disjoint set for `SymID`s symDisj :: Part.Partition SymID -- | Rules already saved , rulDepo :: S.Set (RuleP n t) } -- Let us take a rule and let us assume that all identifiers it -- contains references to rules which have already been processed -- (for this assumption to be valid we just need to order the -- input set of rules properly). So we have a rule `r`, a set of -- processed rules `rs` and a clustering (disjoint-set) over -- `SymID`s present in `rs`. -- -- Now we want to process `r` and, in particular, check if it is -- not already in `rs` and update its `SymID`s. -- -- First we translate the body w.r.t. the existing clustering of -- `SymID`s (thanks to our assumption, these `SymID`s are already -- known and processed). The `SymID` in the root of the rule (if -- present) is the new one and it should not yet have been mentioned -- in `rs`. Even when `SymID` is not present in the root, we can -- still try to check if `r` is not present in `rs` -- after all, -- there may be some duplicates in the input grammar. -- -- Case 1: we have a rule with a `SymID` in the root. We want to -- check if there is already a rule in `rs` which: -- * Has identical body (remember that we have already -- transformed `SymID`s of the body of the rule in question) -- * Has the same non-terminal in the root and some `SymID` -- -- Case 2: the same as case 1 with the difference that we look -- for the rules which have an empty `SymID` in the root. -- -- For this to work we just need a specific comparison function -- which works as specified in the two cases desribed above -- (i.e. either there are some `SymID`s in the rule heads, or -- there are no `SymID`s in both heads.) -- -- Once we have this comparison (which is provided by the -- function `labCmp'` above), we simply process the set of rules -- incrementally. -- | Duplication-removal transformer. type DupT n t m = E.StateT (DupS n t) m -- | Duplication-removal monad. type DupM n t = DupT n t Identity -- | Run the transformer. runDupT :: (Functor m, Monad m, Ord t, Ord n) => DupT n t m b -> m (b, S.Set (Rule n t)) runDupT = fmap (second getRules) . flip E.runStateT (DupS Part.empty S.empty) where getRules = S.fromList . map unRuleP . S.toList. rulDepo -- | Update the body of the rule by replacing old `SymID`s with -- their representatives. updateBody :: RuleP n t -> DupM n t (RuleP n t) updateBody (RuleP r) = do d <- E.gets symDisj let body' = map (updLab d) (bodyR r) return . RuleP $ r { bodyR = body' } where updLab d x@NonT{..} = x { labID = updSym d <$> labID } updLab d x@AuxVert{..} = x { symID = updSym d symID } updLab _ x = x updSym = Part.rep -- | Find a rule if already present. findRule :: (Ord n, Ord t) => RuleP n t -> DupM n t (Maybe (RuleP n t)) findRule x = do s <- E.gets rulDepo return $ lookupSet x s -- | Join two `SymID`s. joinSym :: SymID -> SymID -> DupM n t () joinSym x y = E.modify $ \s@DupS{..} -> s { symDisj = Part.joinElems x y symDisj } -- | Save the rule in the underlying deposit. keepRule :: (Ord n, Ord t) => RuleP n t -> DupM n t () keepRule r = E.modify $ \s@DupS{..} -> s { rulDepo = S.insert r rulDepo } -- | Retrieve the symbol of the head of the rule. headSym :: RuleP n t -> Maybe SymID headSym r = case headR (unRuleP r) of NonT{..} -> labID AuxVert{..} -> Just symID _ -> Nothing -- | Removing duplicates updating `SymID`s at the same time. -- WARNING: The pipe assumes that `SymID`s to which the present -- rule refers have already been processed -- in other words, -- that rule on which the present rule depends have been -- processed earlier. -- -- This function is responsible for basic sharing of common -- subtrees. rmDups :: (Ord n, Ord t) => P.Pipe (Rule n t) -- Input (Rule n t) -- Output (DupM n t) -- Underlying state () -- No result rmDups = forever $ do r <- P.await >>= lift . updateBody . RuleP lift (findRule r) >>= \mr -> case mr of Nothing -> do lift $ keepRule r P.yield $ unRuleP r Just r' -> case (headSym r, headSym r') of (Just x, Just y) -> lift $ joinSym x y _ -> return () -------------------------------------------------- -- Utilities -------------------------------------------------- -- | Lookup an element in a set. lookupSet :: Ord a => a -> S.Set a -> Maybe a lookupSet x s = do y <- S.lookupLE x s guard $ x == y return y