-- | Morphosyntactic analysis utilities. -- -- See `reAnaSent` function for a description of how reanalsis is performed. -- At some point it would be nice to change the entire process so that -- sentence-level segmentation is also taken from the reanalysed data. module NLP.Concraft.Analysis ( -- * Analysis Analyse -- * Reanalysis , reAnaSent , reAnaPar ) where import qualified Control.Monad.LazyIO as LazyIO import qualified Data.Text.Lazy as L import NLP.Concraft.Morphosyntax import qualified Data.Tagset.Positional as P import qualified NLP.Concraft.Morphosyntax.Align as A --------------------- -- Analysis --------------------- -- | An analyser performs word-level segmentation and morphological analysis. type Analyse w t = L.Text -> IO (Sent w t) --------------------- -- Reanalysis --------------------- -- | Reanalyse sentence. -- -- From the reference sentence the function takes: -- -- * Word-level segmentation -- -- * Chosen interpretations (tags) -- -- From the reanalysed sentence the function takes: -- -- * Potential interpretations -- reAnaSent :: Word w => P.Tagset -> Analyse w P.Tag -> SentO w P.Tag -> IO (Sent w P.Tag) reAnaSent tagset ana sent = do let gold = segs sent reana <- ana (orig sent) return $ A.sync tagset gold reana -- | Reanalyse paragraph. reAnaPar :: Word w => P.Tagset -> Analyse w P.Tag -> [SentO w P.Tag] -> IO [Sent w P.Tag] reAnaPar tagset ana = LazyIO.mapM (reAnaSent tagset ana) --------------------- -- Junk --------------------- -- -- | Reanalyse paragraph. -- reanalyse :: Word w => P.Tagset -> Analyse w P.Tag -- -> [SentO w P.Tag] -> [Sent w P.Tag] -- reanalyse tagset ana xs = chunk -- -- We have to take sentence lengths from the reference corpus because -- -- token-level segmentation is also taken from the reference corpus -- -- (in case of inconsistencies between the two corpora). -- (map length gold) -- (A.sync tagset (concat gold) (concat reana)) -- where -- gold = map segs xs -- reana = ana . L.concat $ map orig xs -- -- -- -- | Divide the list into a list of chunks given the list of -- -- lengths of individual chunks. -- chunk :: [Int] -> [a] -> [[a]] -- chunk (n:ns) xs = -- let (first, rest) = splitAt n xs -- in first : chunk ns rest -- chunk [] [] = [] -- chunk [] _ = error "chunk: absurd"