{-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE FlexibleInstances #-} -- | The module implements the tokenization used within Nerf -- and some other tokenization-related stuff. module NLP.Nerf.Tokenize ( -- * Tokenization tokenize -- * Synchronization , Word (..) , sync ) where import Control.Arrow (second) import Control.Monad ((>=>)) import qualified Data.Char as Char import qualified Data.List as L import qualified Data.Tree as T import qualified Data.Traversable as Tr import qualified Data.Text as Text import qualified Data.Text.Lazy as LazyText import qualified NLP.Tokenize as Tok import qualified Data.IntervalMap.Strict as I import Data.Named.Tree (NeForest, NeTree) ------------------------------------- -- Tokenization definition ------------------------------------- -- | Default tokenizator. defaultTokenizer :: Tok.Tokenizer defaultTokenizer = Tok.whitespace >=> Tok.uris >=> Tok.punctuation -- | Tokenize sentence using the default tokenizer. tokenize :: String -> [String] tokenize = Tok.run defaultTokenizer ------------------------------------- -- Word ------------------------------------- -- | A class of objects which can be converted to `String`. class Word a where word :: a -> String instance Word String where word = id instance Word Text.Text where word = Text.unpack instance Word LazyText.Text where word = LazyText.unpack essence :: Word a => a -> Int essence = length . filter (not . Char.isSpace) . word {-# INLINE essence #-} ------------------------------------- -- Grouping leaves ------------------------------------- unGroupLeaves :: NeForest a [b] -> NeForest a b unGroupLeaves = concatMap unGroupLeavesT unGroupLeavesT :: NeTree a [b] -> [NeTree a b] unGroupLeavesT (T.Node (Left v) xs) = [T.Node (Left v) (unGroupLeaves xs)] unGroupLeavesT (T.Node (Right vs) _) = [T.Node (Right v) [] | v <- vs] --------------------------------------------------------------- -- Identifying ranges --------------------------------------------------------------- type Range = I.Interval Int -- | Range computation step. ranged :: Word a => Int -> a -> (Int, (Range, a)) ranged p w = (q, (i, w)) where q = p + essence w i = I.IntervalCO p q -- | Compute ranges of individual tokens. rangedList :: Word a => [a] -> [(Range, a)] rangedList = snd . L.mapAccumL ranged 0 -- | Compute ranges of individual tokens. rangedForest :: Word b => NeForest a b -> NeForest a (Range, b) rangedForest = snd . L.mapAccumL (Tr.mapAccumL f) 0 where f acc (Left x) = (acc, Left x) f acc (Right x) = let (acc', y) = ranged acc x in (acc', Right y) --------------------------------------------------------------- -- Synchronizing named entities with new sentence tokenization --------------------------------------------------------------- -- | Replace leaves in the NE forest with corresponding tokens. replaceToks :: I.IntervalMap Int c -> NeForest a (Range, b) -> ( I.IntervalMap Int c , NeForest a (Range, c) ) replaceToks ivMap nes = second unGroupLeaves $ L.mapAccumL (Tr.mapAccumL replace) ivMap nes where replace im (Left x) = (im, Left x) replace im (Right (ran, _)) = let rsXs = I.intersecting im ran im' = L.foldl' (flip I.delete) im (map fst rsXs) in (im', Right rsXs) -- | Lift the first range of a tree to the top. liftRange :: NeTree a (Range, b) -> (Range, NeTree a b) liftRange (T.Node (Left v) xs) = (ran, T.Node (Left v) (map snd ys)) where ys = map liftRange xs ran = maybeHead $ map fst ys maybeHead (x:_) = x maybeHead [] = error "liftRange: invalid NE tree" liftRange (T.Node (Right (ran, v)) _) = (ran, T.Node (Right v) []) -- | Synchronize the list of NE trees with the new tokenization. sync :: (Word b, Word c) => NeForest a b -- ^ NE forest -> [c] -- ^ New tokenization -> NeForest a c -- ^ Resulting NE forest sync nes0 xs0 = map snd . I.toList . I.fromList $ map (second mkLeaf) (I.toList ivMap') ++ map liftRange nes' where -- Interval map of the new tokenization ivMap = I.fromList $ rangedList xs0 -- NE non-leaf trees with ranges nes = filter internal $ rangedForest nes0 -- Replace tokens... (ivMap', nes') = replaceToks ivMap nes -- Is it an internal node? internal x = case T.rootLabel x of Left _ -> True Right _ -> False -- Make a leaf tree mkLeaf x = T.Node (Right x) []