module NLP.Nerf.Tokenize
(
tokenize
, Word (..)
, moveNEs
) where
import Control.Monad ((>=>))
import Data.Foldable (foldMap)
import qualified Data.Char as Char
import qualified Data.List as L
import qualified Data.Tree as T
import qualified Data.Traversable as Tr
import qualified Data.Text as Text
import qualified Data.Text.Lazy as LazyText
import qualified NLP.Tokenize as Tok
import Data.Named.Tree (NeForest, NeTree, groupForestLeaves)
defaultTokenizer :: Tok.Tokenizer
defaultTokenizer
= Tok.whitespace
>=> Tok.uris
>=> Tok.punctuation
tokenize :: String -> [String]
tokenize = Tok.run defaultTokenizer
class Word a where
word :: a -> String
instance Word String where
word = id
instance Word Text.Text where
word = Text.unpack
instance Word LazyText.Text where
word = LazyText.unpack
essence :: Word a => a -> Int
essence = length . filter (not . Char.isSpace) . word
type Sync a b = [([a], [b])]
sync :: (Word a, Word b) => [a] -> [b] -> Sync a b
sync = sync' 0
sync' :: (Word a, Word b) => Int -> [a] -> [b] -> Sync a b
sync' r (x:xs) (y:ys)
| n + r == m = ([x], [y]) : sync' 0 xs ys
| n + r < m = join x $ sync' (n + r) xs (y:ys)
| otherwise = swap . join y $ sync' (m r) ys (x:xs)
where
n = essence x
m = essence y
join l ((ls, rs) : ps) = (l:ls, rs) : ps
join _ [] = error "sync'.join: bad arguments"
swap ((ls, rs) : ps) = (rs, ls) : swap ps
swap [] = []
sync' 0 [] [] = []
sync' _ _ _ = error "sync': bad arguments"
match :: (Word a, Word b) => [a] -> Sync a b -> ([b], Sync a b)
match xs ss =
let (sl, sr) = splitAcc isMatch 0 ss
in (concatMap snd sl, sr)
where
n = sum (map essence xs)
isMatch r (ys, _)
| m + r < n = (m + r, False)
| m + r == n = (m + r, True)
| otherwise = error "match.isMatch: no match"
where
m = sum (map essence ys)
splitAcc :: (acc -> a -> (acc, Bool)) -> acc -> [a] -> ([a], [a])
splitAcc _ _ [] = ([], [])
splitAcc f acc (x:xs)
| cond = ([x], xs)
| otherwise = join x (splitAcc f acc' xs)
where
(acc', cond) = f acc x
join y (ys, zs) = (y:ys, zs)
leaves :: NeForest a b -> [b]
leaves = concatMap $ foldMap (either (const []) (:[]))
unGroupLeaves :: NeForest a [b] -> NeForest a b
unGroupLeaves = concatMap unGroupLeavesT
unGroupLeavesT :: NeTree a [b] -> [NeTree a b]
unGroupLeavesT (T.Node (Left v) xs) =
[T.Node (Left v) (unGroupLeaves xs)]
unGroupLeavesT (T.Node (Right vs) _) =
[T.Node (Right v) [] | v <- vs]
substGroups :: (Word b, Word c) => NeForest a [b] -> Sync b c -> NeForest a [c]
substGroups fs ss = snd $ L.mapAccumL substGroupsT ss fs
substGroupsT
:: (Word b, Word c)
=> Sync b c -> NeTree a [b]
-> (Sync b c, NeTree a [c])
substGroupsT =
Tr.mapAccumL f
where
f s (Left v) = (s, Left v)
f s (Right v) =
let (v', s') = match v s
in (s', Right v')
moveNEs :: (Word b, Word c) => NeForest a b -> [c] -> NeForest a c
moveNEs ft ys
= unGroupLeaves
$ substGroups
(groupForestLeaves true ft)
(sync (leaves ft) ys)
where
true _ _ = True