{-# LANGUAGE TupleSections #-}
module NLP.Concraft.Morphosyntax.Align
( align
, sync
) where
import Prelude hiding (Word)
import Control.Applicative ((<|>))
import Data.Maybe (fromJust)
import Data.List (find)
import qualified Data.Set as S
import qualified Data.Map as M
import qualified Data.Char as C
import qualified Data.Text as T
import qualified Data.Tagset.Positional as P
import NLP.Concraft.Morphosyntax
sync :: Word w => P.Tagset -> [Seg w P.Tag] -> [Seg w P.Tag] -> [Seg w P.Tag]
sync tagset xs ys = concatMap (uncurry (moveDisamb tagset)) (align xs ys)
moveDisamb :: P.Tagset -> [Seg w P.Tag] -> [Seg w P.Tag] -> [Seg w P.Tag]
moveDisamb tagset [v] [w] =
[w {tags = mkWMap (map (,0) tagsNew ++ disambNew)}]
where
tagPairs = M.toList . unWMap . tags
tagsNew = map fst (tagPairs w)
disambNew = [(newDom x, c) | (x, c) <- tagPairs v, c > 0]
newDom tag = fromJust $
find ( ==tag) tagsNew
<|> find (~==tag) tagsNew
<|> Just tag
where
x ~== y = S.size (label x `S.intersection` label y) > 0
label = S.fromList . P.expand tagset
moveDisamb _ xs _ = xs
align :: Word w => [Seg w t] -> [Seg w t] -> [([Seg w t], [Seg w t])]
align [] [] = []
align [] _ = error "align: null xs, not null ys"
align _ [] = error "align: not null xs, null ys"
align xs ys =
let (x, y) = match xs ys
rest = align (drop (length x) xs) (drop (length y) ys)
in (x, y) : rest
match :: Word w => [Seg w t] -> [Seg w t] -> ([Seg w t], [Seg w t])
match xs' ys' =
doIt 0 xs' 0 ys'
where
doIt i (x:xs) j (y:ys)
| n == m = ([x], [y])
| n < m = addL x $ doIt n xs j (y:ys)
| otherwise = addR y $ doIt i (x:xs) m ys
where
n = i + size x
m = j + size y
doIt _ [] _ _ = error "match: the first argument is null"
doIt _ _ _ [] = error "match: the second argument is null"
size w = T.length . T.filter (not.C.isSpace) $ orth w
addL x (xs, ys) = (x:xs, ys)
addR y (xs, ys) = (xs, y:ys)