-- | -- Module : Conllu.Diff -- Copyright : © 2018 bruno cuconato -- License : LPGL-3 -- -- Maintainer : bruno cuconato -- Stability : experimental -- Portability : non-portable -- -- Build a diff of CoNLL-U elements (documents, sentences, words). it -- may show the diff (the print* functions return pairs of the -- differing fields in two words) or return the word pairs for further -- processing (the diff* functions). it expects paired sentences as -- input, and a default pairing function is provided. -- -- this module is useful for visualizing or debugging the processing -- of CoNLL-U corpora. be sure that the sentences are well-paired, or -- else it'll be -- as always -- garbage in, garbage out. module Conllu.Diff where import Conllu.Type import Conllu.Utils import Data.Maybe import Data.Ord --- -- * type synonims -- | CoNLL-U field diff. type FDiff = StringPair -- | pair of different words. type WDiff a = (CW a, CW a) -- | list of different words in a sentence. type SDiff a = [WDiff a] -- | list of lists of different words in sentences. type DDiff a = [SDiff a] --- -- * diffing functions diffW :: WDiff a -> Bool -- | 'True' if any word field pairs are mismatched. diffW = any isJust . printFieldDiffs diffWs :: [CW a] -> [CW a] -> [WDiff a] -- | filters the different word pairs. diffWs ws1 ws2 = filter diffW $ zip ws1 ws2 diffS :: (Sent, Sent) -> SDiff AW -- | diffs the sentence pair's words. diffS (s1, s2) = diffWs (_words s1) (_words s2) diffSs :: [(Sent, Sent)] -> DDiff AW -- | diffs the sentence pairs. diffSs = fmap diffS --- -- * auxiliary functions showM :: Show a => Maybe a -> String -- | shows a word field. showM (Just x) = show x showM Nothing = "_" --- -- * pairing functions pairSentsBy :: (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)] -- | pairs sentences by some ordering of 'Sent'. pairSentsBy _f [] _ss = [] pairSentsBy _f _ss [] = [] pairSentsBy f ss1@(s1:st1) ss2@(s2:st2) = case f s1 s2 of LT -> pairSentsBy f st1 ss2 GT -> pairSentsBy f ss1 st2 EQ -> (s1, s2) : pairSentsBy f st1 st2 sentId :: Sent -> Maybe Index -- | try to find an index in a sentence's metadata looking for -- 'sent_id = n'. sentId s = let mi = lookup "sent_id " $ _meta s i = fromMaybe "0" mi in safeRead i :: Maybe Index pairSents :: [Sent] -> [Sent] -> [(Sent, Sent)] -- | pair sentences by their sent_id, found in their metadata. pairSents = pairSentsBy $ comparing sentId --- -- * printing functions printFieldDiffs :: WDiff a -> [Maybe StringPair] -- | list of maybe differing fields in a pair of words. printFieldDiffs (w1, w2) = fmap (diffField w1 w2) pfs where diffField w w' pf = let pf1 = pf w pf2 = pf w' in if pf1 /= pf2 then Just (pf1, pf2) else Nothing pfs = [ showM . _form , showM . _lemma , showM . _upos , showM . _xpos , show . _feats , showM . _rel , show . _deps , showM . _misc ] printWDiff :: WDiff a -> [StringPair] -- | list of differing fields in a pair of words. printWDiff = catMaybes . printFieldDiffs printSDiff :: SDiff a -> [[StringPair]] -- | list of differing words in a sentence. printSDiff = fmap printWDiff printDDiff :: DDiff a -> [[[StringPair]]] -- | list of lists of differing words in sentences. printDDiff = fmap printSDiff