-- |
-- Module      :  Conllu.Diff
-- Copyright   :  © 2018 bruno cuconato
-- License     :  LPGL-3
--
-- Maintainer  :  bruno cuconato <bcclaro+hackage@gmail.com>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- Build a diff of CoNLL-U elements (documents, sentences, words). it
-- may show the diff (the print* functions return pairs of the
-- differing fields in two words) or return the word pairs for further
-- processing (the diff* functions). it expects paired sentences as
-- input, and a default pairing function is provided.
--
-- this module is useful for visualizing or debugging the processing
-- of CoNLL-U corpora. be sure that the sentences are well-paired, or
-- else it'll be -- as always -- garbage in, garbage out.

module Conllu.Diff where

import Conllu.Type
import Conllu.Utils

import Data.Maybe
import Data.Ord

---
-- * type synonims
-- | CoNLL-U field diff.
type FDiff = StringPair

-- | pair of different words.
type WDiff a = (CW a, CW a)
-- | list of different words in a sentence.
type SDiff a = [WDiff a]
-- | list of lists of different words in sentences.
type DDiff a  = [SDiff a]

---
-- * diffing functions
diffW :: WDiff a -> Bool
-- | 'True' if any word field pairs are mismatched.
diffW :: WDiff a -> Bool
diffW = (Maybe StringPair -> Bool) -> [Maybe StringPair] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any Maybe StringPair -> Bool
forall a. Maybe a -> Bool
isJust ([Maybe StringPair] -> Bool)
-> (WDiff a -> [Maybe StringPair]) -> WDiff a -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. WDiff a -> [Maybe StringPair]
forall a. WDiff a -> [Maybe StringPair]
printFieldDiffs

diffWs :: [CW a] -> [CW a] -> [WDiff a]
-- | filters the different word pairs.
diffWs :: [CW a] -> [CW a] -> [WDiff a]
diffWs [CW a]
ws1 [CW a]
ws2 = (WDiff a -> Bool) -> [WDiff a] -> [WDiff a]
forall a. (a -> Bool) -> [a] -> [a]
filter WDiff a -> Bool
forall a. WDiff a -> Bool
diffW ([WDiff a] -> [WDiff a]) -> [WDiff a] -> [WDiff a]
forall a b. (a -> b) -> a -> b
$ [CW a] -> [CW a] -> [WDiff a]
forall a b. [a] -> [b] -> [(a, b)]
zip [CW a]
ws1 [CW a]
ws2

diffS :: (Sent, Sent) -> SDiff AW
-- | diffs the sentence pair's words.
diffS :: (Sent, Sent) -> SDiff AW
diffS (Sent
s1, Sent
s2) = [CW AW] -> [CW AW] -> SDiff AW
forall a. [CW a] -> [CW a] -> [WDiff a]
diffWs (Sent -> [CW AW]
_words Sent
s1) (Sent -> [CW AW]
_words Sent
s2)

diffSs :: [(Sent, Sent)] -> DDiff AW
-- | diffs the sentence pairs.
diffSs :: [(Sent, Sent)] -> DDiff AW
diffSs = ((Sent, Sent) -> SDiff AW) -> [(Sent, Sent)] -> DDiff AW
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (Sent, Sent) -> SDiff AW
diffS

---
-- * auxiliary functions
showM :: Show a => Maybe a -> String
-- | shows a word field.
showM :: Maybe a -> String
showM (Just a
x) = a -> String
forall a. Show a => a -> String
show a
x
showM Maybe a
Nothing  = String
"_"

---
-- * pairing functions
pairSentsBy ::
     (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)]
-- | pairs sentences by some ordering of 'Sent'.
pairSentsBy :: (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)]
pairSentsBy Sent -> Sent -> Ordering
_f []  [Sent]
_ss  = []
pairSentsBy Sent -> Sent -> Ordering
_f [Sent]
_ss [] = []
pairSentsBy Sent -> Sent -> Ordering
f ss1 :: [Sent]
ss1@(Sent
s1:[Sent]
st1) ss2 :: [Sent]
ss2@(Sent
s2:[Sent]
st2) =
  case Sent -> Sent -> Ordering
f Sent
s1 Sent
s2 of
    Ordering
LT -> (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)]
pairSentsBy Sent -> Sent -> Ordering
f [Sent]
st1 [Sent]
ss2
    Ordering
GT -> (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)]
pairSentsBy Sent -> Sent -> Ordering
f [Sent]
ss1 [Sent]
st2
    Ordering
EQ -> (Sent
s1, Sent
s2) (Sent, Sent) -> [(Sent, Sent)] -> [(Sent, Sent)]
forall a. a -> [a] -> [a]
: (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)]
pairSentsBy Sent -> Sent -> Ordering
f [Sent]
st1 [Sent]
st2

sentId :: Sent -> Maybe Index
-- | try to find an index in a sentence's metadata looking for
-- 'sent_id = n'.
sentId :: Sent -> Maybe Index
sentId Sent
s =
  let mi :: Maybe String
mi = String -> [StringPair] -> Maybe String
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup String
"sent_id " ([StringPair] -> Maybe String) -> [StringPair] -> Maybe String
forall a b. (a -> b) -> a -> b
$ Sent -> [StringPair]
_meta Sent
s
      i :: String
i = String -> Maybe String -> String
forall a. a -> Maybe a -> a
fromMaybe String
"0" Maybe String
mi
  in String -> Maybe Index
forall a. Read a => String -> Maybe a
safeRead String
i :: Maybe Index

pairSents :: [Sent] -> [Sent] -> [(Sent, Sent)]
-- | pair sentences by their sent_id, found in their metadata.
pairSents :: [Sent] -> [Sent] -> [(Sent, Sent)]
pairSents = (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)]
pairSentsBy ((Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)])
-> (Sent -> Sent -> Ordering) -> [Sent] -> [Sent] -> [(Sent, Sent)]
forall a b. (a -> b) -> a -> b
$ (Sent -> Maybe Index) -> Sent -> Sent -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing Sent -> Maybe Index
sentId

---
-- * printing functions
printFieldDiffs :: WDiff a -> [Maybe StringPair]
-- | list of maybe differing fields in a pair of words.
printFieldDiffs :: WDiff a -> [Maybe StringPair]
printFieldDiffs (CW a
w1, CW a
w2) = ((CW a -> String) -> Maybe StringPair)
-> [CW a -> String] -> [Maybe StringPair]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (CW a -> CW a -> (CW a -> String) -> Maybe StringPair
forall b p. Eq b => p -> p -> (p -> b) -> Maybe (b, b)
diffField CW a
w1 CW a
w2) [CW a -> String]
forall a. [CW a -> String]
pfs
  where
    diffField :: p -> p -> (p -> b) -> Maybe (b, b)
diffField p
w p
w' p -> b
pf =
      let pf1 :: b
pf1 = p -> b
pf p
w
          pf2 :: b
pf2 = p -> b
pf p
w'
      in if b
pf1 b -> b -> Bool
forall a. Eq a => a -> a -> Bool
/= b
pf2
           then (b, b) -> Maybe (b, b)
forall a. a -> Maybe a
Just (b
pf1, b
pf2)
           else Maybe (b, b)
forall a. Maybe a
Nothing
    pfs :: [CW a -> String]
pfs =
      [ Maybe String -> String
forall a. Show a => Maybe a -> String
showM (Maybe String -> String)
-> (CW a -> Maybe String) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> Maybe String
forall a. CW a -> Maybe String
_form
      , Maybe String -> String
forall a. Show a => Maybe a -> String
showM (Maybe String -> String)
-> (CW a -> Maybe String) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> Maybe String
forall a. CW a -> Maybe String
_lemma
      , Maybe POS -> String
forall a. Show a => Maybe a -> String
showM (Maybe POS -> String) -> (CW a -> Maybe POS) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> Maybe POS
forall a. CW a -> Maybe POS
_upos
      , Maybe String -> String
forall a. Show a => Maybe a -> String
showM (Maybe String -> String)
-> (CW a -> Maybe String) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> Maybe String
forall a. CW a -> Maybe String
_xpos
      , FEATS -> String
forall a. Show a => a -> String
show  (FEATS -> String) -> (CW a -> FEATS) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> FEATS
forall a. CW a -> FEATS
_feats
      , Maybe Rel -> String
forall a. Show a => Maybe a -> String
showM (Maybe Rel -> String) -> (CW a -> Maybe Rel) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> Maybe Rel
forall a. CW a -> Maybe Rel
_rel
      , DEPS -> String
forall a. Show a => a -> String
show  (DEPS -> String) -> (CW a -> DEPS) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> DEPS
forall a. CW a -> DEPS
_deps
      , Maybe String -> String
forall a. Show a => Maybe a -> String
showM (Maybe String -> String)
-> (CW a -> Maybe String) -> CW a -> String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW a -> Maybe String
forall a. CW a -> Maybe String
_misc
      ]

printWDiff :: WDiff a -> [StringPair]
-- | list of differing fields in a pair of words.
printWDiff :: WDiff a -> [StringPair]
printWDiff = [Maybe StringPair] -> [StringPair]
forall a. [Maybe a] -> [a]
catMaybes ([Maybe StringPair] -> [StringPair])
-> (WDiff a -> [Maybe StringPair]) -> WDiff a -> [StringPair]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. WDiff a -> [Maybe StringPair]
forall a. WDiff a -> [Maybe StringPair]
printFieldDiffs

printSDiff :: SDiff a -> [[StringPair]]
-- | list of differing words in a sentence.
printSDiff :: SDiff a -> [[StringPair]]
printSDiff = (WDiff a -> [StringPair]) -> SDiff a -> [[StringPair]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap WDiff a -> [StringPair]
forall a. WDiff a -> [StringPair]
printWDiff

printDDiff :: DDiff a -> [[[StringPair]]]
-- | list of lists of differing words in sentences.
printDDiff :: DDiff a -> [[[StringPair]]]
printDDiff = (SDiff a -> [[StringPair]]) -> DDiff a -> [[[StringPair]]]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap SDiff a -> [[StringPair]]
forall a. SDiff a -> [[StringPair]]
printSDiff