-- |
-- Module      :  Conllu.Type
-- Copyright   :  © 2018 bruno cuconato
-- License     :  LPGL-3
--
-- Maintainer  :  bruno cuconato <bcclaro+hackage@gmail.com>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- defines types for handling CoNLL-U data.

{-# LANGUAGE EmptyDataDecls #-}

module Conllu.Type where

---
-- imports
import           Conllu.Utils
import qualified Conllu.UposTagset as U
import qualified Conllu.DeprelTagset as D

import           Data.Ord

---
-- * type and data declarations
-- ** Documents and Sentences
type Doc = [Sent]

data Sent = Sent
  { Sent -> [Comment]
_meta  :: [Comment]  -- ^ the sentence's comments.
  , Sent -> [CW AW]
_words :: [CW AW] -- ^ the sentence's words.
  } deriving (Sent -> Sent -> Bool
(Sent -> Sent -> Bool) -> (Sent -> Sent -> Bool) -> Eq Sent
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Sent -> Sent -> Bool
$c/= :: Sent -> Sent -> Bool
== :: Sent -> Sent -> Bool
$c== :: Sent -> Sent -> Bool
Eq, Int -> Sent -> ShowS
[Sent] -> ShowS
Sent -> String
(Int -> Sent -> ShowS)
-> (Sent -> String) -> ([Sent] -> ShowS) -> Show Sent
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Sent] -> ShowS
$cshowList :: [Sent] -> ShowS
show :: Sent -> String
$cshow :: Sent -> String
showsPrec :: Int -> Sent -> ShowS
$cshowsPrec :: Int -> Sent -> ShowS
Show)

-- | most comments are (key, value) pairs.
type Comment    = StringPair
type StringPair = (String, String)

-- ** Words
-- | represents a word line in a CoNLL-U file. note that we have
-- collapsed some fields together: 'HEAD' and DEPREL have been
-- combined as a relation type Rel accessible by the '_rel' function;
-- the 'DEPS' field is merely a list of 'Rel'.
--
-- a C(oNLL-U)W(ord) may be a simple word, a multi-word token, or an
-- empty node. this is captured by the phantom type (the `a` in the
-- declaration), which can be parametrized by one of the data types
-- below in order to build functions that only operate on one of these
-- word types (see 'mkSWord' on how to do this). see the '_dep'
-- function, which only operates on simple words, which are the ones
-- that have a DEPREL field.
data CW a = CW
  { CW a -> ID
_id    :: ID        -- ^ ID field
  , CW a -> FORM
_form  :: FORM      -- ^ FORM field
  , CW a -> FORM
_lemma :: LEMMA     -- ^ LEMMA field
  , CW a -> UPOS
_upos  :: UPOS      -- ^ UPOS field
  , CW a -> FORM
_xpos  :: XPOS      -- ^ XPOS field
  , CW a -> FEATS
_feats :: FEATS     -- ^ FEATS field
  , CW a -> Maybe Rel
_rel   :: Maybe Rel -- ^ combined HEAD and DEPREL fields
  , CW a -> DEPS
_deps  :: DEPS      -- ^ DEPS field
  , CW a -> FORM
_misc  :: MISC      -- ^ MISC field
  } deriving (CW a -> CW a -> Bool
(CW a -> CW a -> Bool) -> (CW a -> CW a -> Bool) -> Eq (CW a)
forall a. CW a -> CW a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: CW a -> CW a -> Bool
$c/= :: forall a. CW a -> CW a -> Bool
== :: CW a -> CW a -> Bool
$c== :: forall a. CW a -> CW a -> Bool
Eq, Int -> CW a -> ShowS
[CW a] -> ShowS
CW a -> String
(Int -> CW a -> ShowS)
-> (CW a -> String) -> ([CW a] -> ShowS) -> Show (CW a)
forall a. Int -> CW a -> ShowS
forall a. [CW a] -> ShowS
forall a. CW a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [CW a] -> ShowS
$cshowList :: forall a. [CW a] -> ShowS
show :: CW a -> String
$cshow :: forall a. CW a -> String
showsPrec :: Int -> CW a -> ShowS
$cshowsPrec :: forall a. Int -> CW a -> ShowS
Show)

instance Ord (CW a) where
  compare :: CW a -> CW a -> Ordering
compare = (CW a -> ID) -> CW a -> CW a -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing CW a -> ID
forall a. CW a -> ID
_id

-- *** Word types
-- | phantom type for any kind of word.
data AW
-- | phantom type for a simple word.
data SW
-- | phantom type for multiword tokens. do note that in MWTs only the
-- 'ID', 'FORM' and 'MISC' fields may be non-empty.
data MT
-- | phantom type for an empty node.
data EN

-- *** Word Fields
data ID -- | Word ID field.
  = SID Index -- ^ word ID is an integer
  | MID Index
        Index -- ^ multi-word token ID is a range
  | EID Index
        Index -- ^ empty node ID is a decimal
  deriving (ID -> ID -> Bool
(ID -> ID -> Bool) -> (ID -> ID -> Bool) -> Eq ID
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: ID -> ID -> Bool
$c/= :: ID -> ID -> Bool
== :: ID -> ID -> Bool
$c== :: ID -> ID -> Bool
Eq, Int -> ID -> ShowS
[ID] -> ShowS
ID -> String
(Int -> ID -> ShowS)
-> (ID -> String) -> ([ID] -> ShowS) -> Show ID
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [ID] -> ShowS
$cshowList :: [ID] -> ShowS
show :: ID -> String
$cshow :: ID -> String
showsPrec :: Int -> ID -> ShowS
$cshowsPrec :: Int -> ID -> ShowS
Show)

instance Ord ID where
  compare :: ID -> ID -> Ordering
compare = ID -> ID -> Ordering
idOrd
    where
      idOrd :: ID -> ID -> Ordering
      idOrd :: ID -> ID -> Ordering
idOrd (SID Int
x) (SID Int
y) = Int -> Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare Int
x Int
y
      idOrd ID
id1 ID
id2 =
        let c :: Ordering
c = (ID -> Int) -> ID -> ID -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing ID -> Int
fstIx ID
id1 ID
id2
        in case Ordering
c of
          Ordering
EQ -> ID -> ID -> Ordering
sameIx ID
id1 ID
id2
          Ordering
_ -> Ordering
c
        where
          fstIx :: ID -> Index
          fstIx :: ID -> Int
fstIx (SID Int
i) = Int
i
          fstIx (MID Int
i Int
_ei) = Int
i
          fstIx (EID Int
i Int
_ei) = Int
i
          sndIx :: ID -> Index
          sndIx :: ID -> Int
sndIx (EID Int
_s Int
e) = Int
e
          sndIx (MID Int
_s Int
e) = Int
e
          sameIx :: ID -> ID -> Ordering
          sameIx :: ID -> ID -> Ordering
sameIx (SID Int
_) ID
_id = Ordering
GT
          sameIx ID
_id (SID Int
_) = Ordering
LT
          -- reverse ID order so that MID 1 4 comes before MID 1 2:
          sameIx ID
i1 ID
i2 = (ID -> Int) -> ID -> ID -> Ordering
forall a b. Ord a => (b -> a) -> b -> b -> Ordering
comparing ID -> Int
sndIx ID
i2 ID
i1

type FORM  = Maybe String
type LEMMA = Maybe String
type UPOS  = Maybe U.POS
type XPOS  = Maybe String
type FEATS = [Feat]
type HEAD  = ID
type DEPS  = [Rel]
type MISC  = Maybe String

-- | feature representation
data Feat = Feat
  { Feat -> String
_feat       :: String       -- ^ feature name
  , Feat -> [String]
_featValues :: [String]     -- ^ feature values
  , Feat -> FORM
_featType   :: Maybe String -- ^ feature type (inside brackets).
  } deriving (Feat -> Feat -> Bool
(Feat -> Feat -> Bool) -> (Feat -> Feat -> Bool) -> Eq Feat
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Feat -> Feat -> Bool
$c/= :: Feat -> Feat -> Bool
== :: Feat -> Feat -> Bool
$c== :: Feat -> Feat -> Bool
Eq, Int -> Feat -> ShowS
FEATS -> ShowS
Feat -> String
(Int -> Feat -> ShowS)
-> (Feat -> String) -> (FEATS -> ShowS) -> Show Feat
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: FEATS -> ShowS
$cshowList :: FEATS -> ShowS
show :: Feat -> String
$cshow :: Feat -> String
showsPrec :: Int -> Feat -> ShowS
$cshowsPrec :: Int -> Feat -> ShowS
Show) -- add manual Ord instance?

-- | dependency relation representation.
data Rel = Rel
  { Rel -> ID
_head :: HEAD -- ^ head 'ID'
  , Rel -> EP
_deprel :: D.EP -- ^ dependency relation type
  , Rel -> FORM
_subdep :: Maybe String -- ^ dependency relation subtype
  , Rel -> Maybe [String]
_rest :: Maybe [String] -- ^ provisitonal, see issues #23,#17
  } deriving (Rel -> Rel -> Bool
(Rel -> Rel -> Bool) -> (Rel -> Rel -> Bool) -> Eq Rel
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Rel -> Rel -> Bool
$c/= :: Rel -> Rel -> Bool
== :: Rel -> Rel -> Bool
$c== :: Rel -> Rel -> Bool
Eq, Int -> Rel -> ShowS
DEPS -> ShowS
Rel -> String
(Int -> Rel -> ShowS)
-> (Rel -> String) -> (DEPS -> ShowS) -> Show Rel
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: DEPS -> ShowS
$cshowList :: DEPS -> ShowS
show :: Rel -> String
$cshow :: Rel -> String
showsPrec :: Int -> Rel -> ShowS
$cshowsPrec :: Int -> Rel -> ShowS
Show)

type Index   = Int
-- | 'ID' separator in meta words
type IxSep   = Char

---
-- ** accessor functions
_dep :: CW SW -> Maybe D.EP
-- | get DEPREL main value, if it exists.
_dep :: CW SW -> Maybe EP
_dep CW SW
w = EP -> Maybe EP
forall a. a -> Maybe a
Just (EP -> Maybe EP) -> (Rel -> EP) -> Rel -> Maybe EP
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Rel -> EP
_deprel (Rel -> Maybe EP) -> Maybe Rel -> Maybe EP
forall (m :: * -> *) a b. Monad m => (a -> m b) -> m a -> m b
=<< CW SW -> Maybe Rel
forall a. CW a -> Maybe Rel
_rel CW SW
w

depIs :: D.EP -> CW SW -> Bool
-- | check if DEP is the one provided.
depIs :: EP -> CW SW -> Bool
depIs EP
d = Bool -> (EP -> Bool) -> Maybe EP -> Bool
forall b a. b -> (a -> b) -> Maybe a -> b
maybe Bool
False (EP
d EP -> EP -> Bool
forall a. Eq a => a -> a -> Bool
==) (Maybe EP -> Bool) -> (CW SW -> Maybe EP) -> CW SW -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CW SW -> Maybe EP
_dep

---
-- ** constructor functions
mkDEP :: String -> D.EP
-- | read a main DEPREL (no subtype).
mkDEP :: String -> EP
mkDEP = String -> EP
forall a. Read a => String -> a
read (String -> EP) -> ShowS -> String -> EP
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
upcaseStr

mkUPOS :: String -> U.POS
-- | read an 'UPOS' tag.
mkUPOS :: String -> POS
mkUPOS = String -> POS
forall a. Read a => String -> a
read (String -> POS) -> ShowS -> String -> POS
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS
upcaseStr

-- words
mkAW :: ID -> FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> Maybe Rel
  -> DEPS -> MISC -> CW AW
-- | make a word from its fields, by default it has phantom type of AW
-- (any kind of word).
mkAW :: ID
-> FORM
-> FORM
-> UPOS
-> FORM
-> FEATS
-> Maybe Rel
-> DEPS
-> FORM
-> CW AW
mkAW = ID
-> FORM
-> FORM
-> UPOS
-> FORM
-> FEATS
-> Maybe Rel
-> DEPS
-> FORM
-> CW AW
forall a.
ID
-> FORM
-> FORM
-> UPOS
-> FORM
-> FEATS
-> Maybe Rel
-> DEPS
-> FORM
-> CW a
CW

mkSW :: CW AW -> CW SW
-- | coerce a word to a simple word.
mkSW :: CW AW -> CW SW
mkSW CW { $sel:_id:CW :: forall a. CW a -> ID
_id = ID
i
        , $sel:_form:CW :: forall a. CW a -> FORM
_form = FORM
f
        , $sel:_lemma:CW :: forall a. CW a -> FORM
_lemma = FORM
l
        , $sel:_upos:CW :: forall a. CW a -> UPOS
_upos = UPOS
u
        , $sel:_xpos:CW :: forall a. CW a -> FORM
_xpos = FORM
x
        , $sel:_feats:CW :: forall a. CW a -> FEATS
_feats = FEATS
fs
        , $sel:_rel:CW :: forall a. CW a -> Maybe Rel
_rel = Maybe Rel
r
        , $sel:_deps:CW :: forall a. CW a -> DEPS
_deps = DEPS
ds
        , $sel:_misc:CW :: forall a. CW a -> FORM
_misc = FORM
m
        } = ID
-> FORM
-> FORM
-> UPOS
-> FORM
-> FEATS
-> Maybe Rel
-> DEPS
-> FORM
-> CW SW
forall a.
ID
-> FORM
-> FORM
-> UPOS
-> FORM
-> FEATS
-> Maybe Rel
-> DEPS
-> FORM
-> CW a
CW ID
i FORM
f FORM
l UPOS
u FORM
x FEATS
fs Maybe Rel
r DEPS
ds FORM
m

{-- saved for a future validation module
---
-- validation
mTkOK :: FORM -> LEMMA -> UPOS -> XPOS -> FEATS -> (ID, (Dep, Maybe String))
  -> Bool
mTkOK fo l up xp fe h dr d =
  assSomething fo $
  assNothing l $
  assNothing up $
  assNothing xp $
  assNull fe $ assNothing h $ assNothing dr $ assNull d True

eTkOK :: Dephead -> DepRel -> Deps -> Bool
eTkOK h dr d =
  assNothing h $ assNothing dr $ (assert . not . null $ d) True
--}