module NLP.SyntaxNet.Types.CoNLL where
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as BSC
import Data.Char (toUpper, isUpper, toLower)
import Data.Csv as Csv
import Data.Maybe
import Data.Default
import Protolude
import Data.List.Split
import qualified Data.Text as T
import GHC.Generics
import Data.ConllToken (ConllToken(..), SyntaxErrorCoNLL(..))
import Data.SyntaxTree (SyntaxtTree(..), createSyntaxTree)
import Model.PennTreebank
import Model.UniversalTreebank
import Data.TagLabel
-- | 'ConllToken' specialised to the tag types used in this module: 'PosCG',
-- 'POS', 'REL' and 'T.Text' fill the first four type parameters; the last
-- parameter is left open for the caller.
type SnConllToken a = ConllToken PosCG POS REL T.Text a
-- | 'SyntaxtTree' instantiated with the same type parameters as
-- 'SnConllToken'.
type SnConllTree a = SyntaxtTree PosCG POS REL T.Text a
-- | Coarse-grained part-of-speech tag.
--
-- Values of this type are produced by 'parsePosCf' from the textual tag
-- found in a CoNLL record.
data PosCG
  = VERB
  | NOUN
  | PRON
  | ADJ
  | ADV
  | ADP
  | CONJ
  | DET
  | NUM
  | PRT
  | X
  | PUNCT -- ^ Spelled @"."@ in the textual form accepted by 'parsePosCf'.
  | UnkCg -- ^ Fallback used by 'parsePosCf' for any unrecognised tag text.
  deriving (Show, Eq, Generic)
-- | Decode a ten-column record into an 'SnConllToken'.
--
-- Columns 3, 4 and 7 (zero-based) are read as text and converted with
-- 'parsePosCf', 'parsePosFg' and 'parseGER' respectively; every other column
-- is decoded directly by the field parser for the corresponding 'ConllToken'
-- field. Columns are consumed left to right, so the first column that is
-- missing or fails to decode determines the parse failure.
instance Csv.FromRecord (SnConllToken T.Text) where
  parseRecord row =
    ConllToken
      <$> row .! 0
      <*> row .! 1
      <*> row .! 2
      <*> (parsePosCf <$> row .! 3)
      <*> (parsePosFg <$> row .! 4)
      <*> row .! 5
      <*> row .! 6
      <*> (parseGER <$> row .! 7)
      <*> row .! 8
      <*> row .! 9
-- | Convert the textual coarse-grained POS tag into a 'PosCG' value.
--
-- Matching is case-insensitive: the input is upper-cased character by
-- character before comparison. The text @"."@ denotes 'PUNCT'. Anything
-- that is not recognised yields 'UnkCg'.
parsePosCf :: T.Text -> PosCG
parsePosCf = classify . map toUpper . T.unpack
  where
    -- Operates on the already upper-cased tag.
    classify :: [Char] -> PosCG
    classify "VERB" = VERB
    classify "NOUN" = NOUN
    classify "PRON" = PRON
    classify "ADJ"  = ADJ
    classify "ADV"  = ADV
    classify "ADP"  = ADP
    classify "CONJ" = CONJ
    classify "DET"  = DET
    classify "NUM"  = NUM
    classify "PRT"  = PRT
    classify "X"    = X
    classify "."    = PUNCT
    classify _      = UnkCg
-- | Convert the textual fine-grained POS tag into a 'POS' value.
--
-- Matching is case-insensitive: the input is upper-cased character by
-- character before comparison.
--
-- The possessive tags @PRP$@ and @WP$@ are resolved through 'fromLabelText'
-- using their textual label (presumably because @$@ cannot appear in a
-- constructor name). If 'fromLabelText' does not know the label, the result
-- falls back to 'CC', the same default used for every unrecognised tag, so
-- the function is total.
--
-- NOTE(review): 'CC' as the "unknown" default is kept for backward
-- compatibility; callers cannot distinguish a real @CC@ from an unknown tag.
parsePosFg :: T.Text -> POS
parsePosFg s =
  case map toUpper (T.unpack s) of
    "CC"   -> CC
    "CD"   -> CD
    "DT"   -> DT
    "EX"   -> EX
    "FW"   -> FW
    "IN"   -> IN
    "JJ"   -> JJ
    "JJR"  -> JJR
    "JJS"  -> JJS
    "LS"   -> LS
    "MD"   -> MD
    "NN"   -> NN
    "NNS"  -> NNS
    "NNP"  -> NNP
    "NNPS" -> NNPS
    "PDT"  -> PDT
    "POS"  -> POS
    "PRP"  -> PRP
    -- Previously looked up the misspelled label "RPR$" with 'fromJust'.
    "PRP$" -> fromMaybe CC (fromLabelText "PRP$")
    "RB"   -> RB
    "RBR"  -> RBR
    "RBS"  -> RBS
    "RP"   -> RP
    "SYM"  -> SYM
    "TO"   -> TO
    "UH"   -> UH
    "VB"   -> VB
    "VBD"  -> VBD
    "VBG"  -> VBG
    "VBN"  -> VBN
    "VBP"  -> VBP
    "VBZ"  -> VBZ
    "WDT"  -> WDT
    "WP"   -> WP
    -- Possessive wh-pronoun; degrades to the old behaviour ('CC') if the
    -- label table has no entry for it.
    "WP$"  -> fromMaybe CC (fromLabelText "WP$")
    "WRB"  -> WRB
    _      -> CC
-- | Convert the textual dependency-relation label into a 'REL' value.
--
-- Matching is case-sensitive. Labels containing a subtype (e.g.
-- @acl:relcl@) and @fixed@ are resolved through 'fromLabelText'; those
-- branches use 'fromJust' and therefore raise an error when evaluated if
-- 'fromLabelText' does not know the label. Any label that is not listed
-- yields 'Punct'.
--
-- The correctly spelled labels @advcl@, @discourse@, @dislocated@ and
-- @vocative@ are accepted; the misspellings this function used to match
-- (@advck@, @discource@, @discolated@, @vocatile@) are retained as aliases
-- so existing inputs keep their meaning. Both @ROOT@ and @root@ map to
-- 'ROOT'.
parseGER :: T.Text -> REL
parseGER s =
  case s of
    "acl"          -> Acl
    "acl:relcl"    -> fromJust $ fromLabelText "acl:relcl"
    "advcl"        -> Advcl
    "advck"        -> Advcl -- legacy misspelling, kept for compatibility
    "advmod"       -> Advmod
    "amod"         -> Amod
    "appos"        -> Appos
    "aux"          -> Aux
    "auxpass"      -> Auxpass
    "case"         -> Case
    "cc"           -> Cc
    "cc:preconj"   -> fromJust $ fromLabelText "cc:preconj"
    "ccomp"        -> Ccomp
    "compound"     -> Compound
    "compound:prt" -> fromJust $ fromLabelText "compound:prt"
    "conj"         -> Conj
    "cop"          -> Cop
    "csubj"        -> Csubj
    "csubjpass"    -> Csubjpass
    "dep"          -> Dep
    "det"          -> Det
    "det:predet"   -> fromJust $ fromLabelText "det:predet"
    "discourse"    -> Discourse
    "discource"    -> Discourse -- legacy misspelling, kept for compatibility
    "dislocated"   -> Dislocated
    "discolated"   -> Dislocated -- legacy misspelling, kept for compatibility
    "dobj"         -> Dobj
    "expl"         -> Expl
    "fixed"        -> fromJust $ fromLabelText "fixed"
    "fixed:not"    -> fromJust $ fromLabelText "fixed:not"
    "flat"         -> Flat
    "foreign"      -> Foreign
    "goeswith"     -> Goeswith
    "iobj"         -> Iobj
    "list"         -> List
    "mark"         -> Mark
    "neg"          -> Neg
    "nmod"         -> Nmod
    "nmod:npmod"   -> fromJust $ fromLabelText "nmod:npmod"
    "nmod:poss"    -> fromJust $ fromLabelText "nmod:poss"
    "nmod:tmod"    -> fromJust $ fromLabelText "nmod:tmod"
    "nsubj"        -> Nsubj
    "nsubjpass"    -> Nsubjpass
    "nummod"       -> Nummod
    "orphan"       -> Orphan
    "parataxis"    -> Parataxis
    "punct"        -> Punct
    "reparandum"   -> Reparandum
    "ROOT"         -> ROOT
    "root"         -> ROOT
    "vocative"     -> Vocative
    "vocatile"     -> Vocative -- legacy misspelling, kept for compatibility
    "xcomp"        -> Xcomp
    _              -> Punct