{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE DeriveAnyClass #-} {-# LANGUAGE TypeSynonymInstances #-} {-# LANGUAGE FlexibleInstances #-} module NLP.SyntaxNet.Types.CoNLL where import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as BSC import Data.Char (toUpper, isUpper, toLower) import Data.Csv as Csv import Data.Maybe import Data.Default import Protolude import Data.List.Split import qualified Data.Text as T import GHC.Generics import Data.ConllToken (ConllToken(..), SyntaxErrorCoNLL(..)) import Data.SyntaxTree (SyntaxtTree(..), createSyntaxTree) import Model.PennTreebank import Model.UniversalTreebank import Data.TagLabel -------------------------------------------------------------------------------- type SnConllToken a = ConllToken PosCG POS REL T.Text a type SnConllTree a = SyntaxtTree PosCG POS REL T.Text a -- | See "A Universal Part-of-Speech Tagset" by Slav Petrov, Dipanjan Das and Ryan McDonald -- for more details http://arxiv.org/abs/1104.2086 -- Sum type for 12 universal part-of-speech tags, coarse-grained data PosCG = VERB -- verbs (all tenses and modes) | NOUN -- nouns (common and proper) | PRON -- pronouns | ADJ -- adjectives | ADV -- adverbs | ADP -- adpositions (prepositions and postpositions) | CONJ -- conjunctions | DET -- determiners | NUM -- cardinal numbers | PRT -- particles or other function words | X -- other: foreign words, typos, abbreviations | PUNCT -- punctuation | UnkCg -- unknown deriving (Show, Eq, Generic) -------------------------------------------------------------------------------- instance Csv.FromRecord (SnConllToken T.Text) where parseRecord v = do a0 <- v .! 0 a1 <- v .! 1 a2 <- v .! 2 a3 <- ( parsePosCf <$> v .! 3) a4 <- ( parsePosFg <$> v .! 4) a5 <- v .! 5 a6 <- v .! 6 a7 <- ( parseGER <$> v .! 7) a8 <- v .! 8 a9 <- v .! 9 return (ConllToken a0 a1 a2 a3 a4 a5 a6 a7 a8 a9) -- | Converting coarse-grained textual types int -- ADT representation parsePosCf :: T.Text -> PosCG parsePosCf s = case (map toUpper $ T.unpack s) of "VERB" -> VERB "NOUN" -> NOUN "PRON" -> PRON "ADJ" -> ADJ "ADV" -> ADV "ADP" -> ADP "CONJ" -> CONJ "DET" -> DET "NUM" -> NUM "PRT" -> PRT "X" -> X "." -> PUNCT otherwise -> UnkCg -- | Converting fine-grained textual types into -- ADT representation parsePosFg :: T.Text -> POS parsePosFg s = case (map toUpper $ T.unpack s) of "CC" -> CC "CD" -> CD "DT" -> DT "EX" -> EX "FW" -> FW "IN" -> IN "JJ" -> JJ "JJR" -> JJR "JJS" -> JJS "LS" -> LS "MD" -> MD "NN" -> NN "NNS" -> NNS "NNP" -> NNP "NNPS"-> NNPS "PDT" -> PDT "POS" -> POS "PRP" -> PRP "PRP$"-> fromJust $ fromLabelText "RPR$" "RB" -> RB "RBR" -> RBR "RBS" -> RBS "RP" -> RP "SYM" -> SYM "TO" -> TO "UH" -> UH "VB" -> VB "VBD" -> VBD "VBG" -> VBG "VBN" -> VBN "VBP" -> VBP "VBZ" -> VBZ "WDT" -> WDT "WP" -> WP -- "WPS" -> WPS "WRB" -> WRB otherwise -> CC -- haskell-conll doesn't support fallback parseGER :: T.Text -> REL parseGER s = case s of "acl" -> Acl "acl:relcl" -> fromJust $ fromLabelText "acl:relcl" "advck" -> Advcl "advmod" -> Advmod "amod" -> Amod "appos" -> Appos "aux" -> Aux "auxpass" -> Auxpass "case" -> Case "cc" -> Cc "cc:preconj" -> fromJust $ fromLabelText "cc:preconj" "ccomp" -> Ccomp "compound" -> Compound "compound:prt"-> fromJust $ fromLabelText "compound:prt" "conj" -> Conj "cop" -> Cop "csubj" -> Csubj "csubjpass" -> Csubjpass "dep" -> Dep "det" -> Det "det:predet" -> fromJust $ fromLabelText "det:predet" -- haskell-conll have inconsistency in naming, why use _ whee you have camelcase "discource" -> Discourse "discolated" -> Dislocated "dobj" -> Dobj "expl" -> Expl "fixed" -> fromJust $ fromLabelText "fixed" "fixed:not" -> fromJust $ fromLabelText "fixed:not" -- haskell-conll doesn't support this one "flat" -> Flat "foreign" -> Foreign "goeswith" -> Goeswith "iobj" -> Iobj "list" -> List "mark" -> Mark "neg" -> Neg "nmod" -> Nmod "nmod:npmod" -> fromJust $ fromLabelText "nmod:npmod" "nmod:poss" -> fromJust $ fromLabelText "nmod:poss" "nmod:tmod" -> fromJust $ fromLabelText "nmod:tmod" "nsubj" -> Nsubj "nsubjpass" -> Nsubjpass "nummod" -> Nummod "orphan" -> Orphan "parataxis" -> Parataxis "punct" -> Punct "reparandum" -> Reparandum "ROOT" -> ROOT "vocatile" -> Vocative "xcomp" -> Xcomp otherwise -> Punct -- haskell-conll doesn't support uknown type, like UnkGer, so fall back ro Punct