{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE QuasiQuotes #-}

import qualified Data.Aeson as J
import qualified Data.ByteString.Lazy as BSL
import Data.Either (isRight)
import qualified Data.Text.Encoding as TE
import Data.Text
import Test.Tasty
import Test.Tasty.HUnit
import Text.RawString.QQ (r)

import Text.CoreNLP.Types (Document)

-- | Entry point: hands the test tree to tasty's 'defaultMain'.
main :: IO ()
main = defaultMain tests

-- | The test tree: one test case per embedded JSON sample.
tests :: TestTree
tests = testGroup "basic" [testCorenlp454, testHeadlines]

-- | The 'corenlp454' sample must decode into a 'Document'.
testCorenlp454 :: TestTree
testCorenlp454 = testCase "test corenlp 4.5.4 output" $ assertParses corenlp454

-- | The 'headlines' sample must decode into a 'Document'.
testHeadlines :: TestTree
testHeadlines = testCase "test headlines" $ assertParses headlines

-- | Succeed when the input decodes via 'parseJsonDoc'; otherwise fail the
-- test case with the decoder's error message.
--
-- The message is passed to 'assertFailure' as-is (not through 'show'), so it
-- is reported without extra quoting or escaping.
assertParses :: Text -> Assertion
assertParses = either assertFailure (const (pure ())) . parseJsonDoc

-- | Parse JSON output of CoreNLP. See 'headlines' source for an example JSON input.
--
-- The text is UTF-8 encoded and handed to aeson's 'J.eitherDecode'; the
-- result is @Left@ with the decoder's error message when decoding fails.
parseJsonDoc :: Text -> Either String Document
parseJsonDoc = J.eitherDecode . BSL.fromStrict . TE.encodeUtf8

--- DATA

-- | Output of this for CoreNLP 4.5.4
-- ```
-- curl -XPOST 'http://localhost:9000/?properties=%7B%22annotators%22:%20%22tokenize,ssplit,pos,ner%22,%20%22outputFormat%22:%20%22json%22%7D' -d 'Below is a quick snippet of code that demonstrates running a full pipeline on some sample text.'
-- ```
--
-- The request above enables only the @tokenize,ssplit,pos,ner@ annotators.
-- The response below holds a single sentence with 18 tokens and an empty
-- @entitymentions@ list.
corenlp454 :: Text
corenlp454 = [r| { "sentences": [ { "index": 0, "entitymentions": [ ], "tokens": [
{ "index": 1, "word": "Below", "originalText": "Below", "lemma": "below", "characterOffsetBegin": 0, "characterOffsetEnd": 5, "pos": "RB", "ner": "O", "before": "", "after": " " },
{ "index": 2, "word": "is", "originalText": "is", "lemma": "be", "characterOffsetBegin": 6, "characterOffsetEnd": 8, "pos": "VBZ", "ner": "O", "before": " ", "after": " " },
{ "index": 3, "word": "a", "originalText": "a", "lemma": "a", "characterOffsetBegin": 9, "characterOffsetEnd": 10, "pos": "DT", "ner": "O", "before": " ", "after": " " },
{ "index": 4, "word": "quick", "originalText": "quick", "lemma": "quick", "characterOffsetBegin": 11, "characterOffsetEnd": 16, "pos": "JJ", "ner": "O", "before": " ", "after": " " },
{ "index": 5, "word": "snippet", "originalText": "snippet", "lemma": "snippet", "characterOffsetBegin": 17, "characterOffsetEnd": 24, "pos": "NN", "ner": "O", "before": " ", "after": " " },
{ "index": 6, "word": "of", "originalText": "of", "lemma": "of", "characterOffsetBegin": 25, "characterOffsetEnd": 27, "pos": "IN", "ner": "O", "before": " ", "after": " " },
{ "index": 7, "word": "code", "originalText": "code", "lemma": "code", "characterOffsetBegin": 28, "characterOffsetEnd": 32, "pos": "NN", "ner": "O", "before": " ", "after": " " },
{ "index": 8, "word": "that", "originalText": "that", "lemma": "that", "characterOffsetBegin": 33, "characterOffsetEnd": 37, "pos": "WDT", "ner": "O", "before": " ", "after": " " },
{ "index": 9, "word": "demonstrates", "originalText": "demonstrates", "lemma": "demonstrate", "characterOffsetBegin": 38, "characterOffsetEnd": 50, "pos": "VBZ", "ner": "O", "before": " ", "after": " " },
{ "index": 10, "word": "running", "originalText": "running", "lemma": "run", "characterOffsetBegin": 51, "characterOffsetEnd": 58, "pos": "VBG", "ner": "O", "before": " ", "after": " " },
{ "index": 11, "word": "a", "originalText": "a", "lemma": "a", "characterOffsetBegin": 59, "characterOffsetEnd": 60, "pos": "DT", "ner": "O", "before": " ", "after": " " },
{ "index": 12, "word": "full", "originalText": "full", "lemma": "full", "characterOffsetBegin": 61, "characterOffsetEnd": 65, "pos": "JJ", "ner": "O", "before": " ", "after": " " },
{ "index": 13, "word": "pipeline", "originalText": "pipeline", "lemma": "pipeline", "characterOffsetBegin": 66, "characterOffsetEnd": 74, "pos": "NN", "ner": "O", "before": " ", "after": " " },
{ "index": 14, "word": "on", "originalText": "on", "lemma": "on", "characterOffsetBegin": 75, "characterOffsetEnd": 77, "pos": "IN", "ner": "O", "before": " ", "after": " " },
{ "index": 15, "word": "some", "originalText": "some", "lemma": "some", "characterOffsetBegin": 78, "characterOffsetEnd": 82, "pos": "DT", "ner": "O", "before": " ", "after": " " },
{ "index": 16, "word": "sample", "originalText": "sample", "lemma": "sample", "characterOffsetBegin": 83, "characterOffsetEnd": 89, "pos": "NN", "ner": "O", "before": " ", "after": " " },
{ "index": 17, "word": "text", "originalText": "text", "lemma": "text", "characterOffsetBegin": 90, "characterOffsetEnd": 94, "pos": "NN", "ner": "O", "before": " ", "after": "" },
{ "index": 18, "word": ".", "originalText": ".", "lemma": ".", "characterOffsetBegin": 94, "characterOffsetEnd": 95, "pos": ".", "ner": "O", "before": "", "after": "" }
] } ] } |]

-- | Copied from corenlp-parser tests
--
-- Sample CoreNLP JSON output for the document @headlines.txt@. Unlike
-- 'corenlp454' it also carries a constituency @parse@, three dependency
-- arrays (@basicDependencies@, @enhancedDependencies@,
-- @enhancedPlusPlusDependencies@), populated @entitymentions@, a per-token
-- @speaker@ field and a top-level @corefs@ map.
headlines :: Text
headlines = [r| { "docId": "headlines.txt", "sentences": [ { "index": 0,
"parse": "(ROOT\n (S\n (NP\n (NP (NNP Jersey) (NNP Shore) (NN Season) (CD 6) (NN cast) (POS 's))\n (NNS salaries))\n (VP\n (VP (VBD revealed))\n (: ;)\n (NP\n (NP (JJR More))\n (PP (IN than)\n (NP (NNP President) (NNP Obama)))))\n (. !)))",
"basicDependencies": [
{ "dep": "ROOT", "governor": 0, "governorGloss": "ROOT", "dependent": 8, "dependentGloss": "revealed" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 1, "dependentGloss": "Jersey" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 2, "dependentGloss": "Shore" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 3, "dependentGloss": "Season" },
{ "dep": "nummod", "governor": 5, "governorGloss": "cast", "dependent": 4, "dependentGloss": "6" },
{ "dep": "nmod:poss", "governor": 7, "governorGloss": "salaries", "dependent": 5, "dependentGloss": "cast" },
{ "dep": "case", "governor": 5, "governorGloss": "cast", "dependent": 6, "dependentGloss": "'s" },
{ "dep": "nsubj", "governor": 8, "governorGloss": "revealed", "dependent": 7, "dependentGloss": "salaries" },
{ "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 9, "dependentGloss": ";" },
{ "dep": "dobj", "governor": 8, "governorGloss": "revealed", "dependent": 10, "dependentGloss": "More" },
{ "dep": "case", "governor": 13, "governorGloss": "Obama", "dependent": 11, "dependentGloss": "than" },
{ "dep": "compound", "governor": 13, "governorGloss": "Obama", "dependent": 12, "dependentGloss": "President" },
{ "dep": "nmod", "governor": 10, "governorGloss": "More", "dependent": 13, "dependentGloss": "Obama" },
{ "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 14, "dependentGloss": "!" }
], "enhancedDependencies": [
{ "dep": "ROOT", "governor": 0, "governorGloss": "ROOT", "dependent": 8, "dependentGloss": "revealed" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 1, "dependentGloss": "Jersey" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 2, "dependentGloss": "Shore" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 3, "dependentGloss": "Season" },
{ "dep": "nummod", "governor": 5, "governorGloss": "cast", "dependent": 4, "dependentGloss": "6" },
{ "dep": "nmod:poss", "governor": 7, "governorGloss": "salaries", "dependent": 5, "dependentGloss": "cast" },
{ "dep": "case", "governor": 5, "governorGloss": "cast", "dependent": 6, "dependentGloss": "'s" },
{ "dep": "nsubj", "governor": 8, "governorGloss": "revealed", "dependent": 7, "dependentGloss": "salaries" },
{ "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 9, "dependentGloss": ";" },
{ "dep": "dobj", "governor": 8, "governorGloss": "revealed", "dependent": 10, "dependentGloss": "More" },
{ "dep": "case", "governor": 13, "governorGloss": "Obama", "dependent": 11, "dependentGloss": "than" },
{ "dep": "compound", "governor": 13, "governorGloss": "Obama", "dependent": 12, "dependentGloss": "President" },
{ "dep": "nmod:than", "governor": 10, "governorGloss": "More", "dependent": 13, "dependentGloss": "Obama" },
{ "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 14, "dependentGloss": "!" }
], "enhancedPlusPlusDependencies": [
{ "dep": "ROOT", "governor": 0, "governorGloss": "ROOT", "dependent": 8, "dependentGloss": "revealed" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 1, "dependentGloss": "Jersey" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 2, "dependentGloss": "Shore" },
{ "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 3, "dependentGloss": "Season" },
{ "dep": "nummod", "governor": 5, "governorGloss": "cast", "dependent": 4, "dependentGloss": "6" },
{ "dep": "nmod:poss", "governor": 7, "governorGloss": "salaries", "dependent": 5, "dependentGloss": "cast" },
{ "dep": "case", "governor": 5, "governorGloss": "cast", "dependent": 6, "dependentGloss": "'s" },
{ "dep": "nsubj", "governor": 8, "governorGloss": "revealed", "dependent": 7, "dependentGloss": "salaries" },
{ "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 9, "dependentGloss": ";" },
{ "dep": "dobj", "governor": 8, "governorGloss": "revealed", "dependent": 10, "dependentGloss": "More" },
{ "dep": "case", "governor": 13, "governorGloss": "Obama", "dependent": 11, "dependentGloss": "than" },
{ "dep": "compound", "governor": 13, "governorGloss": "Obama", "dependent": 12, "dependentGloss": "President" },
{ "dep": "nmod:than", "governor": 10, "governorGloss": "More", "dependent": 13, "dependentGloss": "Obama" },
{ "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 14, "dependentGloss": "!" }
], "entitymentions": [
{ "docTokenBegin": 3, "docTokenEnd": 4, "tokenBegin": 3, "tokenEnd": 4, "text": "6", "characterOffsetBegin": 20, "characterOffsetEnd": 21, "ner": "NUMBER", "normalizedNER": "6.0" },
{ "docTokenBegin": 11, "docTokenEnd": 12, "tokenBegin": 11, "tokenEnd": 12, "text": "President", "characterOffsetBegin": 58, "characterOffsetEnd": 67, "ner": "TITLE" },
{ "docTokenBegin": 12, "docTokenEnd": 13, "tokenBegin": 12, "tokenEnd": 13, "text": "Obama", "characterOffsetBegin": 68, "characterOffsetEnd": 73, "ner": "PERSON" }
], "tokens": [
{ "index": 1, "word": "Jersey", "originalText": "Jersey", "lemma": "Jersey", "characterOffsetBegin": 0, "characterOffsetEnd": 6, "pos": "NNP", "ner": "O", "speaker": "PER0", "before": "", "after": " " },
{ "index": 2, "word": "Shore", "originalText": "Shore", "lemma": "Shore", "characterOffsetBegin": 7, "characterOffsetEnd": 12, "pos": "NNP", "ner": "O", "speaker": "PER0", "before": " ", "after": " " },
{ "index": 3, "word": "Season", "originalText": "Season", "lemma": "season", "characterOffsetBegin": 13, "characterOffsetEnd": 19, "pos": "NN", "ner": "O", "speaker": "PER0", "before": " ", "after": " " },
{ "index": 4, "word": "6", "originalText": "6", "lemma": "6", "characterOffsetBegin": 20, "characterOffsetEnd": 21, "pos": "CD", "ner": "NUMBER", "normalizedNER": "6.0", "speaker": "PER0", "before": " ", "after": " " },
{ "index": 5, "word": "cast", "originalText": "cast", "lemma": "cast", "characterOffsetBegin": 22, "characterOffsetEnd": 26, "pos": "NN", "ner": "O", "speaker": "PER0", "before": " ", "after": "" },
{ "index": 6, "word": "'s", "originalText": "'s", "lemma": "'s", "characterOffsetBegin": 26, "characterOffsetEnd": 28, "pos": "POS", "ner": "O", "speaker": "PER0", "before": "", "after": " " },
{ "index": 7, "word": "salaries", "originalText": "salaries", "lemma": "salary", "characterOffsetBegin": 29, "characterOffsetEnd": 37, "pos": "NNS", "ner": "O", "speaker": "PER0", "before": " ", "after": " " },
{ "index": 8, "word": "revealed", "originalText": "revealed", "lemma": "reveal", "characterOffsetBegin": 38, "characterOffsetEnd": 46, "pos": "VBD", "ner": "O", "speaker": "PER0", "before": " ", "after": "" },
{ "index": 9, "word": ";", "originalText": ";", "lemma": ";", "characterOffsetBegin": 46, "characterOffsetEnd": 47, "pos": ":", "ner": "O", "speaker": "PER0", "before": "", "after": " " },
{ "index": 10, "word": "More", "originalText": "More", "lemma": "more", "characterOffsetBegin": 48, "characterOffsetEnd": 52, "pos": "JJR", "ner": "O", "speaker": "PER0", "before": " ", "after": " " },
{ "index": 11, "word": "than", "originalText": "than", "lemma": "than", "characterOffsetBegin": 53, "characterOffsetEnd": 57, "pos": "IN", "ner": "O", "speaker": "PER0", "before": " ", "after": " " },
{ "index": 12, "word": "President", "originalText": "President", "lemma": "President", "characterOffsetBegin": 58, "characterOffsetEnd": 67, "pos": "NNP", "ner": "TITLE", "speaker": "PER0", "before": " ", "after": " " },
{ "index": 13, "word": "Obama", "originalText": "Obama", "lemma": "Obama", "characterOffsetBegin": 68, "characterOffsetEnd": 73, "pos": "NNP", "ner": "PERSON", "speaker": "PER0", "before": " ", "after": "" },
{ "index": 14, "word": "!", "originalText": "!", "lemma": "!", "characterOffsetBegin": 73, "characterOffsetEnd": 74, "pos": ".", "ner": "O", "speaker": "PER0", "before": "", "after": "\n" }
] } ],
"corefs": {
"1": [ { "id": 1, "text": "6", "type": "PROPER", "number": "SINGULAR", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 4, "endIndex": 5, "headIndex": 4, "sentNum": 1, "position": [ 1, 3 ], "isRepresentativeMention": true } ],
"2": [ { "id": 2, "text": "Jersey Shore Season 6 cast 's salaries", "type": "NOMINAL", "number": "PLURAL", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 1, "endIndex": 8, "headIndex": 7, "sentNum": 1, "position": [ 1, 1 ], "isRepresentativeMention": true } ],
"3": [ { "id": 3, "text": "Jersey Shore Season 6 cast 's", "type": "PROPER", "number": "SINGULAR", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 1, "endIndex": 7, "headIndex": 2, "sentNum": 1, "position": [ 1, 2 ], "isRepresentativeMention": true } ],
"4": [ { "id": 4, "text": "More than President Obama", "type": "NOMINAL", "number": "UNKNOWN", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 10, "endIndex": 14, "headIndex": 10, "sentNum": 1, "position": [ 1, 4 ], "isRepresentativeMention": true } ],
"5": [ { "id": 5, "text": "President Obama", "type": "PROPER", "number": "SINGULAR", "gender": "MALE", "animacy": "ANIMATE", "startIndex": 12, "endIndex": 14, "headIndex": 13, "sentNum": 1, "position": [ 1, 5 ], "isRepresentativeMention": true } ]
} } |]