module NLP.CoreNLP
( launchCoreNLP
, parseJsonDoc
, Dependency(..)
, Entitymention(..)
, Token(..)
, Sentence(..)
, Coref(..)
, CorefsId
, Corefs
, Document(..)
, test
) where
import Control.Monad (forM, when)
import qualified Data.Aeson as J
import Data.Aeson (FromJSON, ToJSON)
import Data.HashMap.Strict (HashMap)
import Data.Semigroup
import Data.String.Class as S
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import GHC.Generics (Generic)
import System.Exit
import System.IO (hFlush)
import System.IO.Temp
import System.Process
import Text.RawString.QQ (r)
-- | Aeson options shared by every 'FromJSON' / 'ToJSON' instance in this
-- module.
--
-- Record field names are used as JSON keys unchanged, with one exception:
-- the field @type_@ is written and read as the key @"type"@.
jsonOpts :: J.Options
jsonOpts = J.defaultOptions {J.fieldLabelModifier = toJsonKey}
  where
    -- Map a Haskell record field name to the JSON key it is stored under.
    toJsonKey :: String -> String
    toJsonKey field
      | field == "type_" = "type"
      | otherwise = field
-- | One entry of a sentence's @basicDependencies@, @enhancedDependencies@ or
-- @enhancedPlusPlusDependencies@ array.
data Dependency = Dependency
  { dep :: Text
    -- ^ Relation label; the bundled sample contains e.g. @"nsubj"@ and @"ROOT"@.
  , governor :: Int
    -- ^ Number of the governing token; the sample uses @0@ on its @"ROOT"@ entry.
  , governorGloss :: Text
    -- ^ Text of the governing token (@"ROOT"@ on the sample's root entry).
  , dependent :: Int
    -- ^ Number of the dependent token.
  , dependentGloss :: Text
    -- ^ Text of the dependent token.
  } deriving (Eq, Show, Generic)

-- | Decoded generically, with keys chosen by 'jsonOpts'.
instance FromJSON Dependency where
  parseJSON = J.genericParseJSON jsonOpts

-- | Encoded generically, with keys chosen by 'jsonOpts'.
instance ToJSON Dependency where
  toJSON = J.genericToJSON jsonOpts
-- | One entry of a sentence's @entitymentions@ array.
data Entitymention = Entitymention
  { docTokenBegin :: Int
    -- ^ Value of the @docTokenBegin@ key.
  , docTokenEnd :: Int
    -- ^ Value of the @docTokenEnd@ key.
  , tokenBegin :: Int
    -- ^ Value of the @tokenBegin@ key.
  , tokenEnd :: Int
    -- ^ Value of the @tokenEnd@ key.
  , text :: Text
    -- ^ Text of the mention, e.g. @"President"@ in the bundled sample.
  , characterOffsetBegin :: Int
    -- ^ Value of the @characterOffsetBegin@ key.
  , characterOffsetEnd :: Int
    -- ^ Value of the @characterOffsetEnd@ key.
  , ner :: Text
    -- ^ Entity label, e.g. @"NUMBER"@, @"TITLE"@ or @"PERSON"@ in the sample.
  , normalizedNER :: Maybe Text
    -- ^ Optional: only one of the sample's three mentions carries this key
    -- (@"6.0"@ for the mention @"6"@).
  } deriving (Eq, Show, Generic)

-- | Decoded generically, with keys chosen by 'jsonOpts'.
instance FromJSON Entitymention where
  parseJSON = J.genericParseJSON jsonOpts

-- | Encoded generically, with keys chosen by 'jsonOpts'.
instance ToJSON Entitymention where
  toJSON = J.genericToJSON jsonOpts
-- | One entry of a sentence's @tokens@ array.
--
-- Only the keys listed here are captured. The bundled sample also has a
-- @normalizedNER@ key on one token, for which this record has no field.
data Token = Token
  { index :: Int
    -- ^ Token number; the sample's tokens are numbered from @1@.
  , word :: Text
    -- ^ Value of the @word@ key.
  , originalText :: Text
    -- ^ Value of the @originalText@ key.
  , lemma :: Text
    -- ^ Lemma, e.g. @"salary"@ for the word @"salaries"@ in the sample.
  , characterOffsetBegin :: Int
    -- ^ Value of the @characterOffsetBegin@ key.
  , characterOffsetEnd :: Int
    -- ^ Value of the @characterOffsetEnd@ key.
  , pos :: Text
    -- ^ Part-of-speech tag, e.g. @"NNP"@ or @"VBD"@ in the sample.
  , ner :: Text
    -- ^ Entity label; @"O"@ on most of the sample's tokens.
  , speaker :: Text
    -- ^ Value of the @speaker@ key (@"PER0"@ throughout the sample).
  , before :: Text
    -- ^ Value of the @before@ key (whitespace or empty in the sample).
  , after :: Text
    -- ^ Value of the @after@ key (whitespace or empty in the sample).
  } deriving (Eq, Show, Generic)

-- | Decoded generically, with keys chosen by 'jsonOpts'.
instance FromJSON Token where
  parseJSON = J.genericParseJSON jsonOpts

-- | Encoded generically, with keys chosen by 'jsonOpts'.
instance ToJSON Token where
  toJSON = J.genericToJSON jsonOpts
-- | One entry of a document's @sentences@ array.
data Sentence = Sentence
  { index :: Int
    -- ^ Sentence number; the sample's only sentence has index @0@.
  , parse :: Text
    -- ^ Bracketed parse tree as a single string, e.g. @"(ROOT\\n (S ..."@.
  , basicDependencies :: [Dependency]
    -- ^ Entries of the @basicDependencies@ array.
  , enhancedDependencies :: [Dependency]
    -- ^ Entries of the @enhancedDependencies@ array.
  , enhancedPlusPlusDependencies :: [Dependency]
    -- ^ Entries of the @enhancedPlusPlusDependencies@ array.
  , entitymentions :: [Entitymention]
    -- ^ Entries of the @entitymentions@ array.
  , tokens :: [Token]
    -- ^ Entries of the @tokens@ array.
  } deriving (Eq, Show, Generic)

-- | Decoded generically, with keys chosen by 'jsonOpts'.
instance FromJSON Sentence where
  parseJSON = J.genericParseJSON jsonOpts

-- | Encoded generically, with keys chosen by 'jsonOpts'.
instance ToJSON Sentence where
  toJSON = J.genericToJSON jsonOpts
-- | One mention inside a coreference chain, i.e. one element of a list stored
-- in the document's @corefs@ object.
data Coref = Coref
  { id :: Int
    -- ^ Value of the @id@ key.
  , text :: Text
    -- ^ Text of the mention, e.g. @"President Obama"@ in the bundled sample.
  , type_ :: Text
    -- ^ Stored under the JSON key @"type"@ (see 'jsonOpts'); the sample has
    -- @"PROPER"@ and @"NOMINAL"@.
  , number :: Text
    -- ^ E.g. @"SINGULAR"@, @"PLURAL"@ or @"UNKNOWN"@ in the sample.
  , gender :: Text
    -- ^ E.g. @"MALE"@ or @"UNKNOWN"@ in the sample.
  , animacy :: Text
    -- ^ E.g. @"ANIMATE"@ or @"INANIMATE"@ in the sample.
  , startIndex :: Int
    -- ^ Value of the @startIndex@ key.
  , endIndex :: Int
    -- ^ Value of the @endIndex@ key.
  , headIndex :: Int
    -- ^ Value of the @headIndex@ key.
  , sentNum :: Int
    -- ^ Value of the @sentNum@ key (@1@ for the sample's only sentence).
  , position :: [Int]
    -- ^ Value of the @position@ key; a two-element array in the sample.
  , isRepresentativeMention :: Bool
    -- ^ Value of the @isRepresentativeMention@ key.
  } deriving (Eq, Show, Generic)

-- | Decoded generically, with keys chosen by 'jsonOpts'.
instance FromJSON Coref where
  parseJSON = J.genericParseJSON jsonOpts

-- | Encoded generically, with keys chosen by 'jsonOpts'.
instance ToJSON Coref where
  toJSON = J.genericToJSON jsonOpts
-- | Key of one entry in the @corefs@ JSON object (e.g. @"1"@ .. @"5"@ in the
-- bundled sample).
type CorefsId = Text
-- | The whole @corefs@ JSON object: each key maps to a list of 'Coref'
-- mentions.
type Corefs = HashMap CorefsId [Coref]
-- | Top-level JSON object describing one annotated document.
data Document = Document
  { docId :: Text
    -- ^ Value of the @docId@ key (@"headlines.txt"@ in the bundled sample).
  , sentences :: [Sentence]
    -- ^ Entries of the @sentences@ array.
  , corefs :: Corefs
    -- ^ The @corefs@ object, keyed by 'CorefsId'.
  } deriving (Eq, Show, Generic)

-- | Decoded generically, with keys chosen by 'jsonOpts'.
instance FromJSON Document where
  parseJSON = J.genericParseJSON jsonOpts

-- | Encoded generically, with keys chosen by 'jsonOpts'.
instance ToJSON Document where
  toJSON = J.genericToJSON jsonOpts
-- | Decode a JSON document held in a 'Text' value.
--
-- Returns @Left@ with aeson's error message when the input cannot be decoded
-- as a 'Document', and @Right@ with the document otherwise.
parseJsonDoc :: Text -> Either String Document
parseJsonDoc rawJson = J.eitherDecode (S.fromText rawJson)
-- | Annotate a batch of texts with Stanford CoreNLP and decode the JSON it
-- produces.
--
-- Each text is written to its own file (@text-1.txt@, @text-2.txt@, ...) in a
-- fresh temporary directory. The paths are listed in a temporary file handed
-- to CoreNLP via @-filelist@, and CoreNLP is told to write its @.json@ output
-- into the same temporary directory. The outputs are read back and decoded
-- with 'parseJsonDoc' before the directory is removed, so nothing is left
-- behind in the caller's working directory.
--
-- Progress information is printed to stdout. If the @java@ process exits
-- with a non-zero code, 'error' is called with the shown exit code. An empty
-- input list returns @[]@ without starting a JVM.
--
-- NOTE(review): @--add-modules java.se.ee@ is presumably accepted only by
-- some JDK releases — confirm against the JVM this is deployed with.
launchCoreNLP ::
     FilePath
     -- ^ Directory containing the CoreNLP jars. It is used verbatim as the
     -- prefix of the @-cp <dir>*@ classpath wildcard, so it should end with
     -- a path separator.
  -> [Text]
     -- ^ Texts to annotate; each one becomes its own input document.
  -> IO [Either String Document]
     -- ^ One decoding result per input text, in input order.
-- Nothing to annotate: skip the JVM launch entirely.
launchCoreNLP _ [] = return []
launchCoreNLP fp texts =
  withSystemTempDirectory "corenlp-parser" $ \tempDir -> do
    let inTempDir name = tempDir ++ "/" ++ name
    Prelude.putStrLn $ "Temp dir used is: " <> tempDir
    tmpFileNames <-
      forM (zip [1 :: Integer ..] texts) $ \(i, txt) -> do
        let fname = "text-" ++ show i ++ ".txt"
        T.writeFile (inTempDir fname) txt
        return fname
    withSystemTempFile "filelist.txt" $ \filelistTxt hfilelistTxt -> do
      Prelude.putStrLn $ "Filelist.txt: " ++ show filelistTxt
      Prelude.putStrLn $ "Temporary files: " ++ show tmpFileNames
      let filesList = T.unlines (map (S.toText . inTempDir) tmpFileNames)
      Prelude.putStrLn $ "Filelist.txt filesList: " ++ show filesList
      T.hPutStrLn hfilelistTxt (T.strip filesList)
      -- The JVM reads the file list by path, so it must be on disk before
      -- the process starts.
      hFlush hfilelistTxt
      -- An argument vector (rather than a shell command line) keeps quotes,
      -- spaces or shell metacharacters in the paths from being interpreted.
      let spec =
            proc
              "java"
              [ "--add-modules"
              , "java.se.ee"
              , "-cp"
              , fp ++ "*"
              , "-Xmx2g"
              , "edu.stanford.nlp.pipeline.StanfordCoreNLP"
              , "-annotators"
              , "tokenize,ssplit,pos,lemma,ner,parse,dcoref"
              , "-outputFormat"
              , "json"
              -- Keep the generated .json files next to the inputs instead of
              -- in the current working directory.
              , "-outputDirectory"
              , tempDir
              , "-filelist"
              , filelistTxt
              ]
      (_, _, _, processHandle) <- createProcess spec
      code <- waitForProcess processHandle
      when (code /= ExitSuccess) (error (show code))
      -- CoreNLP names each output after its input file plus ".json".
      results <-
        forM tmpFileNames $ \fname -> T.readFile (inTempDir (fname <> ".json"))
      return (map parseJsonDoc results)
-- | Sample JSON document (docId @headlines.txt@) embedded as a raw string:
-- one sentence with its three dependency arrays, three entity mentions,
-- fourteen tokens, and five @corefs@ entries. It is the input that 'test'
-- feeds to 'parseJsonDoc'.
headlines :: Text
headlines =
S.toText $
[r|
{
"docId": "headlines.txt",
"sentences": [
{
"index": 0,
"parse": "(ROOT\n (S\n (NP\n (NP (NNP Jersey) (NNP Shore) (NN Season) (CD 6) (NN cast) (POS 's))\n (NNS salaries))\n (VP\n (VP (VBD revealed))\n (: ;)\n (NP\n (NP (JJR More))\n (PP (IN than)\n (NP (NNP President) (NNP Obama)))))\n (. !)))",
"basicDependencies": [
{
"dep": "ROOT",
"governor": 0,
"governorGloss": "ROOT",
"dependent": 8,
"dependentGloss": "revealed"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 1,
"dependentGloss": "Jersey"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 2,
"dependentGloss": "Shore"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 3,
"dependentGloss": "Season"
},
{
"dep": "nummod",
"governor": 5,
"governorGloss": "cast",
"dependent": 4,
"dependentGloss": "6"
},
{
"dep": "nmod:poss",
"governor": 7,
"governorGloss": "salaries",
"dependent": 5,
"dependentGloss": "cast"
},
{
"dep": "case",
"governor": 5,
"governorGloss": "cast",
"dependent": 6,
"dependentGloss": "'s"
},
{
"dep": "nsubj",
"governor": 8,
"governorGloss": "revealed",
"dependent": 7,
"dependentGloss": "salaries"
},
{
"dep": "punct",
"governor": 8,
"governorGloss": "revealed",
"dependent": 9,
"dependentGloss": ";"
},
{
"dep": "dobj",
"governor": 8,
"governorGloss": "revealed",
"dependent": 10,
"dependentGloss": "More"
},
{
"dep": "case",
"governor": 13,
"governorGloss": "Obama",
"dependent": 11,
"dependentGloss": "than"
},
{
"dep": "compound",
"governor": 13,
"governorGloss": "Obama",
"dependent": 12,
"dependentGloss": "President"
},
{
"dep": "nmod",
"governor": 10,
"governorGloss": "More",
"dependent": 13,
"dependentGloss": "Obama"
},
{
"dep": "punct",
"governor": 8,
"governorGloss": "revealed",
"dependent": 14,
"dependentGloss": "!"
}
],
"enhancedDependencies": [
{
"dep": "ROOT",
"governor": 0,
"governorGloss": "ROOT",
"dependent": 8,
"dependentGloss": "revealed"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 1,
"dependentGloss": "Jersey"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 2,
"dependentGloss": "Shore"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 3,
"dependentGloss": "Season"
},
{
"dep": "nummod",
"governor": 5,
"governorGloss": "cast",
"dependent": 4,
"dependentGloss": "6"
},
{
"dep": "nmod:poss",
"governor": 7,
"governorGloss": "salaries",
"dependent": 5,
"dependentGloss": "cast"
},
{
"dep": "case",
"governor": 5,
"governorGloss": "cast",
"dependent": 6,
"dependentGloss": "'s"
},
{
"dep": "nsubj",
"governor": 8,
"governorGloss": "revealed",
"dependent": 7,
"dependentGloss": "salaries"
},
{
"dep": "punct",
"governor": 8,
"governorGloss": "revealed",
"dependent": 9,
"dependentGloss": ";"
},
{
"dep": "dobj",
"governor": 8,
"governorGloss": "revealed",
"dependent": 10,
"dependentGloss": "More"
},
{
"dep": "case",
"governor": 13,
"governorGloss": "Obama",
"dependent": 11,
"dependentGloss": "than"
},
{
"dep": "compound",
"governor": 13,
"governorGloss": "Obama",
"dependent": 12,
"dependentGloss": "President"
},
{
"dep": "nmod:than",
"governor": 10,
"governorGloss": "More",
"dependent": 13,
"dependentGloss": "Obama"
},
{
"dep": "punct",
"governor": 8,
"governorGloss": "revealed",
"dependent": 14,
"dependentGloss": "!"
}
],
"enhancedPlusPlusDependencies": [
{
"dep": "ROOT",
"governor": 0,
"governorGloss": "ROOT",
"dependent": 8,
"dependentGloss": "revealed"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 1,
"dependentGloss": "Jersey"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 2,
"dependentGloss": "Shore"
},
{
"dep": "compound",
"governor": 5,
"governorGloss": "cast",
"dependent": 3,
"dependentGloss": "Season"
},
{
"dep": "nummod",
"governor": 5,
"governorGloss": "cast",
"dependent": 4,
"dependentGloss": "6"
},
{
"dep": "nmod:poss",
"governor": 7,
"governorGloss": "salaries",
"dependent": 5,
"dependentGloss": "cast"
},
{
"dep": "case",
"governor": 5,
"governorGloss": "cast",
"dependent": 6,
"dependentGloss": "'s"
},
{
"dep": "nsubj",
"governor": 8,
"governorGloss": "revealed",
"dependent": 7,
"dependentGloss": "salaries"
},
{
"dep": "punct",
"governor": 8,
"governorGloss": "revealed",
"dependent": 9,
"dependentGloss": ";"
},
{
"dep": "dobj",
"governor": 8,
"governorGloss": "revealed",
"dependent": 10,
"dependentGloss": "More"
},
{
"dep": "case",
"governor": 13,
"governorGloss": "Obama",
"dependent": 11,
"dependentGloss": "than"
},
{
"dep": "compound",
"governor": 13,
"governorGloss": "Obama",
"dependent": 12,
"dependentGloss": "President"
},
{
"dep": "nmod:than",
"governor": 10,
"governorGloss": "More",
"dependent": 13,
"dependentGloss": "Obama"
},
{
"dep": "punct",
"governor": 8,
"governorGloss": "revealed",
"dependent": 14,
"dependentGloss": "!"
}
],
"entitymentions": [
{
"docTokenBegin": 3,
"docTokenEnd": 4,
"tokenBegin": 3,
"tokenEnd": 4,
"text": "6",
"characterOffsetBegin": 20,
"characterOffsetEnd": 21,
"ner": "NUMBER",
"normalizedNER": "6.0"
},
{
"docTokenBegin": 11,
"docTokenEnd": 12,
"tokenBegin": 11,
"tokenEnd": 12,
"text": "President",
"characterOffsetBegin": 58,
"characterOffsetEnd": 67,
"ner": "TITLE"
},
{
"docTokenBegin": 12,
"docTokenEnd": 13,
"tokenBegin": 12,
"tokenEnd": 13,
"text": "Obama",
"characterOffsetBegin": 68,
"characterOffsetEnd": 73,
"ner": "PERSON"
}
],
"tokens": [
{
"index": 1,
"word": "Jersey",
"originalText": "Jersey",
"lemma": "Jersey",
"characterOffsetBegin": 0,
"characterOffsetEnd": 6,
"pos": "NNP",
"ner": "O",
"speaker": "PER0",
"before": "",
"after": " "
},
{
"index": 2,
"word": "Shore",
"originalText": "Shore",
"lemma": "Shore",
"characterOffsetBegin": 7,
"characterOffsetEnd": 12,
"pos": "NNP",
"ner": "O",
"speaker": "PER0",
"before": " ",
"after": " "
},
{
"index": 3,
"word": "Season",
"originalText": "Season",
"lemma": "season",
"characterOffsetBegin": 13,
"characterOffsetEnd": 19,
"pos": "NN",
"ner": "O",
"speaker": "PER0",
"before": " ",
"after": " "
},
{
"index": 4,
"word": "6",
"originalText": "6",
"lemma": "6",
"characterOffsetBegin": 20,
"characterOffsetEnd": 21,
"pos": "CD",
"ner": "NUMBER",
"normalizedNER": "6.0",
"speaker": "PER0",
"before": " ",
"after": " "
},
{
"index": 5,
"word": "cast",
"originalText": "cast",
"lemma": "cast",
"characterOffsetBegin": 22,
"characterOffsetEnd": 26,
"pos": "NN",
"ner": "O",
"speaker": "PER0",
"before": " ",
"after": ""
},
{
"index": 6,
"word": "'s",
"originalText": "'s",
"lemma": "'s",
"characterOffsetBegin": 26,
"characterOffsetEnd": 28,
"pos": "POS",
"ner": "O",
"speaker": "PER0",
"before": "",
"after": " "
},
{
"index": 7,
"word": "salaries",
"originalText": "salaries",
"lemma": "salary",
"characterOffsetBegin": 29,
"characterOffsetEnd": 37,
"pos": "NNS",
"ner": "O",
"speaker": "PER0",
"before": " ",
"after": " "
},
{
"index": 8,
"word": "revealed",
"originalText": "revealed",
"lemma": "reveal",
"characterOffsetBegin": 38,
"characterOffsetEnd": 46,
"pos": "VBD",
"ner": "O",
"speaker": "PER0",
"before": " ",
"after": ""
},
{
"index": 9,
"word": ";",
"originalText": ";",
"lemma": ";",
"characterOffsetBegin": 46,
"characterOffsetEnd": 47,
"pos": ":",
"ner": "O",
"speaker": "PER0",
"before": "",
"after": " "
},
{
"index": 10,
"word": "More",
"originalText": "More",
"lemma": "more",
"characterOffsetBegin": 48,
"characterOffsetEnd": 52,
"pos": "JJR",
"ner": "O",
"speaker": "PER0",
"before": " ",
"after": " "
},
{
"index": 11,
"word": "than",
"originalText": "than",
"lemma": "than",
"characterOffsetBegin": 53,
"characterOffsetEnd": 57,
"pos": "IN",
"ner": "O",
"speaker": "PER0",
"before": " ",
"after": " "
},
{
"index": 12,
"word": "President",
"originalText": "President",
"lemma": "President",
"characterOffsetBegin": 58,
"characterOffsetEnd": 67,
"pos": "NNP",
"ner": "TITLE",
"speaker": "PER0",
"before": " ",
"after": " "
},
{
"index": 13,
"word": "Obama",
"originalText": "Obama",
"lemma": "Obama",
"characterOffsetBegin": 68,
"characterOffsetEnd": 73,
"pos": "NNP",
"ner": "PERSON",
"speaker": "PER0",
"before": " ",
"after": ""
},
{
"index": 14,
"word": "!",
"originalText": "!",
"lemma": "!",
"characterOffsetBegin": 73,
"characterOffsetEnd": 74,
"pos": ".",
"ner": "O",
"speaker": "PER0",
"before": "",
"after": "\n"
}
]
}
],
"corefs": {
"1": [
{
"id": 1,
"text": "6",
"type": "PROPER",
"number": "SINGULAR",
"gender": "UNKNOWN",
"animacy": "INANIMATE",
"startIndex": 4,
"endIndex": 5,
"headIndex": 4,
"sentNum": 1,
"position": [
1,
3
],
"isRepresentativeMention": true
}
],
"2": [
{
"id": 2,
"text": "Jersey Shore Season 6 cast 's salaries",
"type": "NOMINAL",
"number": "PLURAL",
"gender": "UNKNOWN",
"animacy": "INANIMATE",
"startIndex": 1,
"endIndex": 8,
"headIndex": 7,
"sentNum": 1,
"position": [
1,
1
],
"isRepresentativeMention": true
}
],
"3": [
{
"id": 3,
"text": "Jersey Shore Season 6 cast 's",
"type": "PROPER",
"number": "SINGULAR",
"gender": "UNKNOWN",
"animacy": "INANIMATE",
"startIndex": 1,
"endIndex": 7,
"headIndex": 2,
"sentNum": 1,
"position": [
1,
2
],
"isRepresentativeMention": true
}
],
"4": [
{
"id": 4,
"text": "More than President Obama",
"type": "NOMINAL",
"number": "UNKNOWN",
"gender": "UNKNOWN",
"animacy": "INANIMATE",
"startIndex": 10,
"endIndex": 14,
"headIndex": 10,
"sentNum": 1,
"position": [
1,
4
],
"isRepresentativeMention": true
}
],
"5": [
{
"id": 5,
"text": "President Obama",
"type": "PROPER",
"number": "SINGULAR",
"gender": "MALE",
"animacy": "ANIMATE",
"startIndex": 12,
"endIndex": 14,
"headIndex": 13,
"sentNum": 1,
"position": [
1,
5
],
"isRepresentativeMention": true
}
]
}
}
|]
-- | Smoke test: decode the embedded 'headlines' sample with 'parseJsonDoc'
-- and print the resulting @Either String Document@ to stdout.
test :: IO ()
test = print (parseJsonDoc headlines)