{-# LANGUAGE QuasiQuotes #-}
{-# LANGUAGE DeriveGeneric #-}
{-# LANGUAGE DuplicateRecordFields #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}

-- | This module provides a handy wrapper around the CoreNLP project's
-- command-line utility <https://nlp.stanford.edu/software/corenlp.html>,
-- and a parser for some of its output formats.
module NLP.CoreNLP
  ( launchCoreNLP
  , parseJsonDoc
  , Dependency(..)
  , Entitymention(..)
  , Token(..)
  , Sentence(..)
  , PennPOS(..)
  , Coref(..)
  , CorefsId
  , Corefs
  , Document(..)
  , NamedEntity(..)
  , ParsedDocument(..)
  , LaunchOptions(..)
  -- * Internal
  , extractSuccessDocs
  , test
  ) where

import Control.Applicative
import Control.Concurrent
import Control.Concurrent.Async
import Control.Exception.Safe
import Control.Monad (forM, when)
import qualified Crypto.Hash as Crypto
import qualified Data.Aeson as J
import Data.Aeson (FromJSON, ToJSON)
import Data.Default
import Data.Either
import Data.HashMap.Strict (HashMap)
import Data.List.Split (chunksOf)
import Data.Maybe
import Data.Semigroup
import qualified Data.Store as Store
import Data.String.Class as S
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Database.RocksDB.Base as Rocks
import GHC.Generics (Generic)
import System.Directory
import System.Exit
import System.IO (hFlush)
import System.IO.Temp
import System.Process
import Text.RawString.QQ (r)

-- | Aeson options with a small workaround for JSON keys we cannot use
-- directly as Haskell field names, e.g. the reserved keyword @type@,
-- which is stored in the @type_@ record field.
jsonOpts :: J.Options
jsonOpts = J.defaultOptions {J.fieldLabelModifier = fm}
  where
    fm :: String -> String
    fm "type_" = "type"
    fm x = x

data Dependency = Dependency
  { dep :: Text
  , governor :: Int
  , governorGloss :: Text
  , dependent :: Int
  , dependentGloss :: Text
  } deriving (Show, Eq, Generic)

instance Store.Store Dependency

instance FromJSON Dependency where
  parseJSON = J.genericParseJSON jsonOpts

instance ToJSON Dependency where
  toJSON = J.genericToJSON jsonOpts

data Entitymention = Entitymention
  { docTokenBegin :: Int
  , docTokenEnd :: Int
  , tokenBegin :: Int
  , tokenEnd :: Int
  , text :: Text
  , characterOffsetBegin :: Int
  , characterOffsetEnd :: Int
  , ner :: Text
  , normalizedNER :: Maybe Text
  } deriving (Show, Eq, Generic)

instance Store.Store Entitymention

instance FromJSON Entitymention where
  parseJSON = J.genericParseJSON jsonOpts

instance ToJSON Entitymention where
  toJSON = J.genericToJSON jsonOpts
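-- A minimal decoding sketch (not part of the library's API, not exported):
-- the generic Aeson instances above, driven by 'jsonOpts', decode a single
-- dependency edge straight from the JSON shape CoreNLP emits. The literal
-- below is a made-up example value.
exampleDependency :: Either String Dependency
exampleDependency =
  J.eitherDecode
    "{\"dep\":\"nsubj\",\"governor\":8,\"governorGloss\":\"revealed\",\"dependent\":7,\"dependentGloss\":\"salaries\"}"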
"-LRB-"? No idea what's this | RRB -- ^ "-RRB-"? No idea what's this | PosPunctuation -- ^ anyOf ".:,''$#$,", sometimes few together deriving (Show, Eq, Generic) instance Store.Store PennPOS instance FromJSON PennPOS where parseJSON (J.String "WP$") = pure WPDollar parseJSON (J.String "PRP$") = pure PRPDollar parseJSON (J.String "-LRB-") = pure LRB parseJSON (J.String "-RRB-") = pure RRB parseJSON x = J.genericParseJSON jsonOpts x <|> parsePunctuation x where parsePunctuation (J.String _) = pure PosPunctuation parsePunctuation _ = fail "Expecting POS to be a String" instance ToJSON PennPOS where toJSON WPDollar = J.String "WP$" toJSON PRPDollar = J.String "PRP$" toJSON PosPunctuation = J.String "." toJSON LRB = J.String "-LRB-" toJSON RRB = J.String "-RRB-" toJSON x = J.genericToJSON jsonOpts x -- | See https:\/\/stanfordnlp.github.io\/CoreNLP\/ner.html data NamedEntity = PERSON | LOCATION | ORGANIZATION | MISC | MONEY | NUMBER | ORDINAL | PERCENT | DATE | TIME | DURATION | SET | EMAIL | URL | CITY | STATE_OR_PROVINCE | COUNTRY | NATIONALITY | RELIGION | TITLE -- ^ Job title | IDEOLOGY | CRIMINAL_CHARGE | CAUSE_OF_DEATH | O -- ^ Not a named entity? TODO: check somehow deriving (Show, Eq, Generic) instance Store.Store NamedEntity instance FromJSON NamedEntity where parseJSON = J.genericParseJSON jsonOpts instance ToJSON NamedEntity where toJSON = J.genericToJSON jsonOpts data Token = Token { index :: Int , word :: Text , originalText :: Text , lemma :: Text , characterOffsetBegin :: Int , characterOffsetEnd :: Int , pos :: PennPOS , ner :: NamedEntity , speaker :: Text , before :: Text , after :: Text } deriving (Show, Eq, Generic) instance Store.Store Token instance FromJSON Token where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Token where toJSON = J.genericToJSON jsonOpts data Sentence = Sentence { index :: Int , parse :: Text , basicDependencies :: [Dependency] , enhancedDependencies :: [Dependency] , enhancedPlusPlusDependencies :: [Dependency] , entitymentions :: [Entitymention] , tokens :: [Token] } deriving (Show, Eq, Generic) instance Store.Store Sentence instance FromJSON Sentence where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Sentence where toJSON = J.genericToJSON jsonOpts data Coref = Coref { id :: Int , text :: Text , type_ :: Text , number :: Text , gender :: Text , animacy :: Text , startIndex :: Int , endIndex :: Int , headIndex :: Int , sentNum :: Int , position :: [Int] , isRepresentativeMention :: Bool } deriving (Show, Eq, Generic) instance Store.Store Coref instance FromJSON Coref where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Coref where toJSON = J.genericToJSON jsonOpts type CorefsId = Text type Corefs = HashMap CorefsId [Coref] data Document = Document { docId :: Text , sentences :: [Sentence] , corefs :: Corefs } deriving (Show, Eq, Generic) instance Store.Store Document instance FromJSON Document where parseJSON = J.genericParseJSON jsonOpts instance ToJSON Document where toJSON = J.genericToJSON jsonOpts -- | Parse JSON output of CoreNLP. See 'headlines' source for an example JSON input. parseJsonDoc :: Text -> Either String Document parseJsonDoc = J.eitherDecode . S.fromText -- | Additional options data LaunchOptions = LaunchOptions { cacheDb :: Maybe FilePath -- ^ Optional path to a RocksDB file which will be used as a cache , memSizeMb :: Int -- ^ The "-Xmx" Java parameter , chunkSize :: Int -- ^ Number of elements by which to split the input. 
-- | Additional options for 'launchCoreNLP'
data LaunchOptions = LaunchOptions
  { cacheDb :: Maybe FilePath -- ^ Optional path to a RocksDB database used as a cache of already-parsed documents
  , memSizeMb :: Int -- ^ Java heap size in megabytes (the @-Xmx@ parameter)
  , chunkSize :: Int -- ^ Size of the chunks the input list is split into. Smaller chunks mean that already-processed work stays cached even if a later chunk fails
  , numWorkers :: Int -- ^ Number of CoreNLP processes to run in parallel
  } deriving (Show, Eq)

instance Default LaunchOptions where
  def = LaunchOptions Nothing 8096 1000 2

-- | Adapted from <https://stackoverflow.com/questions/29155068/running-parallel-url-downloads-with-a-worker-pool-in-haskell>
traverseThrottled :: Traversable t => Int -> (a -> IO b) -> t a -> IO (t b)
traverseThrottled concLevel action taskContainer = do
  sem <- newQSem concLevel
  let throttledAction = bracket_ (waitQSem sem) (signalQSem sem) . action
  runConcurrently (traverse (Concurrently . throttledAction) taskContainer)

-- | Launch CoreNLP over your inputs. Every piece of 'Text' is written to a
-- separate temporary file, a CoreNLP subprocess is run over those files,
-- and the resulting JSON is parsed back.
launchCoreNLP ::
     FilePath -- ^ Path to the directory where you extracted the CoreNLP project
  -> LaunchOptions
  -> [Text] -- ^ List of inputs
  -> IO [ParsedDocument] -- ^ List of parsed results
launchCoreNLP fp' LaunchOptions {..} texts' = do
  let fp = ensureEndSlash fp'
  case cacheDb of
    Nothing -> go Nothing fp texts'
    Just cacheFp ->
      bracket
        (Rocks.open cacheFp def {Rocks.createIfMissing = True})
        Rocks.close
        (\db -> go (Just db) fp texts')
  where
    ensureEndSlash :: String -> String
    ensureEndSlash t =
      if t !! (Prelude.length t - 1) == '/'
        then t
        else t <> "/"
    -- Write the inputs into a temporary directory, run workers chunk by
    -- chunk, and prepend whatever was already in the cache.
    go :: Maybe Rocks.DB -> FilePath -> [Text] -> IO [ParsedDocument]
    go mcacheDb fp texts'' = do
      withSystemTempDirectory "corenlp-parser" $ \tempDir ->
        withCurrentDirectory tempDir $ do
          (cachedDocs, texts) <- getCachedDocs mcacheDb texts''
          Prelude.putStrLn $
            "Got documents out of cache: " ++ show (Prelude.length cachedDocs)
          res <-
            Prelude.concat <$>
            traverseThrottled
              numWorkers
              (goByChunk tempDir mcacheDb fp)
              (zip ([1 ..] :: [Integer]) (chunksOf chunkSize texts))
          return (cachedDocs ++ res)
    -- Process one chunk: write each text to its own file, run CoreNLP over
    -- a file list, then read back and parse the generated .json outputs.
    goByChunk tempDir mcacheDb fp (chunkId, texts) = do
      Prelude.putStrLn "> goByChunk started"
      if Prelude.length texts <= 0
        then return []
        else do
          Prelude.putStrLn $ "Temp dir used is: " <> tempDir
          tmpFileNames <-
            forM (zip [1 ..] texts) $ \(i :: Integer, txt) -> do
              let fname = "text-" ++ show chunkId ++ "-" ++ show i ++ ".txt"
              T.writeFile (tempDir ++ "/" ++ fname) txt
              return fname
          withSystemTempFile "filelist.txt" $ \filelistTxt hfilelistTxt -> do
            Prelude.putStrLn $ "Filelist.txt: " ++ show filelistTxt
            let filesList =
                  T.unlines
                    (map (\x -> S.toText (tempDir <> "/" <> x)) tmpFileNames)
            T.hPutStrLn hfilelistTxt (T.strip filesList)
            hFlush hfilelistTxt
            let cmd =
                  "java --add-modules java.se.ee -cp \"" ++
                  fp ++
                  "*\" -Xmx" ++
                  show memSizeMb ++
                  "m edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,dcoref -outputFormat json -filelist " <>
                  filelistTxt
            Prelude.putStrLn $ "Running a command: " ++ cmd
            let spec = shell cmd
            (_, _, _, processHandle) <- createProcess spec
            code <- waitForProcess processHandle
            when (code /= ExitSuccess) (error (show code))
            let tmpFileNamesJson = map (<> ".json") tmpFileNames
            results <- forM tmpFileNamesJson $ \fname -> T.readFile fname
            rv <- extractSuccessDocs (zip texts (map parseJsonDoc results))
            cacheResults mcacheDb rv
            return rv
    -- Split the inputs into already-cached parses and texts that still
    -- need to be processed.
    getCachedDocs :: Maybe Rocks.DB -> [Text] -> IO ([ParsedDocument], [Text])
    getCachedDocs Nothing texts = return ([], texts)
    getCachedDocs (Just rocks) texts = do
      (rv :: [Either ParsedDocument Text]) <-
        forM texts $ \t -> do
          res <- Rocks.get rocks def (hash t)
          case res of
            Nothing -> return (Right t)
            Just bs ->
              case Store.decode bs of
                Left _e -> return (Right t)
                Right x -> return (Left x)
      return (partitionEithers rv)
    cacheResults Nothing _ = return ()
    cacheResults (Just rocks) results = do
      let batch = map resultToOp results
      Rocks.write rocks def batch
      return ()
    resultToOp pd@ParsedDocument {..} = Rocks.Put (hash origText) (Store.encode pd)
    -- Cache key: SHA256 of the input text.
    hash t =
      S.fromString
        (show (Crypto.hash (S.toStrictByteString t) :: Crypto.Digest Crypto.SHA256))
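-- A minimal usage sketch (not exported). It assumes CoreNLP is unpacked
-- under @/opt/corenlp/@ and that a RocksDB cache named @corenlp-cache@ is
-- wanted; both names are hypothetical, and all other options come from
-- 'def'.
exampleLaunch :: IO [ParsedDocument]
exampleLaunch =
  launchCoreNLP
    "/opt/corenlp/"
    def {cacheDb = Just "corenlp-cache", numWorkers = 4}
    [ "Jersey Shore Season 6 cast's salaries revealed; More than President Obama!"
    , "A short second document."
    ]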
-- | Datatype holding the original text and the parsed 'Document'
data ParsedDocument = ParsedDocument
  { origText :: Text
  , doc :: Document
  } deriving (Show, Eq, Generic)

instance Store.Store ParsedDocument

-- | Simple function to extract the successful results and print out errors
extractSuccessDocs :: [(Text, Either String Document)] -> IO [ParsedDocument]
extractSuccessDocs = fmap catMaybes . mapM f
  where
    f (_t, Left err) =
      Prelude.putStrLn ("Error parsing CoreNLP result: " ++ err) >> return Nothing
    f (t, Right r') = return (Just (ParsedDocument t r'))

headlines :: Text
headlines =
  [r| { "docId": "headlines.txt",
  "sentences": [ { "index": 0,
  "parse": "(ROOT\n (S\n (NP\n (NP (NNP Jersey) (NNP Shore) (NN Season) (CD 6) (NN cast) (POS 's))\n (NNS salaries))\n (VP\n (VP (VBD revealed))\n (: ;)\n (NP\n (NP (JJR More))\n (PP (IN than)\n (NP (NNP President) (NNP Obama)))))\n (. !)))",
  "basicDependencies": [
    { "dep": "ROOT", "governor": 0, "governorGloss": "ROOT", "dependent": 8, "dependentGloss": "revealed" },
    { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 1, "dependentGloss": "Jersey" },
    { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 2, "dependentGloss": "Shore" },
    { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 3, "dependentGloss": "Season" },
    { "dep": "nummod", "governor": 5, "governorGloss": "cast", "dependent": 4, "dependentGloss": "6" },
    { "dep": "nmod:poss", "governor": 7, "governorGloss": "salaries", "dependent": 5, "dependentGloss": "cast" },
    { "dep": "case", "governor": 5, "governorGloss": "cast", "dependent": 6, "dependentGloss": "'s" },
    { "dep": "nsubj", "governor": 8, "governorGloss": "revealed", "dependent": 7, "dependentGloss": "salaries" },
    { "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 9, "dependentGloss": ";" },
    { "dep": "dobj", "governor": 8, "governorGloss": "revealed", "dependent": 10, "dependentGloss": "More" },
    { "dep": "case", "governor": 13, "governorGloss": "Obama", "dependent": 11, "dependentGloss": "than" },
    { "dep": "compound", "governor": 13, "governorGloss": "Obama", "dependent": 12, "dependentGloss": "President" },
    { "dep": "nmod", "governor": 10, "governorGloss": "More", "dependent": 13, "dependentGloss": "Obama" },
    { "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 14, "dependentGloss": "!" } ],
  "enhancedDependencies": [
    { "dep": "ROOT", "governor": 0, "governorGloss": "ROOT", "dependent": 8, "dependentGloss": "revealed" },
    { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 1, "dependentGloss": "Jersey" },
    { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 2, "dependentGloss": "Shore" },
    { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 3, "dependentGloss": "Season" },
    { "dep": "nummod", "governor": 5, "governorGloss": "cast", "dependent": 4, "dependentGloss": "6" },
    { "dep": "nmod:poss", "governor": 7, "governorGloss": "salaries", "dependent": 5, "dependentGloss": "cast" },
    { "dep": "case", "governor": 5, "governorGloss": "cast", "dependent": 6, "dependentGloss": "'s" },
    { "dep": "nsubj", "governor": 8, "governorGloss": "revealed", "dependent": 7, "dependentGloss": "salaries" },
    { "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 9, "dependentGloss": ";" },
    { "dep": "dobj", "governor": 8, "governorGloss": "revealed", "dependent": 10, "dependentGloss": "More" },
    { "dep": "case", "governor": 13, "governorGloss": "Obama", "dependent": 11, "dependentGloss": "than" },
    { "dep": "compound", "governor": 13, "governorGloss": "Obama", "dependent": 12, "dependentGloss": "President" },
    { "dep": "nmod:than", "governor": 10, "governorGloss": "More", "dependent": 13, "dependentGloss": "Obama" },
    { "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 14, "dependentGloss": "!"
} ], "enhancedPlusPlusDependencies": [ { "dep": "ROOT", "governor": 0, "governorGloss": "ROOT", "dependent": 8, "dependentGloss": "revealed" }, { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 1, "dependentGloss": "Jersey" }, { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 2, "dependentGloss": "Shore" }, { "dep": "compound", "governor": 5, "governorGloss": "cast", "dependent": 3, "dependentGloss": "Season" }, { "dep": "nummod", "governor": 5, "governorGloss": "cast", "dependent": 4, "dependentGloss": "6" }, { "dep": "nmod:poss", "governor": 7, "governorGloss": "salaries", "dependent": 5, "dependentGloss": "cast" }, { "dep": "case", "governor": 5, "governorGloss": "cast", "dependent": 6, "dependentGloss": "'s" }, { "dep": "nsubj", "governor": 8, "governorGloss": "revealed", "dependent": 7, "dependentGloss": "salaries" }, { "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 9, "dependentGloss": ";" }, { "dep": "dobj", "governor": 8, "governorGloss": "revealed", "dependent": 10, "dependentGloss": "More" }, { "dep": "case", "governor": 13, "governorGloss": "Obama", "dependent": 11, "dependentGloss": "than" }, { "dep": "compound", "governor": 13, "governorGloss": "Obama", "dependent": 12, "dependentGloss": "President" }, { "dep": "nmod:than", "governor": 10, "governorGloss": "More", "dependent": 13, "dependentGloss": "Obama" }, { "dep": "punct", "governor": 8, "governorGloss": "revealed", "dependent": 14, "dependentGloss": "!" } ], "entitymentions": [ { "docTokenBegin": 3, "docTokenEnd": 4, "tokenBegin": 3, "tokenEnd": 4, "text": "6", "characterOffsetBegin": 20, "characterOffsetEnd": 21, "ner": "NUMBER", "normalizedNER": "6.0" }, { "docTokenBegin": 11, "docTokenEnd": 12, "tokenBegin": 11, "tokenEnd": 12, "text": "President", "characterOffsetBegin": 58, "characterOffsetEnd": 67, "ner": "TITLE" }, { "docTokenBegin": 12, "docTokenEnd": 13, "tokenBegin": 12, "tokenEnd": 13, "text": "Obama", "characterOffsetBegin": 68, "characterOffsetEnd": 73, "ner": "PERSON" } ], "tokens": [ { "index": 1, "word": "Jersey", "originalText": "Jersey", "lemma": "Jersey", "characterOffsetBegin": 0, "characterOffsetEnd": 6, "pos": "NNP", "ner": "O", "speaker": "PER0", "before": "", "after": " " }, { "index": 2, "word": "Shore", "originalText": "Shore", "lemma": "Shore", "characterOffsetBegin": 7, "characterOffsetEnd": 12, "pos": "NNP", "ner": "O", "speaker": "PER0", "before": " ", "after": " " }, { "index": 3, "word": "Season", "originalText": "Season", "lemma": "season", "characterOffsetBegin": 13, "characterOffsetEnd": 19, "pos": "NN", "ner": "O", "speaker": "PER0", "before": " ", "after": " " }, { "index": 4, "word": "6", "originalText": "6", "lemma": "6", "characterOffsetBegin": 20, "characterOffsetEnd": 21, "pos": "CD", "ner": "NUMBER", "normalizedNER": "6.0", "speaker": "PER0", "before": " ", "after": " " }, { "index": 5, "word": "cast", "originalText": "cast", "lemma": "cast", "characterOffsetBegin": 22, "characterOffsetEnd": 26, "pos": "NN", "ner": "O", "speaker": "PER0", "before": " ", "after": "" }, { "index": 6, "word": "'s", "originalText": "'s", "lemma": "'s", "characterOffsetBegin": 26, "characterOffsetEnd": 28, "pos": "POS", "ner": "O", "speaker": "PER0", "before": "", "after": " " }, { "index": 7, "word": "salaries", "originalText": "salaries", "lemma": "salary", "characterOffsetBegin": 29, "characterOffsetEnd": 37, "pos": "NNS", "ner": "O", "speaker": "PER0", "before": " ", "after": " " }, { "index": 8, "word": "revealed", 
"originalText": "revealed", "lemma": "reveal", "characterOffsetBegin": 38, "characterOffsetEnd": 46, "pos": "VBD", "ner": "O", "speaker": "PER0", "before": " ", "after": "" }, { "index": 9, "word": ";", "originalText": ";", "lemma": ";", "characterOffsetBegin": 46, "characterOffsetEnd": 47, "pos": ":", "ner": "O", "speaker": "PER0", "before": "", "after": " " }, { "index": 10, "word": "More", "originalText": "More", "lemma": "more", "characterOffsetBegin": 48, "characterOffsetEnd": 52, "pos": "JJR", "ner": "O", "speaker": "PER0", "before": " ", "after": " " }, { "index": 11, "word": "than", "originalText": "than", "lemma": "than", "characterOffsetBegin": 53, "characterOffsetEnd": 57, "pos": "IN", "ner": "O", "speaker": "PER0", "before": " ", "after": " " }, { "index": 12, "word": "President", "originalText": "President", "lemma": "President", "characterOffsetBegin": 58, "characterOffsetEnd": 67, "pos": "NNP", "ner": "TITLE", "speaker": "PER0", "before": " ", "after": " " }, { "index": 13, "word": "Obama", "originalText": "Obama", "lemma": "Obama", "characterOffsetBegin": 68, "characterOffsetEnd": 73, "pos": "NNP", "ner": "PERSON", "speaker": "PER0", "before": " ", "after": "" }, { "index": 14, "word": "!", "originalText": "!", "lemma": "!", "characterOffsetBegin": 73, "characterOffsetEnd": 74, "pos": ".", "ner": "O", "speaker": "PER0", "before": "", "after": "\n" } ] } ], "corefs": { "1": [ { "id": 1, "text": "6", "type": "PROPER", "number": "SINGULAR", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 4, "endIndex": 5, "headIndex": 4, "sentNum": 1, "position": [ 1, 3 ], "isRepresentativeMention": true } ], "2": [ { "id": 2, "text": "Jersey Shore Season 6 cast 's salaries", "type": "NOMINAL", "number": "PLURAL", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 1, "endIndex": 8, "headIndex": 7, "sentNum": 1, "position": [ 1, 1 ], "isRepresentativeMention": true } ], "3": [ { "id": 3, "text": "Jersey Shore Season 6 cast 's", "type": "PROPER", "number": "SINGULAR", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 1, "endIndex": 7, "headIndex": 2, "sentNum": 1, "position": [ 1, 2 ], "isRepresentativeMention": true } ], "4": [ { "id": 4, "text": "More than President Obama", "type": "NOMINAL", "number": "UNKNOWN", "gender": "UNKNOWN", "animacy": "INANIMATE", "startIndex": 10, "endIndex": 14, "headIndex": 10, "sentNum": 1, "position": [ 1, 4 ], "isRepresentativeMention": true } ], "5": [ { "id": 5, "text": "President Obama", "type": "PROPER", "number": "SINGULAR", "gender": "MALE", "animacy": "ANIMATE", "startIndex": 12, "endIndex": 14, "headIndex": 13, "sentNum": 1, "position": [ 1, 5 ], "isRepresentativeMention": true } ] } } |] test :: IO () test = do print (parseJsonDoc headlines)