module NLP.GenI (
ProgState(..), ProgStateRef, emptyProgState,
LexicalSelector,
runGeni,
GeniResults(..),
GeniResult(..), isSuccess, GeniError(..), GeniSuccess(..),
GeniLexSel(..),
ResultType(..),
initGeni, extractResults,
lemmaSentenceString, prettyResult,
showRealisations, histogram,
getTraces,
loadEverything,
Loadable(..),
loadLexicon,
loadGeniMacros,
loadTestSuite, parseSemInput,
loadRanking, BadInputException(..),
loadFromString,
)
where
import Control.Applicative ((<$>),(<*>))
import Control.DeepSeq
import Control.Exception
import Control.Monad.Error
import Data.Binary (Binary, decodeFile)
import Data.IORef (IORef, readIORef, modifyIORef)
import Data.List
import Data.Maybe (fromMaybe)
import Data.Monoid ( mappend, mempty )
import Data.Text ( Text )
import Data.Typeable (Typeable)
import System.CPUTime( getCPUTime )
import System.IO ( stderr )
import qualified Data.Map as Map
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Data.FullList ( fromFL )
import Text.JSON
import qualified System.IO.UTF8 as UTF8
import NLP.GenI.Configuration
( Params, customMorph, customSelector
, getFlagP, hasFlagP, hasOpt, Optimisation(NoConstraints)
, MacrosFlg(..), LexiconFlg(..), TestSuiteFlg(..)
, MorphInfoFlg(..), MorphCmdFlg(..)
, RankingConstraintsFlg(..)
, PartialFlg(..)
, FromStdinFlg(..), VerboseModeFlg(..)
, NoLoadTestSuiteFlg(..)
, RootFeatureFlg(..)
, TracesFlg(..)
, grammarType
, GrammarType(..)
)
import NLP.GenI.General
( histogram, geniBug, snd3, first3, ePutStr, ePutStrLn, eFlush,
)
import NLP.GenI.GeniVal
import NLP.GenI.LexicalSelection ( LexicalSelector, LexicalSelection(..), defaultLexicalSelector )
import NLP.GenI.Lexicon
import NLP.GenI.Morphology
import NLP.GenI.OptimalityTheory
import NLP.GenI.Parser (geniMacros, geniTagElems,
geniLexicon, geniTestSuite,
geniTestSuiteString, geniSemanticInput,
geniMorphInfo,
runParser,
ParseError,
)
import NLP.GenI.Pretty
import NLP.GenI.Semantics
import NLP.GenI.Statistics
import NLP.GenI.Tag ( TagElem, idname, tsemantics, ttrace, setTidnums )
import NLP.GenI.TestSuite ( TestCase(..) )
import NLP.GenI.TreeSchema
import NLP.GenI.Warning
import qualified NLP.GenI.Builder as B
data ProgState = ProgState
{ pa :: Params
, gr :: Macros
, le :: Lexicon
, morphinf :: MorphInputFn
, ranking :: OtRanking
, traces :: [Text]
}
type ProgStateRef = IORef ProgState
emptyProgState :: Params -> ProgState
emptyProgState args = ProgState
{ pa = args
, gr = []
, le = []
, morphinf = const Nothing
, traces = []
, ranking = []
}
loadEverything :: ProgStateRef -> IO()
loadEverything pstRef =
do pst <- readIORef pstRef
let config = pa pst
isMissing f = not $ hasFlagP f config
isNotPreanchored = grammarType config /= PreAnchored
isNotPrecompiled = grammarType config /= PreCompiled
useTestSuite = isMissing FromStdinFlg
&& isMissing NoLoadTestSuiteFlg
let errormsg =
concat $ intersperse ", " [ msg | (con, msg) <- errorlst, con ]
errorlst =
[ (isMissing RootFeatureFlg,
"a root feature [empty feature is fine if you are not using polarity filtering]")
, (isNotPrecompiled && isMissing MacrosFlg,
"a tree file")
, (isNotPreanchored && isMissing LexiconFlg,
"a lexicon file")
, (useTestSuite && isMissing TestSuiteFlg,
"a test suite") ]
unless (null errormsg) $ fail ("Please specify: " ++ errormsg)
case grammarType config of
PreAnchored -> return ()
PreCompiled -> return ()
_ -> loadGeniMacros pstRef >> return ()
when isNotPreanchored $ loadLexicon pstRef >> return ()
loadMorphInfo pstRef
when useTestSuite $ loadTestSuite pstRef >> return ()
loadTraces pstRef
loadRanking pstRef
class Loadable x where
lParse :: FilePath
-> String -> Either ParseError x
lSet :: x -> ProgState -> ProgState
lSummarise :: x -> String
lParseFromFile :: Loadable x => FilePath -> IO (Either ParseError x)
lParseFromFile f = lParse f `fmap` UTF8.readFile f
lSetState :: Loadable x => ProgStateRef -> x -> IO x
lSetState pstRef x = modifyIORef pstRef (lSet x) >> return x
throwOnParseError :: String -> Either ParseError x -> IO x
throwOnParseError descr (Left err) = throwIO (BadInputException descr err)
throwOnParseError _ (Right p) = return p
data BadInputException = BadInputException String ParseError
deriving (Show, Typeable)
instance Exception BadInputException
data L a = Loadable a => L
loadOrDie :: forall f a . (Eq f, Typeable f, Loadable a)
=> L a
-> (FilePath -> f)
-> String
-> ProgStateRef
-> IO a
loadOrDie L flg descr pstRef =
withFlagOrDie flg pstRef descr $ \f -> do
v <- verbosity pstRef
x <- withLoadStatus v f descr lParseFromFile
>>= throwOnParseError descr
>>= lSetState pstRef
return x
loadFromString :: Loadable a => ProgStateRef
-> String
-> String
-> IO a
loadFromString pstRef descr s =
throwOnParseError descr (lParse "" s) >>= lSetState pstRef
instance Loadable Lexicon where
lParse f = fmap toLexicon . runParser geniLexicon () f
where
fixEntry = finaliseVars ""
. sorter
toLexicon = map fixEntry
sorter l = l { isemantics = (sortByAmbiguity . isemantics) l }
lSet x p = p { le = x }
lSummarise x = show (length x) ++ " lemmas"
instance Loadable Macros where
lParse f = runParser geniMacros () f
lSet x p = p { gr = x }
lSummarise x = show (length x) ++ " schemata"
loadLexicon :: ProgStateRef -> IO Lexicon
loadLexicon = loadOrDie (L :: L Lexicon) LexiconFlg "lexicon"
loadGeniMacros :: ProgStateRef -> IO Macros
loadGeniMacros pstRef =
withFlagOrDie MacrosFlg pstRef descr $ \f -> do
v <- verbosity pstRef
withLoadStatus v f descr (parseFromFileMaybeBinary lParseFromFile)
>>= throwOnParseError "tree schemata"
>>= lSetState pstRef
where
descr = "trees"
loadOptional :: forall f a . (Eq f, Typeable f, Loadable a)
=> L a
-> (FilePath -> f)
-> String
-> ProgStateRef
-> IO ()
loadOptional L flg descr pstRef =
withFlagOrIgnore flg pstRef $ \f -> do
v <- verbosity pstRef
x <- withLoadStatus v f descr lParseFromFile
>>= throwOnParseError descr
>>= lSetState pstRef
let _ = x :: a
return ()
newtype MorphFnL = MorphFnL MorphInputFn
instance Loadable MorphFnL where
lParse f = fmap (MorphFnL . readMorph) . runParser geniMorphInfo () f
lSet (MorphFnL x) p = p { morphinf = x }
lSummarise _ = "morphinfo"
newtype TracesL = TracesL [Text]
instance Loadable TracesL where
lParse _ = Right . TracesL . T.lines . T.pack
lSet (TracesL xs) p = p { traces = xs }
lSummarise (TracesL xs) = show (length xs) ++ " traces"
instance Loadable OtRanking where
lParse _ = resultToEither2 . decode
lSet r p = p { ranking = r }
lSummarise _ = "ranking"
loadMorphInfo :: ProgStateRef -> IO ()
loadMorphInfo = loadOptional (L :: L MorphFnL) MorphInfoFlg "morphological info"
loadTraces :: ProgStateRef -> IO ()
loadTraces = loadOptional (L :: L TracesL) TracesFlg "traces"
loadRanking :: ProgStateRef -> IO ()
loadRanking = loadOptional (L :: L OtRanking) RankingConstraintsFlg "OT constraints"
resultToEither2 :: Result a -> Either ParseError a
resultToEither2 r =
case resultToEither r of
Left e -> runParser (fail e) () "" []
Right x -> Right x
newtype TestSuiteL = TestSuiteL [TestCase]
instance Loadable TestSuiteL where
lParse f s =
case runParser geniTestSuite () f s of
Left e -> Left e
Right sem -> case runParser geniTestSuiteString () f s of
Left e -> Left e
Right mStrs -> Right (TestSuiteL (zipWith cleanup sem mStrs))
where
cleanup tc str =
tc { tcSem = first3 sortSem (tcSem tc)
, tcSemString = str }
lSet (TestSuiteL _) p = p
lSummarise (TestSuiteL x) = show (length x) ++ " cases"
loadTestSuite :: ProgStateRef -> IO [TestCase]
loadTestSuite pstRef = do
TestSuiteL xs <- loadOrDie (L :: L TestSuiteL) TestSuiteFlg "test suite" pstRef
return xs
parseSemInput :: String -> Either ParseError SemInput
parseSemInput = fmap smooth . runParser geniSemanticInput () "semantics"
where
smooth (s,r,l) = (sortSem s, sort r, l)
withFlag :: forall f a . (Eq f, Typeable f)
=> (FilePath -> f)
-> ProgStateRef
-> IO a
-> (FilePath -> IO a)
-> IO a
withFlag flag pstRef z job =
do config <- pa `fmap` readIORef pstRef
case getFlagP flag config of
Nothing -> z
Just x -> job x
withFlagOrIgnore :: forall f . (Eq f, Typeable f)
=> (FilePath -> f)
-> ProgStateRef
-> (FilePath -> IO ())
-> IO ()
withFlagOrIgnore flag pstRef = withFlag flag pstRef (return ())
withFlagOrDie :: forall f a . (Eq f, Typeable f)
=> (FilePath -> f)
-> ProgStateRef
-> String
-> (FilePath -> IO a)
-> IO a
withFlagOrDie flag pstRef description =
withFlag flag pstRef (fail msg)
where
msg = "Please specify a " ++ description ++ " file!"
withLoadStatus :: Loadable a
=> Bool
-> FilePath
-> String
-> (FilePath -> IO (Either ParseError a))
-> IO (Either ParseError a)
withLoadStatus False f _ p = p f
withLoadStatus True f d p = do
ePutStr $ unwords [ "Loading", d, f ++ "... " ]
eFlush
mx <- p f
ePutStrLn $ either (const "ERROR") (\x -> lSummarise x ++ " loaded") mx
return mx
parseFromFileMaybeBinary :: Binary a
=> (FilePath -> IO (Either ParseError a))
-> FilePath
-> IO (Either ParseError a)
parseFromFileMaybeBinary p f =
if (".genib" `isSuffixOf` f)
then Right `fmap` decodeFile f
else p f
data GeniResults = GeniResults
{ grResults :: [GeniResult]
, grGlobalWarnings :: [Text]
, grStatistics :: Statistics
}
data GeniResult = GError GeniError
| GSuccess GeniSuccess
deriving (Ord, Eq)
isSuccess :: GeniResult -> Bool
isSuccess (GSuccess _) = True
isSuccess (GError _) = False
data GeniError = GeniError [Text]
deriving (Ord, Eq)
data GeniSuccess = GeniSuccess
{ grLemmaSentence :: LemmaPlusSentence
, grRealisations :: [Text]
, grResultType :: ResultType
, grWarnings :: [Text]
, grDerivation :: B.TagDerivation
, grOrigin :: Integer
, grLexSelection :: [GeniLexSel]
, grRanking :: Int
, grViolations :: [OtViolation]
} deriving (Ord, Eq)
data GeniLexSel = GeniLexSel
{ nlTree :: Text
, nlTrace :: [Text]
} deriving (Ord, Eq)
data ResultType = CompleteResult | PartialResult deriving (Ord, Eq)
instance Pretty GeniError where
pretty (GeniError xs) = T.intercalate "\n" $ map ("Error:" <+>) xs
runGeni :: ProgStateRef -> SemInput -> B.Builder st it Params -> IO (GeniResults,st)
runGeni pstRef semInput builder = do
pst <- readIORef pstRef
let config = pa pst
run = B.run builder
(initStuff, initWarns) <- initGeni pstRef semInput
start <- rnf initStuff `seq` getCPUTime
let (finalSt, stats) = run initStuff config
results <- extractResults pstRef builder finalSt
end <- rnf results `seq` getCPUTime
let elapsedTime = picosToMillis $! end start
diff = round (elapsedTime :: Double) :: Int
stats2 = updateMetrics (incrIntMetric "gen_time" (fromIntegral diff) ) stats
gresults = GeniResults { grResults = results
, grStatistics = stats2
, grGlobalWarnings = map showWarnings (fromGeniWarnings initWarns)
}
return (gresults, finalSt)
where
showWarnings = T.intercalate "\n" . showGeniWarning
extractResults :: ProgStateRef -> B.Builder st it Params -> st -> IO [GeniResult]
extractResults pstRef builder finalSt = do
config <- pa <$> readIORef pstRef
let uninflected = B.unpack builder finalSt
(rawResults, resultTy) =
if null uninflected && hasFlagP PartialFlg config
then (B.partial builder finalSt, PartialResult)
else (uninflected , CompleteResult)
status = B.finished builder finalSt
finaliseResults pstRef (resultTy, status, rawResults)
initGeni :: ProgStateRef -> SemInput -> IO (B.Input, GeniWarnings)
initGeni pstRef semInput_ = do
pst <- readIORef pstRef
let semInput = stripMorphStuff pst
. maybeRemoveConstraints pst
$ semInput_
selection <- runLexSelection pstRef semInput
let initStuff = B.Input
{ B.inSemInput = semInput
, B.inLex = lsLexEntries selection
, B.inCands = map (\c -> (c,1)) (lsAnchored selection)
}
return (initStuff, lsWarnings selection)
where
stripMorphStuff pst = first3 (stripMorphSem (morphinf pst))
maybeRemoveConstraints pst =
if hasOpt NoConstraints (pa pst) then removeConstraints else id
finaliseResults :: ProgStateRef -> (ResultType, B.GenStatus, [B.Output]) -> IO [GeniResult]
finaliseResults pstRef (ty, status, os) = do
pst <- readIORef pstRef
mss <- case getFlagP MorphCmdFlg (pa pst) of
Nothing -> let morph = fromMaybe (map sansMorph) (customMorph (pa pst))
in return (morph sentences)
Just cmd -> map snd `fmap` inflectSentencesUsingCmd cmd sentences
let unranked = zipWith (sansRanking pst) os mss
rank = rankResults (getTraces pst) grDerivation (ranking pst)
successes = map addRanking (rank unranked)
failures = case status of
B.Error str -> [GeniError [str]]
B.Finished -> []
B.Active -> []
return (map GError failures ++ map GSuccess successes)
where
sentences = map snd3 os
sansRanking pst (i,l,d) rs = GeniSuccess
{ grLemmaSentence = l
, grRealisations = moRealisations rs
, grWarnings = moWarnings rs
, grDerivation = d
, grLexSelection = map (\x -> GeniLexSel x (getTraces pst x)) (B.lexicalSelection d)
, grRanking = 1
, grViolations = []
, grResultType = ty
, grOrigin = i
}
addRanking (i,res,vs) = res { grViolations = vs, grRanking = i }
showRealisations :: [String] -> String
showRealisations [] = "(none)"
showRealisations xs = unlines . map sho . Map.toList . histogram $ xs
where
sho (x,1) = x
sho (x,c) = x ++ " (" ++ show c ++ " instances)"
lemmaSentenceString :: GeniSuccess -> Text
lemmaSentenceString = T.unwords . map lpLemma . grLemmaSentence
prettyResult :: ProgState -> GeniSuccess -> Text
prettyResult pst nr =
T.intercalate "\n" . map showOne . grRealisations $ nr
where
showOne str = pretty theRanking <> ". " <> str <> "\n" <> violations
violations = prettyViolations tracesFn verbose (grViolations nr)
theRanking = grRanking nr
verbose = hasFlagP VerboseModeFlg (pa pst)
tracesFn = getTraces pst
getTraces :: ProgState -> Text -> [Text]
getTraces pst tname =
filt $ concat [ ptrace t | t <- gr pst, pidname t == readPidname tname ]
where
filt = case traces pst of
[] -> id
theTs -> filter (`elem` theTs)
readPidname :: Text -> Text
readPidname n =
case T.splitOn ":" n of
(_:_:p:_) -> p
_ -> geniBug "NLP.GenI.readPidname or combineName are broken"
runLexSelection :: ProgStateRef -> SemInput -> IO LexicalSelection
runLexSelection pstRef (tsem,_,litConstrs) = do
pst <- readIORef pstRef
let config = pa pst
verbose = hasFlagP VerboseModeFlg config
selector <- getLexicalSelector pstRef
selection <- selector (gr pst) (le pst) tsem
let lexCand = lsLexEntries selection
candFinal = finaliseLexSelection (morphinf pst) tsem litConstrs (lsAnchored selection)
when verbose $ T.hPutStrLn stderr . T.unlines $
"Lexical items selected:"
: map (indent . showLexeme . fromFL . iword) lexCand
++ ["Trees anchored (family) :"]
++ map (indent . idname) candFinal
let semWarnings = case missingLiterals candFinal tsem of
[] -> []
xs -> [NoLexSelection xs]
return $ selection { lsAnchored = candFinal
, lsWarnings = mkGeniWarnings semWarnings `mappend` lsWarnings selection
}
where
indent x = ' ' `T.cons` x
getLexicalSelector :: ProgStateRef -> IO LexicalSelector
getLexicalSelector pstRef = do
config <- pa <$> readIORef pstRef
case (customSelector config, grammarType config) of
(Just s, _) -> return s
(Nothing, PreAnchored) -> mkPreAnchoredLexicalSelector pstRef
(Nothing, _) -> return defaultLexicalSelector
missingLiterals :: [TagElem] -> [Literal GeniVal] -> [Literal GeniVal]
missingLiterals cands tsem =
tsem \\ (nub $ concatMap tsemantics cands)
finaliseLexSelection :: MorphInputFn -> Sem -> [LitConstr] -> [TagElem] -> [TagElem]
finaliseLexSelection morph tsem litConstrs =
setTidnums . considerCoherency . considerLc . considerMorph
where
considerMorph = attachMorph morph tsem
matchesLc t = all (`elem` myTrace) constrs
where constrs = concat [ cs | (l,cs) <- litConstrs, l `elem` mySem ]
mySem = tsemantics t
myTrace = ttrace t
considerLc = filter matchesLc
considerCoherency = filter (all (`elem` tsem) . tsemantics)
newtype PreAnchoredL = PreAnchoredL [TagElem]
instance Loadable PreAnchoredL where
lParse f = fmap PreAnchoredL
. runParser geniTagElems () f
lSet _ p = p
lSummarise (PreAnchoredL xs) = show (length xs) ++ " trees"
readPreAnchored :: ProgStateRef -> IO [TagElem]
readPreAnchored pstRef = do
PreAnchoredL xs <- loadOrDie (L :: L PreAnchoredL)
MacrosFlg "preanchored trees" pstRef
return xs
mkPreAnchoredLexicalSelector :: ProgStateRef -> IO LexicalSelector
mkPreAnchoredLexicalSelector pstRef = do
xs <- readPreAnchored pstRef
return (\_ _ _ -> return (LexicalSelection xs [] mempty))
verbosity :: ProgStateRef -> IO Bool
verbosity = fmap (hasFlagP VerboseModeFlg . pa)
. readIORef
instance JSON GeniResult where
readJSON j =
case readJSON j of
Ok s -> Ok (GSuccess s)
Error _ -> GError `fmap` readJSON j
showJSON (GSuccess x) = showJSON x
showJSON (GError x) = showJSON x
instance JSON GeniSuccess where
readJSON j = do
jo <- fromJSObject `fmap` readJSON j
let field x = maybe (fail $ "Could not find: " ++ x) readJSON
$ lookup x jo
GeniSuccess <$> field "raw"
<*> field "realisations"
<*> field "result-type"
<*> field "warnings"
<*> field "derivation"
<*> field "chart-item"
<*> field "lexical-selection"
<*> field "ranking"
<*> field "violations"
showJSON nr =
JSObject . toJSObject $ [ ("raw", showJSON $ grLemmaSentence nr)
, ("realisations", showJSONs $ grRealisations nr)
, ("derivation", showJSONs $ grDerivation nr)
, ("lexical-selection", showJSONs $ grLexSelection nr)
, ("ranking", showJSON $ grRanking nr)
, ("violations", showJSONs $ grViolations nr)
, ("result-type", showJSON $ grResultType nr)
, ("chart-item", showJSON $ grOrigin nr)
, ("warnings", showJSONs $ grWarnings nr)
]
instance JSON GeniError where
readJSON j =
do jo <- fromJSObject `fmap` readJSON j
let field x = maybe (fail $ "Could not find: " ++ x) readJSON
$ lookup x jo
GeniError <$> field "errors"
showJSON (GeniError xs) =
JSObject . toJSObject $ [ ("errors", showJSON xs) ]
instance JSON ResultType where
readJSON j =
do js <- fromJSString `fmap` readJSON j
case js of
"partial" -> return PartialResult
"complete" -> return CompleteResult
ty -> fail $ "unknown result type: " ++ ty
showJSON CompleteResult = JSString $ toJSString "complete"
showJSON PartialResult = JSString $ toJSString "partial"
instance JSON GeniLexSel where
readJSON j =
do jo <- fromJSObject `fmap` readJSON j
let field x = maybe (fail $ "Could not find: " ++ x) readJSON
$ lookup x jo
GeniLexSel <$> field "lex-item"
<*> field "trace"
showJSON x =
JSObject . toJSObject $ [ ("lex-item", showJSON $ nlTree x)
, ("trace", showJSONs $ nlTrace x)
]
picosToMillis :: Integer -> Double
picosToMillis t = realToFrac t / (10^(9 :: Int))
instance NFData GeniResult where
rnf (GError x1) = rnf x1 `seq` ()
rnf (GSuccess x1) = rnf x1 `seq` ()
instance NFData GeniSuccess where
rnf (GeniSuccess x1 x2 x3 x4 x5 x6 x7 x8 x9)
= rnf x1 `seq`
rnf x2 `seq`
rnf x3 `seq`
rnf x4 `seq`
rnf x5 `seq` rnf x6 `seq` rnf x7 `seq` rnf x8 `seq` rnf x9 `seq` ()
instance NFData GeniError where
rnf (GeniError x1) = rnf x1 `seq` ()
instance NFData ResultType where
rnf CompleteResult = ()
rnf PartialResult = ()
instance NFData GeniLexSel where
rnf (GeniLexSel x1 x2) = rnf x1 `seq` rnf x2 `seq` ()