src/NLP/GenI/OptimalityTheory.lhs

% GenI surface realiser % Copyright (C) 2009 Eric Kow % % This program is free software; you can redistribute it and/or % modify it under the terms of the GNU General Public License % as published by the Free Software Foundation; either version 2 % of the License, or (at your option) any later version. % % This program is distributed in the hope that it will be useful, % but WITHOUT ANY WARRANTY; without even the implied warranty of % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the % GNU General Public License for more details. % % You should have received a copy of the GNU General Public License % along with this program; if not, write to the Free Software % Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. \chapter{Ranking output} \label{cha:ranking} \begin{code}

module NLP.GenI.OptimalityTheory
   ( -- * Input
     OtConstraint(..), OtRanking,
     -- * Output
     GetTraces, OtResult, OtViolation, RankedOtConstraint(..),
     rankResults, otWarnings,
     -- * Display
     prettyViolations,prettyRank
   )
 where

import Control.Applicative ( (<$>), (<*>) )
import Control.Arrow ( first )
import Data.Function (on)
import Data.Char ( isSpace )
import Data.List (nub, partition, sort, sortBy, groupBy, intersperse, (\\), unfoldr )
import Text.JSON

import NLP.GenI.Btypes ( Macros, ptrace )
import qualified NLP.GenI.Builder as B

\end{code} If your tree schemata are annotated with traces (TODO link to traces and metagrammars), you can re-use them as a basis for ranking the output produced by GenI. The basic idea is to supply a list of either positive, negative or negative conjunction constraints. For users familiar with Haskell, the constraints are described with the following type: \begin{includecodeinmanual} \begin{code}

data OtConstraint = PositiveC String -- ^ the trace must appear
                  | NegativeC String -- ^ the trace must NOT appear
                  | NegativeConjC [String] -- ^ these traces must not appear AT THE SAME TIME
 deriving (Show, Eq)

\end{code} \end{includecodeinmanual} Roughly speaking the more highly ranked the constraint, the greater the impact of a violation of that constraint will be. See section \ref{sec:ranking-procedure} for more details on the ranking procedure. \begin{code}

data RankedOtConstraint = RankedOtConstraint Int OtConstraint
 deriving (Show, Eq)

instance Ord RankedOtConstraint where
 compare (RankedOtConstraint r1 _) (RankedOtConstraint r2 _) = compare r1 r2

-- | Same as 'RankedOtConstraint' with the sorting inverted
newtype RankedOtConstraint2 = RankedOtConstraint2 RankedOtConstraint deriving Eq

instance Ord RankedOtConstraint2 where
 compare (RankedOtConstraint2 x) (RankedOtConstraint2 y) = compare y x


type OtRanking = [[OtConstraint]]

data OtViolation = OtViolation { otLexName            :: String -- ^ empty for global
                               , otConstraintViolated :: RankedOtConstraint }
 deriving (Show, Eq, Ord)

data LexItem = LexItem
       { lLexname :: String
       , lTraces :: [String]
       } deriving (Ord, Eq, Show)

type GetTraces = String -> [String]
type OtResult x = (Int,x,[OtViolation])

\end{code} \section{Input format} Constraints are expressed in JSON as a list of \jargon{ranking levels}. A ranking level is a list of constraints that should be assigned the same rank. In lieu of a formal description, we provide an example below: \small{NB: Either the JSON format or the JSON parser used by GenI is strict enough to refuse initial whitespace in this file.} \begin{verbatim} [ [{"neg-constraint": "dian0Vn1dePassive"}, {"pos-constraint": "CanonicalSubject"}], [{"neg-conj-constraint": ["InvertedNominalSubject", "CanonicalSententialObjectFinite"]}], [{"neg-conj-constraint": ["InvertedNominalSubject", "UnboundedCleft"]}, {"neg-constraint": "CleftSubject"}] ] \end{verbatim} This example constraints file has three ranking levels. These levels contain following constraints: \begin{enumerate} \item A negative constraint saying that \verb!dian0Vn1dePassive! should not appear, and a positive one saying that \verb!CanonicalSubject! \emph{should} appear. These constraints appear together only because the author of the example thinks they should have the same rank, not because there is neccesarily any inherent relationship between them. \item A single negative conjunction constraint saying that \verb!InvertedNominalSubject! and \verb!CanonicalSententialObjectFinite! should not appear together. \item A negative conjunction constraint saying tat \verb!InvertedNominalSubject! and \verb!UnboundedCleft! should not appear together; and also a negative constraints saying that \verb!CleftSubject! should not appear. As with the first ranking level, there is no relationship between these two constraints. We just put them on the same level to give them the same rank \end{enumerate} \begin{code}

instance JSON OtConstraint where
 readJSON j =
    do jv <- fromJSObject `fmap` readJSON j
       case lookup "pos-constraint" jv of
        Just v    -> PositiveC `fmap` readJSON v
        Nothing   -> case lookup "neg-constraint" jv of
         Just v   -> NegativeC `fmap` readJSON v
         Nothing  -> case lookup "neg-conj-constraint" jv of
          Just v  -> NegativeConjC `fmap` readJSONs v
          Nothing -> fail $ "Could not read OtConstraint"
 showJSON (PositiveC c) =
     JSObject . toJSObject $ [ ("pos-constraint", showJSON c ) ]
 showJSON (NegativeC c) =
     JSObject . toJSObject $ [ ("neg-constraint", showJSON c ) ]
 showJSON (NegativeConjC cs) =
     JSObject . toJSObject $ [ ("neg-conj-constraint", showJSONs cs ) ]

\end{code} \begin{code}

-- ---------------------------------------------------------------------
-- top level stuff
-- ---------------------------------------------------------------------
otWarnings :: Macros -> OtRanking -> [OtViolation] -> [String]
otWarnings gram ranking blocks =
    addWarning neTraces neTracesW
  . addWarning nvConstraints nvConstraintsW
  $ []
 where
  addWarning xs w = if null xs then id else (w xs :)
  neTracesW xs = "these traces never appear in the grammar: " ++ unwords xs
  neTraces  = nonExistentTraces gram ranking
  nvConstraintsW xs = "these constraints are never violated: " ++ unwords (map prettyConstraint xs)
  nvConstraints = neverViolated blocks ranking

rankResults :: GetTraces -> (a -> B.Derivation) -> OtRanking -> [a] -> [OtResult a]
rankResults getTraces getDerivation r = squish . sortResults . map addViolations
 where
   addViolations x = (x, getViolations x)
   getViolations  = violations (concatRank r) . lexTraces getTraces . getDerivation
   squish         = concat . zipWith applyRank [1..]
   applyRank i    = map (\(x,vs) -> (i,x,vs))

\end{code} \begin{code}

-- ---------------------------------------------------------------------
-- detecting violations
-- ---------------------------------------------------------------------

violations :: [RankedOtConstraint] -> [LexItem] -> [OtViolation]
violations cs ls = posVs ls ++ negVs ls
 where
  negVs  = concatMap (\l -> negViolations cs (lLexname l) (lTraces l))
  posVs  = posViolations cs . concatMap lTraces

-- | A positive constraint is violated when a trace is NOT present
posViolations :: [RankedOtConstraint] -> [String] -> [OtViolation]
posViolations cs ss =
 [ OtViolation "" c | c@(RankedOtConstraint _ (PositiveC s)) <- cs, not (s `elem` ss) ]

-- | A negative constraint is violated when a trace is present
--
--   Note that we will not notice if a constraint is violated more
--   than once.  If you want to count multiple violations, you'll
--   either need to partition the input strings and map this function
--   on each sublist or rewrite this code.
negViolations :: [RankedOtConstraint]
              -> String   -- ^ lex name
              -> [String] -- ^ traces
              -> [OtViolation]
negViolations cs l ss =
 [ OtViolation l c | c@(RankedOtConstraint _ (NegativeC s)) <- cs, s `elem` ss ] ++
 [ OtViolation l c | c@(RankedOtConstraint _ (NegativeConjC xs)) <- cs, all (`elem` ss) xs ]

\end{code} \section{Ranking procedure} \label{sec:ranking-procedure} Generation results are sorted according to their highest-ranking constraint violation (moving on to the next-highest ranking violation and so forth in case of a tie). The best result appears first. \begin{code}

-- | Violations sorted so that the highest ranking constraint
--   (smallest number) goes first
sortedViolations :: (a, [OtViolation]) -> [RankedOtConstraint2]
sortedViolations = map (RankedOtConstraint2 . otConstraintViolated) . sort . snd

-- | Sort the sentences so that the ones with the *lowest*
--   ranking violations (biggest number) go first.
--   Note that we return in groups for the sake of ties.
sortResults :: [(a, [OtViolation])] -> [[(a, [OtViolation])]]
sortResults = sortAndGroupByDecoration compare sortedViolations

lexTraces :: GetTraces -> B.Derivation -> [LexItem]
lexTraces getTraces = map (toLexItem getTraces) . B.lexicalSelection

toLexItem :: GetTraces -> String -> LexItem
toLexItem getTraces t =
 LexItem { lLexname = t
         , lTraces  = getTraces t }

\end{code} \section{Output format} Constraint violations can be outputted as JSON objects as the following example shows \begin{verbatim} { "lex-item": "discuter:n0Vn1pn2:Tn0Vn1pn2-5830:22", , "rank": 6, , "violation": {"neg-constraint": "passiveVerbMorphology"} } \end{verbatim} Positive constraint violations are not associated with any lexical items so the lex-item field is omitted for them. \begin{code}

-- ---------------------------------------------------------------------
-- output
-- ---------------------------------------------------------------------
instance JSON RankedOtConstraint where
 readJSON j =
    do jo <- fromJSObject `fmap` readJSON j
       let field x = maybe (fail $ "Could not find: " ++ x) readJSON
                   $ lookup x jo
       RankedOtConstraint <$> field "rank"
                          <*> field "violation"
 showJSON = JSObject . toJSObject . rankedOtConstraintToPairs

rankedOtConstraintToPairs :: RankedOtConstraint -> [ (String, JSValue) ]
rankedOtConstraintToPairs (RankedOtConstraint r c) =
  [ ("rank", showJSON r), ("violation", showJSON c) ]

instance JSON OtViolation where
 readJSON j =
    do jo <- fromJSObject `fmap` readJSON j
       case lookup "lex-item" jo of
         Nothing -> OtViolation "" <$> readJSON j
         Just l  -> OtViolation <$> readJSON l
                                <*> readJSON j

 showJSON ov = JSObject . toJSObject $ pairs
  where
   pairs = case otLexName ov of
             "" -> basicPairs
             l  -> ("lex-item", showJSON l) : basicPairs
   basicPairs = rankedOtConstraintToPairs (otConstraintViolated ov)

-- ---------------------------------------------------------------------
-- pretty printing
-- ---------------------------------------------------------------------

-- TODO: Return as a pretty Doc
prettyViolations :: GetTraces -> Bool -> [OtViolation] -> String
prettyViolations getTraces noisy vs =
   unlines $ (if null posVs then []  else [ indented 1 75 . showPosVs $ posVs ])
           ++ map showLexVs negBuckets
 where
  (posVs, negVs) = partition (null . otLexName) vs
  negBuckets = buckets otLexName negVs
  --
  showPosVs  = unwords . map (prettyRankedConstraint . otConstraintViolated)
  showLexVs (l,lvs) =
    let itmName = "(" ++ l ++ ")"
        constraints = map otConstraintViolated lvs
        allTraces = indented 4 75 . unwords . getTraces $ l
    in (indented 2 75 . unwords $ itmName : map prettyRankedConstraint constraints)
       ++ (if noisy then "\n" ++ allTraces else "")

prettyRankedConstraint :: RankedOtConstraint -> String
prettyRankedConstraint (RankedOtConstraint r c) = prettyConstraint c ++ " " ++ prettyRank r

prettyConstraint :: OtConstraint -> String
prettyConstraint (PositiveC str) = '+' : str
prettyConstraint (NegativeC str) = '*' : str
prettyConstraint (NegativeConjC strs) = "*(" ++ (concat $ intersperse " & " strs) ++ ")"

prettyRank :: Int -> String
prettyRank r = "(r" ++ show r ++ ")"

-- ---------------------------------------------------------------------
-- detecting impossible constraints or other potential errors
-- ---------------------------------------------------------------------

neverViolated :: [OtViolation] -> [[OtConstraint]] -> [OtConstraint]
neverViolated vs ranking = concat ranking \\ cs_used
 where
  cs_used = nub . map (noRank . otConstraintViolated) $ vs

nonExistentTraces :: Macros -> [[OtConstraint]] -> [String]
nonExistentTraces ms vs = r_traces \\ m_traces
 where
  m_traces = nub $ concatMap ptrace ms
  r_traces = nub $ concatMap cTraces $ concat vs

cTraces :: OtConstraint -> [String]
cTraces (PositiveC c) = [c]
cTraces (NegativeConjC cs) = cs
cTraces (NegativeC c) = [c]

-- ----------------------------------------------------------------------
-- helpers
-- ----------------------------------------------------------------------

concatRank :: [[OtConstraint]] -> [RankedOtConstraint]
concatRank = concat . zipWith rank [1..]
 where
  rank x ys = map (RankedOtConstraint x) ys

noRank :: RankedOtConstraint -> OtConstraint
noRank (RankedOtConstraint _ c) = c

-- ----------------------------------------------------------------------
-- odds and ends
-- ----------------------------------------------------------------------

buckets :: Ord b => (a -> b) -> [a] -> [ (b,[a]) ]
buckets f = map (first head . unzip)
          . groupBy ((==) `on` fst)
          . sortBy (compare `on` fst)
          . map (\x -> (f x, x))

-- | Results are grouped so that ties can be noticed
sortAndGroupByDecoration :: Eq b => (b -> b -> Ordering) -> (a -> b) -> [a] -> [[a]]
sortAndGroupByDecoration cmp f = map (map snd)
                               . groupBy ((==) `on` fst)
                               . sortBy (cmp `on` fst)
                               . map (\x -> (f x, x))

indented :: Int -> Int -> String -> String
indented x len = concat . intersperse "\n" . map (\s -> spaces x ++ s) . unfoldr f
 where
  f ""  = Nothing
  f str = Just $ splitAtBefore len str

spaces :: Int -> String
spaces n = replicate n ' '

splitAtBefore :: Int -> String -> (String, String)
splitAtBefore len xs
  | length xs < len = (xs, "")
  | any isSpace xs  = (begin, trim $ drop (length begin) xs)
  | otherwise       = (xs, "")
 where
  begin
   | length upToSpace > len = upToSpace
   | otherwise = reverse . trim . dropWhile isNotSpace . reverse . take len $ xs
  upToSpace = takeWhile isNotSpace xs
  isNotSpace = not . isSpace
  trim = drop 1

\end{code}