-- GenI surface realiser -- Copyright (C) 2005 Carlos Areces and Eric Kow -- -- This program is free software; you can redistribute it and/or -- modify it under the terms of the GNU General Public License -- as published by the Free Software Foundation; either version 2 -- of the License, or (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; if not, write to the Free Software -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. {-# LANGUAGE CPP, FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-unused-do-bind #-} module NLP.GenI.Parser ( -- * Test suites geniTestSuite, geniSemanticInput, geniTestSuiteString, geniDerivations, toSemInputString, -- * Trees geniMacros, geniTagElems, -- * Lexicon and morph geniLexicon, geniMorphInfo, -- * Basics geniFeats, geniSemantics, geniValue, geniWords, -- * Helpers geniWord, geniLanguageDef, tillEof, -- parseFromFile, -- UTF-8 version module Text.Parsec, module Text.Parsec.String, ) where import Control.Applicative ( (<*>), (<$>), (*>), (<*) ) import Control.Monad (liftM, when) import Data.Text ( Text ) import Text.Parsec import Text.Parsec.String hiding ( parseFromFile ) -- TODO: replace with Text.Parsec.Text import Text.Parsec.Language (emptyDef) import Text.Parsec.Token (TokenParser, LanguageDef, commentLine, commentStart, commentEnd, opLetter, reservedOpNames, reservedNames, identLetter, identStart, makeTokenParser) import qualified Data.Map as Map import qualified Data.Text as T import qualified Data.Tree as T import qualified System.IO.UTF8 as UTF8 --import qualified Text.Parsec.Expr as P import qualified Text.Parsec.Token as P import NLP.GenI.FeatureStructure ( Flist, AvPair(..), sortFlist ) import NLP.GenI.General (isGeniIdentLetter) import NLP.GenI.GeniShow ( GeniShow(..), geniKeyword ) import NLP.GenI.GeniVal ( GeniVal, mkGConst, mkGConstNone, mkGVar, mkGAnon, isAnon ) import NLP.GenI.Lexicon ( fromLexSem, mkFullLexEntry, LexEntry(..) ) import NLP.GenI.Pretty ( above ) import NLP.GenI.Semantics ( Literal(..), Sem, sortSem, LitConstr, SemInput ) import NLP.GenI.Tag (TagElem(..), setTidnums) import NLP.GenI.TestSuite ( TestCase(..) ) import NLP.GenI.TreeSchema (SchemaTree, Ttree(..), Ptype(..), GNode(..), GType(..) ) -- import BoolExp import Data.FullList ( FullList, Listable(..) ) -- General notes -- reserved words #define SEMANTICS "semantics" #define SENTENCE "sentence" #define OUTPUT "output" #define TRACE "trace" #define ANCHOR "anchor" #define SUBST "subst" #define FOOT "foot" #define LEX "lex" #define TYPE "type" #define ACONSTR "aconstr" #define INITIAL "initial" #define AUXILIARY "auxiliary" #define IDXCONSTRAINTS "idxconstraints" #define BEGIN "begin" #define END "end" -- Lexer geniLanguageDef :: LanguageDef () geniLanguageDef = emptyDef { commentLine = "%" , commentStart = "/*" , commentEnd = "*/" , opLetter = oneOf "" , reservedOpNames = [""] , reservedNames = [ SEMANTICS , SENTENCE, OUTPUT, IDXCONSTRAINTS, TRACE , ANCHOR , SUBST , FOOT , LEX , TYPE , ACONSTR , INITIAL , AUXILIARY , BEGIN , END ] , identLetter = identStuff , identStart = identStuff } where identStuff = satisfy isGeniIdentLetter geniValue :: Parser GeniVal geniValue = ((try $ anonymous) "_ or ?_") <|> (constants "a constant or atomic disjunction") <|> (variable "a variable") where question = "?" disjunction = geniAtomicDisjunction constants :: Parser GeniVal constants = mkGConst <$> disjunction variable :: Parser GeniVal variable = do symbol question v <- identifier mcs <- option Nothing $ (symbol "/" >> Just `liftM` disjunction) return (mkGVar v mcs) anonymous :: Parser GeniVal anonymous = do optional $ symbol question symbol "_" return mkGAnon geniAtomicDisjunction :: Parser (FullList Text) geniAtomicDisjunction = do (x:xs) <- atom `sepBy1` (symbol "|") return (x !: xs) where atom = looseIdentifier <|> stringLiteral geniFancyDisjunction :: Parser [GeniVal] geniFancyDisjunction = geniValue `sepBy1` symbol ";" class GeniValLike v where geniValueLike :: Parser v instance GeniValLike GeniVal where geniValueLike = geniValue instance GeniValLike [GeniVal] where geniValueLike = geniFancyDisjunction -- We make no attempt to check for / guarantee uniqueness here -- because the same sort of format is used for things which are -- not strictly speaking feature structures geniFeats :: GeniValLike v => Parser (Flist v) geniFeats = option [] $ squares $ many geniAttVal geniAttVal :: GeniValLike v => Parser (AvPair v) geniAttVal = do att <- identifierR "an attribute"; colon val <- geniValueLike "a GenI value" return $ AvPair att val geniSemantics :: Parser Sem geniSemantics = do sem <- many (geniLiteral "a literal") return (sortSem sem) geniLiteral :: Parser (Literal GeniVal) geniLiteral = geniLiteral_ mkGAnon geniValue geniLiteral_ :: a -> Parser a -> Parser (Literal a) geniLiteral_ zero gv = Literal <$> (option zero handleParser "a handle") <*> (gv "a predicate") <*> (parens (many gv) "some parameters") where handleParser = try $ gv <* char ':' geniSemanticInput :: Parser (Sem,Flist GeniVal,[LitConstr]) geniSemanticInput = do keywordSemantics (sem,litC) <- liftM unzip $ squares $ many literalAndConstraint idxC <- option [] geniIdxConstraints -- let sem2 = createHandles sem semlitC2 = [ (s,c) | (s,c) <- zip sem2 litC, (not.null) c ] return (createHandles sem, idxC, semlitC2) where -- set all anonymous handles to some unique value -- this is to simplify checking if a result is -- semantically complete createHandles :: Sem -> Sem createHandles = zipWith setHandle ([1..] :: [Int]) -- setHandle i (Literal h pred_ par) = let h2 = if isAnon h then mkGConstNone ("genihandle" `T.append` T.pack (show i)) else h in Literal h2 pred_ par -- literalAndConstraint :: Parser LitConstr literalAndConstraint = do l <- geniLiteral t <- option [] $ squares $ many identifier return (l,t) -- | The original string representation of the semantics (for gui) geniSemanticInputString :: Parser Text geniSemanticInputString = do keywordSemantics s <- squaresString whiteSpace optional geniIdxConstraints return s geniIdxConstraints :: Parser (Flist GeniVal) geniIdxConstraints = keyword IDXCONSTRAINTS >> geniFeats {- geniLitConstraints :: Parser (BoolExp T.Text) geniLitConstraints = P.buildExpressionParser table piece where piece = (Cond <$> identifier) <|> do { string "~"; Not `liftM` geniLitConstraints } <|> parens geniLitConstraints table = [ [ op "&" And P.AssocLeft ] , [ op "|" Or P.AssocLeft ] ] op s f assoc = P.Infix (do { string s ; return f }) assoc -} squaresString :: Parser Text squaresString = do char '[' s <- (T.concat <$> many inSq) <|> squaresString char ']' return $ "[" `T.append` s `T.append` "]" where inSq :: Parser Text inSq = T.pack <$> many1 (noneOf "[]") -- the output end of things -- displaying preformatted semantic input data SemInputString = SemInputString Text (Flist GeniVal) instance GeniShow SemInputString where geniShowText (SemInputString semStr idxC) = geniKeyword SEMANTICS semStr `above` r where r | null idxC = "" | otherwise = geniKeyword IDXCONSTRAINTS (geniShowText idxC) toSemInputString :: SemInput -> Text -> SemInputString toSemInputString (_,lc,_) s = SemInputString s lc geniTestSuite :: Parser [TestCase] geniTestSuite = tillEof (many geniTestCase) -- | Just the String representations of the semantics -- in the test suite geniTestSuiteString :: Parser [Text] geniTestSuiteString = tillEof (many geniTestCaseString) -- | This is only used by the script genimakesuite geniDerivations :: Parser [TestCaseOutput] geniDerivations = tillEof $ many geniOutput geniTestCase :: Parser TestCase geniTestCase = TestCase <$> (option "" (identifier "a test case name")) <*> lookAhead geniSemanticInputString <*> geniSemanticInput <*> many geniSentence -- note that the keyword is NOT optional type TestCaseOutput = (Text, Map.Map (Text,Text) [Text]) geniOutput :: Parser TestCaseOutput geniOutput = do ws <- keyword OUTPUT >> squares geniWords ds <- Map.fromList <$> many geniTraces return (ws, ds) geniTraces :: Parser ((Text,Text), [Text]) geniTraces = do keyword TRACE squares $ do k1 <- withWhite geniWord k2 <- withWhite geniWord whiteSpace >> char '!' >> whiteSpace traces <- geniWord `sepEndBy1` whiteSpace return ((k1,k2), traces) withWhite :: Parser a -> Parser a withWhite p = p >>= (\a -> whiteSpace >> return a) geniSentence :: Parser Text geniSentence = optional (keyword SENTENCE) >> squares geniWords geniWords :: Parser Text geniWords = T.unwords <$> (sepEndBy1 geniWord whiteSpace "a sentence") geniWord :: Parser Text geniWord = T.pack <$> many1 (noneOf "[]\v\f\t\r\n ") -- | The original string representation of a test case semantics -- (for gui) geniTestCaseString :: Parser Text geniTestCaseString = do option "" (identifier "a test case name") geniSemanticInputString <* (many geniSentence >> many geniOutput) -- ---------------------------------------------------------------------- -- Lexicon -- ---------------------------------------------------------------------- geniLexicon :: Parser [LexEntry] geniLexicon = tillEof $ many1 geniLexicalEntry geniLexicalEntry :: Parser LexEntry geniLexicalEntry = do lemmas <- geniAtomicDisjunction "a lemma (or disjunction thereof)" family <- identifier "a tree family" (pars, interface) <- option ([],[]) $ parens paramsParser equations <- option [] $ do keyword "equations" geniFeats "path equations" filters <- option [] $ do keyword "filters" geniFeats keywordSemantics (sem, pols) <- fromLexSem <$> squares geniLexSemantics -- return (mkFullLexEntry lemmas family pars interface filters equations sem pols) where paramsParser :: Parser ([GeniVal], Flist GeniVal) paramsParser = do pars <- many geniValue "some parameters" interface <- option [] $ do symbol "!" many geniAttVal return (pars, interface) geniLexSemantics :: Parser [Literal PolValue] geniLexSemantics = sortSem <$> many (geniLexLiteral "a literal") type PolValue = (GeniVal, Int) geniLexLiteral :: Parser (Literal PolValue) geniLexLiteral = geniLiteral_ (mkGAnon,0) geniPolValue geniPolValue :: Parser (GeniVal, Int) geniPolValue = do p <- geniPolarity v <- geniValue return (v,p) -- ---------------------------------------------------------------------- -- Tree schemata -- ---------------------------------------------------------------------- geniMacros :: Parser [SchemaTree] geniMacros = tillEof $ many geniTreeDef initType, auxType :: Parser Ptype initType = do { reserved INITIAL ; return Initial } auxType = do { reserved AUXILIARY ; return Auxiliar } geniTreeDef :: Parser SchemaTree geniTreeDef = do sourcePos <- getPosition family <- identifier tname <- option "" (colon *> identifier) (pars,iface) <- geniParams theTtype <- (initType <|> auxType) theTree <- geniTree -- sanity checks? let treeFail x = do setPosition sourcePos -- FIXME does not do what I expect fail $ "In tree " ++ T.unpack family ++ ":" ++ T.unpack tname ++ " " ++ show sourcePos ++ ": " ++ x let theNodes = T.flatten theTree numFeet = length [ x | x <- theNodes, gtype x == Foot ] numAnchors = length [ x | x <- theNodes, ganchor x ] when (not $ any ganchor theNodes) $ treeFail "At least one node in an LTAG tree must be an anchor" when (numAnchors > 1) $ treeFail "There can be no more than 1 anchor node in a tree" when (numFeet > 1) $ treeFail "There can be no more than 1 foot node in a tree" when (theTtype == Initial && numFeet > 0) $ treeFail "Initial trees may not have foot nodes" -- psem <- option Nothing $ do { keywordSemantics; liftM Just (squares geniSemantics) } ptrc <- option [] $ do { keyword TRACE; squares (many identifier) } -- return TT{ params = pars , pfamily = family , pidname = tname , pinterface = sortFlist iface , ptype = theTtype , tree = theTree , ptrace = ptrc , psemantics = psem } geniTree :: (Ord v, GeniValLike v) => Parser (T.Tree (GNode v)) geniTree = do node <- geniNode kids <- option [] (braces $ many geniTree) "child nodes" -- sanity checks let noKidsAllowed t c = when (c node && (not.null $ kids)) $ fail $ t ++ " nodes may *not* have any children" noKidsAllowed "Anchor" $ ganchor noKidsAllowed "Substitution" $ (== Subs) . gtype noKidsAllowed "Foot" $ (== Foot) . gtype -- return (T.Node node kids) geniNode :: (Ord v, GeniValLike v) => Parser (GNode v) geniNode = do name <- identifier nodeType <- geniNodeAnnotation lex_ <- if nodeType == AnnoLexeme then ((stringLiteral <|> identifier) `sepBy` symbol "|") "some lexemes" else return [] constr <- case nodeType of AnnoDefault -> adjConstraintParser AnnoAnchor -> adjConstraintParser _ -> return True -- features only obligatory for non-lex nodes (top,bot) <- if nodeType == AnnoLexeme then option ([],[]) $ try topbotParser else topbotParser return $ GN { gnname = name , gtype = fromAnnotation nodeType , gup = sortFlist top , gdown = sortFlist bot , glexeme = lex_ , ganchor = nodeType == AnnoAnchor , gaconstr = constr , gorigin = "" } where adjConstraintParser = option False $ reserved ACONSTR >> char ':' >> symbol "noadj" >> return True topbotParser = do top <- geniFeats "top features" symbol "!" bot <- geniFeats "bot features" return (top,bot) -- | Should be purely internal type to help parsing. -- Injection to 'GType'. -- -- We don't just use GType directly because the annotations convey -- subtle distinctions that aren't encoded, particularly between -- lexemes and anchors data Annotation = AnnoAnchor | AnnoLexeme | AnnoSubst | AnnoFoot | AnnoDefault deriving Eq fromAnnotation :: Annotation -> GType fromAnnotation AnnoLexeme = Lex fromAnnotation AnnoAnchor = Lex fromAnnotation AnnoSubst = Subs fromAnnotation AnnoFoot = Foot fromAnnotation AnnoDefault = Other geniNodeAnnotation :: Parser Annotation geniNodeAnnotation = (keyword TYPE *> ty) <|> (reserved ANCHOR >> return AnnoAnchor) <|> return AnnoDefault where ty = choice [ try (symbol s) >> return t | (s,t) <- table ] table = [ (ANCHOR, AnnoAnchor) , (FOOT, AnnoFoot) , (SUBST, AnnoSubst) , (LEX, AnnoLexeme) ] -- | This makes it possible to read anchored trees, which may be -- useful for debugging purposes. -- -- FIXME: note that this is very rudimentary; we do not set id numbers, -- parse polarities. You'll have to call -- some of our helper functions if you want that functionality. geniTagElems :: Parser [TagElem] geniTagElems = tillEof $ setTidnums `fmap` many geniTagElem geniTagElem :: Parser TagElem geniTagElem = do family <- identifier tname <- option "" $ (colon *> identifier) iface <- (snd `liftM` geniParams) <|> geniFeats theType <- initType <|> auxType theTree <- geniTree sem <- do { keywordSemantics; squares geniSemantics } -- return $ TE { idname = tname , ttreename = family , tinterface = iface , ttype = theType , ttree = theTree , tsemantics = sem , tidnum = -1 -- provisional id , tpolarities = Map.empty , tsempols = [] , ttrace = [] } -- | 'geniParams' recognises a list of parameters optionally followed by a -- bang (\verb$!$) and a list of attribute-value pairs. This whole thing is -- to wrapped in the parens. -- -- TODO: deprecate geniParams :: Parser ([GeniVal], Flist GeniVal) geniParams = parens $ do pars <- many geniValue "some parameters" interface <- option [] $ do { symbol "!"; many geniAttVal } return (pars, interface) -- ---------------------------------------------------------------------- -- Morphology -- ---------------------------------------------------------------------- geniMorphInfo :: Parser [(Text,Flist GeniVal)] geniMorphInfo = tillEof $ many morphEntry morphEntry :: Parser (Text,Flist GeniVal) morphEntry = (,) <$> identifier <*> geniFeats -- ====================================================================== -- Everything else -- ====================================================================== -- ---------------------------------------------------------------------- -- Polarities -- ---------------------------------------------------------------------- -- | 'geniPolarity' associates a numerical value to a polarity symbol, -- that is, '+' or '-'. geniPolarity :: Parser Int geniPolarity = option 0 (plus <|> minus) where plus = do { char '+'; return 1 } minus = do { char '-'; return (-1) } -- ---------------------------------------------------------------------- -- keyword -- ---------------------------------------------------------------------- {-# INLINE keyword #-} keyword :: Text -> Parser Text keyword k = (try $ do { reserved k; colon; return k }) T.unpack k ++ ":" {-# INLINE keywordSemantics #-} keywordSemantics :: Parser Text keywordSemantics = keyword SEMANTICS -- ---------------------------------------------------------------------- -- language def helpers -- ---------------------------------------------------------------------- lexer :: TokenParser () lexer = makeTokenParser geniLanguageDef whiteSpace :: Parser () whiteSpace = P.whiteSpace lexer identifier :: Parser Text identifier = decode <$> P.identifier lexer -- stolen from Parsec code (ident) -- | Like 'identifier' but allows for reserved words too looseIdentifier :: Parser Text looseIdentifier = decode <$> do { i <- ident ; whiteSpace; return i } where ident = do { c <- identStart geniLanguageDef ; cs <- many (identLetter geniLanguageDef) ; return (c:cs) } "identifier" colon :: Parser Text colon = decode <$> P.colon lexer stringLiteral :: Parser Text stringLiteral = decode <$> P.stringLiteral lexer squares, braces, parens :: Parser a -> Parser a squares = P.squares lexer braces = P.braces lexer parens = P.parens lexer reserved :: Text -> Parser Text reserved s = P.reserved lexer (T.unpack s) >> return s symbol :: Text -> Parser Text symbol s = P.symbol lexer (T.unpack s) >> return s decode :: String -> Text decode = T.pack -- ---------------------------------------------------------------------- -- parsec helpers -- ---------------------------------------------------------------------- -- | identifier, permitting reserved words too identifierR :: Parser Text identifierR = decode <$> do { c <- P.identStart geniLanguageDef ; cs <- many (P.identLetter geniLanguageDef) ; return (c:cs) } "identifier or reserved word" tillEof :: Parser a -> Parser a tillEof p = whiteSpace *> p <* eof -- stolen from Parsec and adapted to use UTF-8 input parseFromFile :: Parser a -> SourceName -> IO (Either ParseError a) parseFromFile p fname = do{ input <- UTF8.readFile fname ; return (parse p fname input) }