-- GenI surface realiser -- Copyright (C) 2005 Carlos Areces and Eric Kow -- -- This program is free software; you can redistribute it and/or -- modify it under the terms of the GNU General Public License -- as published by the Free Software Foundation; either version 2 -- of the License, or (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; if not, write to the Free Software -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. {-# LANGUAGE CPP #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fno-warn-unused-do-bind #-} module NLP.GenI.Parser ( -- * Test suites geniTestSuite, geniSemanticInput, geniTestSuiteString, geniDerivations, -- * Trees geniMacros, geniTagElems, -- * Lexicon and morph geniLexicon, geniMorphInfo, -- * Basics geniFeats, geniSemantics, geniValue, geniWords, -- * Helpers geniWord, geniLanguageDef, tillEof, -- parseFromFile, -- UTF-8 version module Text.Parsec, module Text.Parsec.Text, ) where import Control.Applicative ((*>), (<$>), (<*), (<*>), pure) import Control.Monad (liftM, when) import qualified Data.ByteString as B import Data.Functor.Identity (Identity) import qualified Data.Map as Map import Data.Monoid ((<>)) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T import qualified Data.Tree as T import Text.Parsec import Text.Parsec.Text import Text.Parsec.Token (GenLanguageDef (..), makeTokenParser) import qualified Text.Parsec.Token as P -- import BoolExp import Data.FullList (FullList, Listable (..)) import NLP.GenI.FeatureStructure (AvPair (..), Flist, sortFlist) import NLP.GenI.General (isGeniIdentLetter) import NLP.GenI.GeniShow (GeniShow (..), geniKeyword) import NLP.GenI.GeniVal (GeniVal, SchemaVal (..), isAnon, mkGAnon, mkGConst, mkGConstNone, mkGVar) import NLP.GenI.Lexicon (LexEntry (..), fromLexSem, mkFullLexEntry) import NLP.GenI.Pretty (above) import NLP.GenI.Semantics (LitConstr, Literal (..), Sem, SemInput, sortSem) import NLP.GenI.Tag (TagElem (..), setTidnums) import NLP.GenI.TestSuite (TestCase (..)) import NLP.GenI.TreeSchema (GNode (..), GType (..), Ptype (..), AdjunctionConstraint(..), SchemaTree, Ttree (..)) -- General notes -- reserved words #define SEMANTICS "semantics" #define SENTENCE "sentence" #define OUTPUT "output" #define TRACE "trace" #define ANCHOR "anchor" #define SUBST "subst" #define FOOT "foot" #define LEX "lex" #define TYPE "type" #define ACONSTR "aconstr" #define INITIAL "initial" #define AUXILIARY "auxiliary" #define IDXCONSTRAINTS "idxconstraints" #define BEGIN "begin" #define END "end" -- Lexer geniLanguageDef :: P.GenLanguageDef Text () Identity geniLanguageDef = LanguageDef { commentLine = "%" , commentStart = "/*" , commentEnd = "*/" , nestedComments = True , opStart = opLetter geniLanguageDef , opLetter = oneOf "" , reservedOpNames = [] , reservedNames = [ SEMANTICS , SENTENCE, OUTPUT, IDXCONSTRAINTS, TRACE , ANCHOR , SUBST , FOOT , LEX , TYPE , ACONSTR , INITIAL , AUXILIARY , BEGIN , END ] , identLetter = identStuff , identStart = identStuff , caseSensitive = True } where identStuff = satisfy isGeniIdentLetter geniValue :: Parser GeniVal geniValue = ((try $ anonymous) "_ or ?_") <|> (constants "a constant or atomic disjunction") <|> (variable "a variable") where question = "?" disjunction = geniAtomicDisjunction constants :: Parser GeniVal constants = mkGConst <$> disjunction variable :: Parser GeniVal variable = do symbol question v <- identifier mcs <- option Nothing $ (symbol "/" >> Just `liftM` disjunction) return (mkGVar v mcs) anonymous :: Parser GeniVal anonymous = do optional $ symbol question symbol "_" return mkGAnon geniAtomicDisjunction :: Parser (FullList Text) geniAtomicDisjunction = do (x:xs) <- atom `sepBy1` (symbol "|") return (x !: xs) where atom = looseFlexiIdentifier geniFancyDisjunction :: Parser SchemaVal geniFancyDisjunction = SchemaVal <$> geniValue `sepBy1` symbol ";" class GeniValLike v where geniValueLike :: Parser v instance GeniValLike GeniVal where geniValueLike = geniValue instance GeniValLike SchemaVal where geniValueLike = geniFancyDisjunction -- We make no attempt to check for / guarantee uniqueness here -- because the same sort of format is used for things which are -- not strictly speaking feature structures geniFeats :: GeniValLike v => Parser (Flist v) geniFeats = option [] $ squares $ many geniAttVal geniAttVal :: GeniValLike v => Parser (AvPair v) geniAttVal = do att <- identifierR "an attribute"; colon val <- geniValueLike "a GenI value" return $ AvPair att val geniSemantics :: Parser Sem geniSemantics = do sem <- many (geniLiteral "a literal") return (sortSem sem) geniLiteral :: Parser (Literal GeniVal) geniLiteral = geniLiteral_ mkGAnon geniValue geniLiteral_ :: a -> Parser a -> Parser (Literal a) geniLiteral_ zero gv = Literal <$> (option zero handleParser "a handle") <*> (gv "a predicate") <*> (parens (many gv) "some parameters") where handleParser = try $ gv <* char ':' geniSemanticInput :: Parser (Sem,Flist GeniVal,[LitConstr]) geniSemanticInput = do keywordSemantics (sem,litC) <- liftM unzip $ squares $ many literalAndConstraint idxC <- option [] geniIdxConstraints -- let sem2 = createHandles sem semlitC2 = [ (s,c) | (s,c) <- zip sem2 litC, (not.null) c ] return (createHandles sem, idxC, semlitC2) where -- set all anonymous handles to some unique value -- this is to simplify checking if a result is -- semantically complete createHandles :: Sem -> Sem createHandles = zipWith setHandle ([1..] :: [Int]) -- setHandle i (Literal h pred_ par) = let h2 = if isAnon h then mkGConstNone ("genihandle" `T.append` T.pack (show i)) else h in Literal h2 pred_ par -- literalAndConstraint :: Parser LitConstr literalAndConstraint = do l <- geniLiteral t <- option [] $ squares $ many identifier return (l,t) -- | The original string representation of the semantics (for gui) geniSemanticInputString :: Parser Text geniSemanticInputString = do keywordSemantics s <- squaresString whiteSpace xs <- option [] geniIdxConstraints return (spitBack s xs) where -- this is a bit embarassing spitBack semStr idxC = geniKeyword SEMANTICS semStr `above` r where r | null idxC = "" | otherwise = geniKeyword IDXCONSTRAINTS (geniShowText idxC) geniIdxConstraints :: Parser (Flist GeniVal) geniIdxConstraints = keyword IDXCONSTRAINTS >> geniFeats {- geniLitConstraints :: Parser (BoolExp T.Text) geniLitConstraints = P.buildExpressionParser table piece where piece = (Cond <$> identifier) <|> do { string "~"; Not `liftM` geniLitConstraints } <|> parens geniLitConstraints table = [ [ op "&" And P.AssocLeft ] , [ op "|" Or P.AssocLeft ] ] op s f assoc = P.Infix (do { string s ; return f }) assoc -} squaresString :: Parser Text squaresString = between (char '[') (char ']') $ do xs <- many1 (nonSq <|> squaresString) return $ "[" <> T.concat xs <> "]" where nonSq :: Parser Text nonSq = T.pack <$> many1 (noneOf "[]") -- the output end of things -- displaying preformatted semantic input geniTestSuite :: Parser [TestCase SemInput] geniTestSuite = tillEof (many geniTestCase) -- | Just the String representations of the semantics -- in the test suite geniTestSuiteString :: Parser [Text] geniTestSuiteString = tillEof (many geniTestCaseString) -- | This is only used by the script genimakesuite geniDerivations :: Parser [TestCaseOutput] geniDerivations = tillEof $ many geniOutput geniTestCase :: Parser (TestCase SemInput) geniTestCase = TestCase <$> (option "" (flexiIdentifier "a test case name")) <*> lookAhead geniSemanticInputString <*> geniSemanticInput <*> many geniSentence <*> pure Nothing -- note that the keyword is NOT optional type TestCaseOutput = (Text, Map.Map (Text,Text) [Text]) geniOutput :: Parser TestCaseOutput geniOutput = do ws <- keyword OUTPUT >> squares geniWords ds <- Map.fromList <$> many geniTraces return (ws, ds) geniTraces :: Parser ((Text,Text), [Text]) geniTraces = do keyword TRACE squares $ do k1 <- withWhite geniWord k2 <- withWhite geniWord whiteSpace >> char '!' >> whiteSpace traces <- geniWord `sepEndBy1` whiteSpace return ((k1,k2), traces) withWhite :: Parser a -> Parser a withWhite p = p >>= (\a -> whiteSpace >> return a) geniSentence :: Parser Text geniSentence = optional (keyword SENTENCE) >> squares geniWords geniWords :: Parser Text geniWords = T.unwords <$> (sepEndBy1 geniWord whiteSpace "a sentence") geniWord :: Parser Text geniWord = T.pack <$> many1 (noneOf "[]\v\f\t\r\n ") -- | The original string representation of a test case semantics -- (for gui) geniTestCaseString :: Parser Text geniTestCaseString = do option "" (flexiIdentifier "a test case name") geniSemanticInputString <* (many geniSentence >> many geniOutput) -- ---------------------------------------------------------------------- -- Lexicon -- ---------------------------------------------------------------------- geniLexicon :: Parser [LexEntry] geniLexicon = tillEof $ many1 geniLexicalEntry geniLexicalEntry :: Parser LexEntry geniLexicalEntry = do lemmas <- geniAtomicDisjunction "a lemma (or disjunction thereof)" family <- identifier "a tree family" (pars, interface) <- option ([],[]) $ parens paramsParser equations <- option [] $ do keyword "equations" geniFeats "path equations" filters <- option [] $ do keyword "filters" geniFeats keywordSemantics (sem, pols) <- fromLexSem <$> squares geniLexSemantics -- return (mkFullLexEntry lemmas family pars interface filters equations sem pols) where paramsParser :: Parser ([GeniVal], Flist GeniVal) paramsParser = do pars <- many geniValue "some parameters" interface <- option [] $ do symbol "!" many geniAttVal return (pars, interface) geniLexSemantics :: Parser [Literal PolValue] geniLexSemantics = sortSem <$> many (geniLexLiteral "a literal") type PolValue = (GeniVal, Int) geniLexLiteral :: Parser (Literal PolValue) geniLexLiteral = geniLiteral_ (mkGAnon,0) geniPolValue geniPolValue :: Parser (GeniVal, Int) geniPolValue = do p <- geniPolarity v <- geniValue return (v,p) -- ---------------------------------------------------------------------- -- Tree schemata -- ---------------------------------------------------------------------- geniMacros :: Parser [SchemaTree] geniMacros = tillEof $ many geniTreeDef initType, auxType :: Parser Ptype initType = do { reserved INITIAL ; return Initial } auxType = do { reserved AUXILIARY ; return Auxiliar } geniTreeDef :: Parser SchemaTree geniTreeDef = do sourcePos <- getPosition family <- identifier tname <- option "" (colon *> identifier) (pars,iface) <- geniParams theTtype <- (initType <|> auxType) theTree <- geniTree -- sanity checks? let treeFail x = do setPosition sourcePos -- FIXME does not do what I expect fail $ "In tree " ++ T.unpack family ++ ":" ++ T.unpack tname ++ " " ++ show sourcePos ++ ": " ++ x let theNodes = T.flatten theTree numFeet = length [ x | x <- theNodes, gtype x == Foot ] numAnchors = length [ x | x <- theNodes, ganchor x ] when (not $ any ganchor theNodes) $ treeFail "At least one node in an LTAG tree must be an anchor" when (numAnchors > 1) $ treeFail "There can be no more than 1 anchor node in a tree" when (numFeet > 1) $ treeFail "There can be no more than 1 foot node in a tree" when (theTtype == Initial && numFeet > 0) $ treeFail "Initial trees may not have foot nodes" -- psem <- option Nothing $ do { keywordSemantics; liftM Just (squares geniSemantics) } ptrc <- option [] $ do { keyword TRACE; squares (many identifier) } -- return TT{ params = pars , pfamily = family , pidname = tname , pinterface = sortFlist iface , ptype = theTtype , tree = theTree , ptrace = ptrc , psemantics = psem } geniTree :: (Ord v, GeniValLike v) => Parser (T.Tree (GNode v)) geniTree = do node <- geniNode kids <- option [] (braces $ many geniTree) "child nodes" -- sanity checks let noKidsAllowed t c = when (c node && (not.null $ kids)) $ fail $ t ++ " nodes may *not* have any children" noKidsAllowed "Anchor" $ ganchor noKidsAllowed "Substitution" $ (== Subs) . gtype noKidsAllowed "Foot" $ (== Foot) . gtype -- return (T.Node node kids) geniNode :: (Ord v, GeniValLike v) => Parser (GNode v) geniNode = do name <- identifier nodeType <- geniNodeAnnotation lex_ <- if nodeType == AnnoLexeme then (flexiIdentifier `sepBy` symbol "|") "some lexemes" else return [] constr <- case nodeType of AnnoDefault -> adjConstraintParser AnnoAnchor -> adjConstraintParser _ -> return ExplicitNoAdj -- maybe InferredNoAdj instead? -- features only obligatory for non-lex nodes (top,bot) <- if nodeType == AnnoLexeme then option ([],[]) $ try topbotParser else topbotParser return $ GN { gnname = name , gtype = fromAnnotation nodeType , gup = sortFlist top , gdown = sortFlist bot , glexeme = lex_ , ganchor = nodeType == AnnoAnchor , gaconstr = constr , gorigin = "" } where adjConstraintParser = option MaybeAdj $ reserved ACONSTR >> char ':' >> symbol "noadj" >> return ExplicitNoAdj topbotParser = do top <- geniFeats "top features" symbol "!" bot <- geniFeats "bot features" return (top,bot) -- | Should be purely internal type to help parsing. -- Injection to 'GType'. -- -- We don't just use GType directly because the annotations convey -- subtle distinctions that aren't encoded, particularly between -- lexemes and anchors data Annotation = AnnoAnchor | AnnoLexeme | AnnoSubst | AnnoFoot | AnnoDefault deriving Eq fromAnnotation :: Annotation -> GType fromAnnotation AnnoLexeme = Lex fromAnnotation AnnoAnchor = Lex fromAnnotation AnnoSubst = Subs fromAnnotation AnnoFoot = Foot fromAnnotation AnnoDefault = Other geniNodeAnnotation :: Parser Annotation geniNodeAnnotation = (keyword TYPE *> ty) <|> (reserved ANCHOR >> return AnnoAnchor) <|> return AnnoDefault where ty = choice [ try (symbol s) >> return t | (s,t) <- table ] table = [ (ANCHOR, AnnoAnchor) , (FOOT, AnnoFoot) , (SUBST, AnnoSubst) , (LEX, AnnoLexeme) ] -- | This makes it possible to read anchored trees, which may be -- useful for debugging purposes. -- -- FIXME: note that this is very rudimentary; we do not set id numbers, -- parse polarities. You'll have to call -- some of our helper functions if you want that functionality. geniTagElems :: Parser [TagElem] geniTagElems = tillEof $ setTidnums `fmap` many geniTagElem geniTagElem :: Parser TagElem geniTagElem = do family <- identifier tname <- option "" $ (colon *> identifier) iface <- (snd `liftM` geniParams) <|> geniFeats theType <- initType <|> auxType theTree <- geniTree sem <- do { keywordSemantics; squares geniSemantics } -- return $ TE { idname = tname , ttreename = family , tinterface = iface , ttype = theType , ttree = theTree , tsemantics = sem , tidnum = -1 -- provisional id , tpolarities = Map.empty , tsempols = [] , ttrace = [] } -- | 'geniParams' recognises a list of parameters optionally followed by a -- bang (\verb$!$) and a list of attribute-value pairs. This whole thing is -- to wrapped in the parens. -- -- TODO: deprecate geniParams :: Parser ([GeniVal], Flist GeniVal) geniParams = parens $ do pars <- many geniValue "some parameters" interface <- option [] $ do { symbol "!"; many geniAttVal } return (pars, interface) -- ---------------------------------------------------------------------- -- Morphology -- ---------------------------------------------------------------------- geniMorphInfo :: Parser [(Text,Flist GeniVal)] geniMorphInfo = tillEof $ many morphEntry morphEntry :: Parser (Text,Flist GeniVal) morphEntry = (,) <$> identifier <*> geniFeats -- ====================================================================== -- Everything else -- ====================================================================== -- ---------------------------------------------------------------------- -- Polarities -- ---------------------------------------------------------------------- -- | 'geniPolarity' associates a numerical value to a polarity symbol, -- that is, '+' or '-'. geniPolarity :: Parser Int geniPolarity = option 0 (plus <|> minus) where plus = do { char '+'; return 1 } minus = do { char '-'; return (-1) } -- ---------------------------------------------------------------------- -- keyword -- ---------------------------------------------------------------------- {-# INLINE keyword #-} keyword :: Text -> Parser Text keyword k = (try $ do { reserved k; colon; return k }) T.unpack k ++ ":" {-# INLINE keywordSemantics #-} keywordSemantics :: Parser Text keywordSemantics = keyword SEMANTICS -- ---------------------------------------------------------------------- -- language def helpers -- ---------------------------------------------------------------------- lexer :: P.GenTokenParser Text () Identity lexer = makeTokenParser geniLanguageDef whiteSpace :: Parser () whiteSpace = P.whiteSpace lexer identifier :: Parser Text identifier = decode <$> P.identifier lexer -- | Like 'identifier', but also accepts string literals flexiIdentifier :: Parser Text flexiIdentifier = stringLiteral <|> identifier -- stolen from Parsec code (ident) -- | Like 'identifier' but allows for reserved words too looseIdentifier :: Parser Text looseIdentifier = decode <$> do { i <- ident ; whiteSpace; return i } where ident = do { c <- identStart geniLanguageDef ; cs <- many (identLetter geniLanguageDef) ; return (c:cs) } "identifier" -- | Accepts: identifiers, bare reserved words, and string literals looseFlexiIdentifier :: Parser Text looseFlexiIdentifier = looseIdentifier <|> stringLiteral colon :: Parser Text colon = decode <$> P.colon lexer stringLiteral :: Parser Text stringLiteral = decode <$> P.stringLiteral lexer squares, braces, parens :: Parser a -> Parser a squares = P.squares lexer braces = P.braces lexer parens = P.parens lexer reserved :: Text -> Parser Text reserved s = P.reserved lexer (T.unpack s) >> return s symbol :: Text -> Parser Text symbol s = P.symbol lexer (T.unpack s) >> return s decode :: String -> Text decode = T.pack -- ---------------------------------------------------------------------- -- parsec helpers -- ---------------------------------------------------------------------- -- | identifier, permitting reserved words too identifierR :: Parser Text identifierR = decode <$> do { c <- P.identStart geniLanguageDef ; cs <- many (P.identLetter geniLanguageDef) ; return (c:cs) } "identifier or reserved word" tillEof :: Parser a -> Parser a tillEof p = whiteSpace *> p <* eof -- stolen from Parsec and adapted to use UTF-8 input parseFromFile :: Parser a -> SourceName -> IO (Either ParseError a) parseFromFile p fname = do { input <- T.decodeUtf8 <$> B.readFile fname ; return (parse p fname input) }