module NLP.GenI.Parser (
geniTestSuite, geniSemanticInput, geniTestSuiteString,
geniDerivations,
toSemInputString,
geniMacros, geniTagElems,
geniLexicon, geniMorphInfo,
geniFeats, geniSemantics, geniValue, geniWords,
geniWord, geniLanguageDef, tillEof,
parseFromFile,
module Text.Parsec,
module Text.Parsec.String,
) where
import Control.Applicative ( (<*>), (<$>), (*>), (<*) )
import Control.Monad (liftM, when)
import Data.Text ( Text )
import Text.Parsec
import Text.Parsec.String hiding ( parseFromFile )
import Text.Parsec.Language (emptyDef)
import Text.Parsec.Token (TokenParser,
LanguageDef,
commentLine, commentStart, commentEnd, opLetter,
reservedOpNames, reservedNames, identLetter, identStart,
makeTokenParser)
import qualified Data.Map as Map
import qualified Data.Text as T
import qualified Data.Tree as T
import qualified System.IO.UTF8 as UTF8
import qualified Text.Parsec.Token as P
import NLP.GenI.FeatureStructure ( Flist, AvPair(..), sortFlist )
import NLP.GenI.General (isGeniIdentLetter)
import NLP.GenI.GeniShow ( GeniShow(..), geniKeyword )
import NLP.GenI.GeniVal ( GeniVal, mkGConst, mkGConstNone, mkGVar, mkGAnon, isAnon )
import NLP.GenI.Lexicon ( fromLexSem, mkFullLexEntry, LexEntry(..) )
import NLP.GenI.Pretty ( above )
import NLP.GenI.Semantics ( Literal(..), Sem, sortSem, LitConstr, SemInput )
import NLP.GenI.Tag (TagElem(..), setTidnums)
import NLP.GenI.TestSuite ( TestCase(..) )
import NLP.GenI.TreeSchema (SchemaTree, Ttree(..), Ptype(..), GNode(..), GType(..) )
import Data.FullList ( FullList, Listable(..) )
#define SEMANTICS "semantics"
#define SENTENCE "sentence"
#define OUTPUT "output"
#define TRACE "trace"
#define ANCHOR "anchor"
#define SUBST "subst"
#define FOOT "foot"
#define LEX "lex"
#define TYPE "type"
#define ACONSTR "aconstr"
#define INITIAL "initial"
#define AUXILIARY "auxiliary"
#define IDXCONSTRAINTS "idxconstraints"
#define BEGIN "begin"
#define END "end"
geniLanguageDef :: LanguageDef ()
geniLanguageDef = emptyDef
{ commentLine = "%"
, commentStart = "/*"
, commentEnd = "*/"
, opLetter = oneOf ""
, reservedOpNames = [""]
, reservedNames =
[ SEMANTICS , SENTENCE, OUTPUT, IDXCONSTRAINTS, TRACE
, ANCHOR , SUBST , FOOT , LEX , TYPE , ACONSTR
, INITIAL , AUXILIARY
, BEGIN , END ]
, identLetter = identStuff
, identStart = identStuff
}
where identStuff = satisfy isGeniIdentLetter
geniValue :: Parser GeniVal
geniValue = ((try $ anonymous) <?> "_ or ?_")
<|> (constants <?> "a constant or atomic disjunction")
<|> (variable <?> "a variable")
where
question = "?"
disjunction = geniAtomicDisjunction
constants :: Parser GeniVal
constants = mkGConst <$> disjunction
variable :: Parser GeniVal
variable =
do symbol question
v <- identifier
mcs <- option Nothing $ (symbol "/" >> Just `liftM` disjunction)
return (mkGVar v mcs)
anonymous :: Parser GeniVal
anonymous =
do optional $ symbol question
symbol "_"
return mkGAnon
geniAtomicDisjunction :: Parser (FullList Text)
geniAtomicDisjunction = do
(x:xs) <- atom `sepBy1` (symbol "|")
return (x !: xs)
where
atom = looseIdentifier <|> stringLiteral
geniFancyDisjunction :: Parser [GeniVal]
geniFancyDisjunction = geniValue `sepBy1` symbol ";"
class GeniValLike v where
geniValueLike :: Parser v
instance GeniValLike GeniVal where
geniValueLike = geniValue
instance GeniValLike [GeniVal] where
geniValueLike = geniFancyDisjunction
geniFeats :: GeniValLike v => Parser (Flist v)
geniFeats = option [] $ squares $ many geniAttVal
geniAttVal :: GeniValLike v => Parser (AvPair v)
geniAttVal = do
att <- identifierR <?> "an attribute"; colon
val <- geniValueLike <?> "a GenI value"
return $ AvPair att val
geniSemantics :: Parser Sem
geniSemantics =
do sem <- many (geniLiteral <?> "a literal")
return (sortSem sem)
geniLiteral :: Parser (Literal GeniVal)
geniLiteral = geniLiteral_ mkGAnon geniValue
geniLiteral_ :: a -> Parser a -> Parser (Literal a)
geniLiteral_ zero gv =
Literal <$> (option zero handleParser <?> "a handle")
<*> (gv <?> "a predicate")
<*> (parens (many gv) <?> "some parameters")
where
handleParser = try $ gv <* char ':'
geniSemanticInput :: Parser (Sem,Flist GeniVal,[LitConstr])
geniSemanticInput =
do keywordSemantics
(sem,litC) <- liftM unzip $ squares $ many literalAndConstraint
idxC <- option [] geniIdxConstraints
let sem2 = createHandles sem
semlitC2 = [ (s,c) | (s,c) <- zip sem2 litC, (not.null) c ]
return (createHandles sem, idxC, semlitC2)
where
createHandles :: Sem -> Sem
createHandles = zipWith setHandle ([1..] :: [Int])
setHandle i (Literal h pred_ par) =
let h2 = if isAnon h
then mkGConstNone ("genihandle" `T.append` T.pack (show i))
else h
in Literal h2 pred_ par
literalAndConstraint :: Parser LitConstr
literalAndConstraint =
do l <- geniLiteral
t <- option [] $ squares $ many identifier
return (l,t)
geniSemanticInputString :: Parser Text
geniSemanticInputString = do
keywordSemantics
s <- squaresString
whiteSpace
optional geniIdxConstraints
return s
geniIdxConstraints :: Parser (Flist GeniVal)
geniIdxConstraints = keyword IDXCONSTRAINTS >> geniFeats
squaresString :: Parser Text
squaresString = do
char '['
s <- (T.concat <$> many inSq) <|> squaresString
char ']'
return $ "[" `T.append` s `T.append` "]"
where
inSq :: Parser Text
inSq = T.pack <$> many1 (noneOf "[]")
data SemInputString = SemInputString Text (Flist GeniVal)
instance GeniShow SemInputString where
geniShowText (SemInputString semStr idxC) =
geniKeyword SEMANTICS semStr `above` r
where
r | null idxC = ""
| otherwise = geniKeyword IDXCONSTRAINTS (geniShowText idxC)
toSemInputString :: SemInput -> Text -> SemInputString
toSemInputString (_,lc,_) s = SemInputString s lc
geniTestSuite :: Parser [TestCase]
geniTestSuite =
tillEof (many geniTestCase)
geniTestSuiteString :: Parser [Text]
geniTestSuiteString =
tillEof (many geniTestCaseString)
geniDerivations :: Parser [TestCaseOutput]
geniDerivations = tillEof $ many geniOutput
geniTestCase :: Parser TestCase
geniTestCase =
TestCase <$> (option "" (identifier <?> "a test case name"))
<*> lookAhead geniSemanticInputString
<*> geniSemanticInput
<*> many geniSentence
type TestCaseOutput = (Text, Map.Map (Text,Text) [Text])
geniOutput :: Parser TestCaseOutput
geniOutput = do
ws <- keyword OUTPUT >> squares geniWords
ds <- Map.fromList <$> many geniTraces
return (ws, ds)
geniTraces :: Parser ((Text,Text), [Text])
geniTraces = do
keyword TRACE
squares $ do
k1 <- withWhite geniWord
k2 <- withWhite geniWord
whiteSpace >> char '!' >> whiteSpace
traces <- geniWord `sepEndBy1` whiteSpace
return ((k1,k2), traces)
withWhite :: Parser a -> Parser a
withWhite p = p >>= (\a -> whiteSpace >> return a)
geniSentence :: Parser Text
geniSentence = optional (keyword SENTENCE) >> squares geniWords
geniWords :: Parser Text
geniWords =
T.unwords <$> (sepEndBy1 geniWord whiteSpace <?> "a sentence")
geniWord :: Parser Text
geniWord = T.pack <$> many1 (noneOf "[]\v\f\t\r\n ")
geniTestCaseString :: Parser Text
geniTestCaseString = do
option "" (identifier <?> "a test case name")
geniSemanticInputString <* (many geniSentence >> many geniOutput)
geniLexicon :: Parser [LexEntry]
geniLexicon = tillEof $ many1 geniLexicalEntry
geniLexicalEntry :: Parser LexEntry
geniLexicalEntry =
do lemmas <- geniAtomicDisjunction <?> "a lemma (or disjunction thereof)"
family <- identifier <?> "a tree family"
(pars, interface) <- option ([],[]) $ parens paramsParser
equations <- option [] $ do keyword "equations"
geniFeats <?> "path equations"
filters <- option [] $ do keyword "filters"
geniFeats
keywordSemantics
(sem, pols) <- fromLexSem <$> squares geniLexSemantics
return (mkFullLexEntry lemmas family pars interface filters equations sem pols)
where
paramsParser :: Parser ([GeniVal], Flist GeniVal)
paramsParser = do
pars <- many geniValue <?> "some parameters"
interface <- option [] $ do symbol "!"
many geniAttVal
return (pars, interface)
geniLexSemantics :: Parser [Literal PolValue]
geniLexSemantics = sortSem <$> many (geniLexLiteral <?> "a literal")
type PolValue = (GeniVal, Int)
geniLexLiteral :: Parser (Literal PolValue)
geniLexLiteral = geniLiteral_ (mkGAnon,0) geniPolValue
geniPolValue :: Parser (GeniVal, Int)
geniPolValue =
do p <- geniPolarity
v <- geniValue
return (v,p)
geniMacros :: Parser [SchemaTree]
geniMacros = tillEof $ many geniTreeDef
initType, auxType :: Parser Ptype
initType = do { reserved INITIAL ; return Initial }
auxType = do { reserved AUXILIARY ; return Auxiliar }
geniTreeDef :: Parser SchemaTree
geniTreeDef =
do sourcePos <- getPosition
family <- identifier
tname <- option "" (colon *> identifier)
(pars,iface) <- geniParams
theTtype <- (initType <|> auxType)
theTree <- geniTree
let treeFail x =
do setPosition sourcePos
fail $ "In tree " ++ T.unpack family ++ ":"
++ T.unpack tname ++ " " ++ show sourcePos ++ ": " ++ x
let theNodes = T.flatten theTree
numFeet = length [ x | x <- theNodes, gtype x == Foot ]
numAnchors = length [ x | x <- theNodes, ganchor x ]
when (not $ any ganchor theNodes) $
treeFail "At least one node in an LTAG tree must be an anchor"
when (numAnchors > 1) $
treeFail "There can be no more than 1 anchor node in a tree"
when (numFeet > 1) $
treeFail "There can be no more than 1 foot node in a tree"
when (theTtype == Initial && numFeet > 0) $
treeFail "Initial trees may not have foot nodes"
psem <- option Nothing $ do { keywordSemantics; liftM Just (squares geniSemantics) }
ptrc <- option [] $ do { keyword TRACE; squares (many identifier) }
return TT{ params = pars
, pfamily = family
, pidname = tname
, pinterface = sortFlist iface
, ptype = theTtype
, tree = theTree
, ptrace = ptrc
, psemantics = psem
}
geniTree :: (Ord v, GeniValLike v) => Parser (T.Tree (GNode v))
geniTree =
do node <- geniNode
kids <- option [] (braces $ many geniTree)
<?> "child nodes"
let noKidsAllowed t c = when (c node && (not.null $ kids)) $
fail $ t ++ " nodes may *not* have any children"
noKidsAllowed "Anchor" $ ganchor
noKidsAllowed "Substitution" $ (== Subs) . gtype
noKidsAllowed "Foot" $ (== Foot) . gtype
return (T.Node node kids)
geniNode :: (Ord v, GeniValLike v) => Parser (GNode v)
geniNode = do
name <- identifier
nodeType <- geniNodeAnnotation
lex_ <- if nodeType == AnnoLexeme
then ((stringLiteral <|> identifier) `sepBy` symbol "|") <?> "some lexemes"
else return []
constr <- case nodeType of
AnnoDefault -> adjConstraintParser
AnnoAnchor -> adjConstraintParser
_ -> return True
(top,bot) <- if nodeType == AnnoLexeme
then option ([],[]) $ try topbotParser
else topbotParser
return $ GN { gnname = name
, gtype = fromAnnotation nodeType
, gup = sortFlist top
, gdown = sortFlist bot
, glexeme = lex_
, ganchor = nodeType == AnnoAnchor
, gaconstr = constr
, gorigin = ""
}
where
adjConstraintParser = option False $ reserved ACONSTR >> char ':' >> symbol "noadj" >> return True
topbotParser = do
top <- geniFeats <?> "top features"
symbol "!"
bot <- geniFeats <?> "bot features"
return (top,bot)
data Annotation = AnnoAnchor
| AnnoLexeme
| AnnoSubst
| AnnoFoot
| AnnoDefault
deriving Eq
fromAnnotation :: Annotation -> GType
fromAnnotation AnnoLexeme = Lex
fromAnnotation AnnoAnchor = Lex
fromAnnotation AnnoSubst = Subs
fromAnnotation AnnoFoot = Foot
fromAnnotation AnnoDefault = Other
geniNodeAnnotation :: Parser Annotation
geniNodeAnnotation =
(keyword TYPE *> ty) <|>
(reserved ANCHOR >> return AnnoAnchor) <|>
return AnnoDefault
where
ty = choice [ try (symbol s) >> return t | (s,t) <- table ]
table =
[ (ANCHOR, AnnoAnchor)
, (FOOT, AnnoFoot)
, (SUBST, AnnoSubst)
, (LEX, AnnoLexeme)
]
geniTagElems :: Parser [TagElem]
geniTagElems = tillEof $ setTidnums `fmap` many geniTagElem
geniTagElem :: Parser TagElem
geniTagElem = do
family <- identifier
tname <- option "" $ (colon *> identifier)
iface <- (snd `liftM` geniParams) <|> geniFeats
theType <- initType <|> auxType
theTree <- geniTree
sem <- do { keywordSemantics; squares geniSemantics }
return $ TE { idname = tname
, ttreename = family
, tinterface = iface
, ttype = theType
, ttree = theTree
, tsemantics = sem
, tidnum = 1
, tpolarities = Map.empty
, tsempols = []
, ttrace = []
}
geniParams :: Parser ([GeniVal], Flist GeniVal)
geniParams = parens $ do
pars <- many geniValue <?> "some parameters"
interface <- option [] $ do { symbol "!"; many geniAttVal }
return (pars, interface)
geniMorphInfo :: Parser [(Text,Flist GeniVal)]
geniMorphInfo = tillEof $ many morphEntry
morphEntry :: Parser (Text,Flist GeniVal)
morphEntry = (,) <$> identifier <*> geniFeats
geniPolarity :: Parser Int
geniPolarity = option 0 (plus <|> minus)
where
plus = do { char '+'; return 1 }
minus = do { char '-'; return (1) }
keyword :: Text -> Parser Text
keyword k =
(try $ do { reserved k; colon; return k }) <?> T.unpack k ++ ":"
keywordSemantics :: Parser Text
keywordSemantics = keyword SEMANTICS
lexer :: TokenParser ()
lexer = makeTokenParser geniLanguageDef
whiteSpace :: Parser ()
whiteSpace = P.whiteSpace lexer
identifier :: Parser Text
identifier = decode <$> P.identifier lexer
looseIdentifier :: Parser Text
looseIdentifier =
decode <$> do { i <- ident ; whiteSpace; return i }
where
ident = do
{ c <- identStart geniLanguageDef
; cs <- many (identLetter geniLanguageDef)
; return (c:cs) } <?> "identifier"
colon :: Parser Text
colon = decode <$> P.colon lexer
stringLiteral :: Parser Text
stringLiteral = decode <$> P.stringLiteral lexer
squares, braces, parens :: Parser a -> Parser a
squares = P.squares lexer
braces = P.braces lexer
parens = P.parens lexer
reserved :: Text -> Parser Text
reserved s = P.reserved lexer (T.unpack s) >> return s
symbol :: Text -> Parser Text
symbol s = P.symbol lexer (T.unpack s) >> return s
decode :: String -> Text
decode = T.pack
identifierR :: Parser Text
identifierR = decode <$> do
{ c <- P.identStart geniLanguageDef
; cs <- many (P.identLetter geniLanguageDef)
; return (c:cs)
} <?> "identifier or reserved word"
tillEof :: Parser a -> Parser a
tillEof p = whiteSpace *> p <* eof
parseFromFile :: Parser a -> SourceName -> IO (Either ParseError a)
parseFromFile p fname
= do{ input <- UTF8.readFile fname
; return (parse p fname input)
}