module NLP.GizaPlusPlus.Parsec where
import Control.Monad.Fail (MonadFail)
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Language
-- | A word together with the list of natural numbers that 'alignWordPair'
-- reads from the @({ ... })@ group following that word.
-- NOTE(review): the numbers are presumably positions in the other sentence
-- of the pair -- confirm against the GIZA++ output format.
type OneToManyPair = (String, [Integer])
-- | Parse a whole alignment file: a sequence of sentence-pair records
-- (see 'alignSentencePair') that runs until the end of input.
--
-- The end-of-input check is attempted before every record, so an empty
-- input yields an empty list.
alignFile :: CharParser () [([String], [OneToManyPair])]
alignFile = alignSentencePair `manyTill` eof
-- | Parse one three-line sentence-pair record:
--
--   1. a header line that starts with @#@; its contents are discarded;
--   2. a line of space-separated words;
--   3. a line of space-separated word/index-group pairs
--      (see 'alignWordPair').
--
-- Each of the three lines must be terminated by a newline character.
-- Returns the words of line 2 paired with the entries of line 3.
alignSentencePair :: CharParser () ([String], [OneToManyPair])
alignSentencePair = do
  _ <- headerLine
  sentenceWords <- alignWord `sepEndBy` justSpace
  _ <- lineEnd
  wordPairs <- alignWordPair `sepEndBy` justSpace
  _ <- lineEnd
  return (sentenceWords, wordPairs)
  where
    -- A single line-feed character terminates every line of the record.
    lineEnd = char '\n'
    -- The '#' marker followed by everything up to and including the newline.
    headerLine = char '#' >> manyTill (noneOf "\n") lineEnd
-- | Parse a single entry of the form @word ({ n1 n2 ... })@.
--
-- The word is read with 'alignWord', followed by exactly one space, the
-- opening marker @({@, one mandatory space, zero or more space-separated
-- naturals (a trailing space after the last one is accepted), and the
-- closing marker @})@.  An empty group such as @({ })@ yields an empty list.
alignWordPair :: CharParser () OneToManyPair
alignWordPair = do
  headWord <- alignWord
  _ <- justSpace
  _ <- openGroup
  _ <- justSpace
  -- 'sepEndBy' is by definition @sepEndBy1 p sep <|> return []@, i.e. the
  -- "one or more, else empty list" choice spelled out in a single combinator.
  positions <- natural `sepEndBy` justSpace
  _ <- closeGroup
  return (headWord, positions)
  where
    openGroup = string "({"
    closeGroup = string "})"
-- | Parse one word: a non-empty run of characters up to (not including)
-- the next space, newline or tab.
alignWord :: CharParser () String
alignWord = many1 (satisfy isWordChar)
  where
    -- Any character other than the three separators belongs to a word.
    isWordChar c = not (c `elem` " \n\t")
-- | Parse exactly one space character (@' '@) and return it.
-- Tabs and newlines are not accepted, so this parser never crosses a
-- line boundary.
justSpace :: CharParser () Char
justSpace = char ' '
-- | Parse a non-negative decimal integer: one or more digits, with no sign
-- and no surrounding whitespace.
--
-- At least one digit is required ('many1'), so when no number is present
-- the parser fails immediately, without consuming input, and the error
-- says that a digit was expected.  The digit string is then converted
-- with 'readM'.
natural :: CharParser () Integer
natural = many1 digit >>= readM
-- | Read a value from a string inside any failable monad.
--
-- Succeeds only when 'reads' yields exactly one parse that consumes the
-- whole string; otherwise the computation fails via 'fail' with the
-- message @"readM: no parse"@.
--
-- The constraint is 'MonadFail' rather than plain 'Monad' because 'fail'
-- is a method of 'MonadFail' (it is no longer a 'Monad' method on
-- GHC 8.8 and later).
readM :: (MonadFail m, Read a) => String -> m a
readM s = case reads s of
  [(x, "")] -> return x
  _         -> fail "readM: no parse"