module NLP.GizaPlusPlus.Parsec where import Text.ParserCombinators.Parsec import Text.ParserCombinators.Parsec.Language type OneToManyPair = (String, [Integer]) alignFile :: CharParser () [([String], [OneToManyPair])] alignFile = manyTill alignSentencePair eof alignSentencePair :: CharParser () ([String], [OneToManyPair]) alignSentencePair = do char '#'; manyTill (noneOf "\n") (char '\n') target <- sepEndBy alignWord justSpace char '\n' alignment <- sepEndBy alignWordPair justSpace char '\n' return (target, alignment) alignWordPair :: CharParser () OneToManyPair alignWordPair = do wordFrom <- alignWord justSpace indicesTo <- between lbrack rbrack $ do justSpace option [] $ sepEndBy1 natural justSpace return (wordFrom, indicesTo) where lbrack = string "({" rbrack = string "})" alignWord :: CharParser () String alignWord = many1 (noneOf " \n\t") justSpace :: CharParser () Char justSpace = char ' ' natural :: CharParser () Integer natural = many digit >>= readM -- suggested by John Meacham on the haskell libraries list readM :: (Monad m,Read a) => String -> m a readM s = case reads s of [(x, "")] -> return x _ -> fail "readM: no parse"