-- NLP/GizaPlusPlus/Parsec.hs

module NLP.GizaPlusPlus.Parsec where

import Control.Monad.Fail (MonadFail)

import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Parsec.Language

-- | A word together with the list of integers parsed from the
-- @({ ... })@ group that follows it (see 'alignWordPair').
-- NOTE(review): the integers are presumably positions in the other
-- sentence of the pair -- confirm against the producer of the file.
type OneToManyPair = (String, [Integer])

-- | Parse a whole alignment file: zero or more sentence pairs, read
-- one after another until end of input is reached.
alignFile :: CharParser () [([String], [OneToManyPair])]
alignFile = alignSentencePair `manyTill` eof

-- | Parse one three-line record:
--
--   1. a header line starting with @#@, whose contents are discarded;
--   2. a line of space-separated words;
--   3. a line of space-separated word\/index-group pairs.
--
-- Every line must be terminated by a newline character.  The result is
-- the word list from line 2 paired with the pair list from line 3.
alignSentencePair :: CharParser () ([String], [OneToManyPair])
alignSentencePair =
  do headerLine
     targetWords <- wordLine
     pairs       <- pairLine
     return (targetWords, pairs)
  where
    lineEnd = char '\n'

    -- '#' followed by anything up to (and including) the newline.
    headerLine = char '#' >> manyTill (noneOf "\n") lineEnd

    -- Words separated (and optionally trailed) by single spaces.
    wordLine =
      do ws <- sepEndBy alignWord justSpace
         lineEnd
         return ws

    -- Word/index-group pairs, same separator convention as 'wordLine'.
    pairLine =
      do ps <- sepEndBy alignWordPair justSpace
         lineEnd
         return ps

-- | Parse a single entry of the form @word ({ n1 n2 ... })@.
--
-- The word and the opening bracket are separated by exactly one space,
-- and one space is required directly after @({@.  The numbers inside
-- are separated (and optionally trailed) by single spaces; the list
-- may be empty, as in @word ({ })@.
alignWordPair :: CharParser () OneToManyPair
alignWordPair =
  do source  <- alignWord
     justSpace
     targets <- between open close indexList
     return (source, targets)
  where
    open      = string "({"
    close     = string "})"
    -- Mandatory leading space, then zero or more naturals.
    indexList = justSpace >> option [] (sepEndBy1 natural justSpace)

-- | Parse one word: a non-empty run of characters containing no space,
-- newline or tab.
alignWord :: CharParser () String
alignWord = many1 wordChar
  where
    wordChar = satisfy (`notElem` " \n\t")

-- | Parse exactly one ASCII space character (not a tab or newline).
justSpace :: CharParser () Char
justSpace = char blank
  where
    blank = ' '

-- | Parse a non-negative decimal integer (one or more digits).
--
-- 'many1' is used rather than 'many' so that, when no digit is present,
-- the parser fails with an \"expecting digit\" error at the right
-- position instead of handing an empty string to 'readM' and surfacing
-- its generic \"no parse\" message.  In both cases no input is consumed
-- on failure, so callers that rely on backtracking are unaffected.
natural :: CharParser () Integer
natural = many1 digit >>= readM

-- suggested by John Meacham on the haskell libraries list

-- | Read a value from a string inside any monad that supports failure.
--
-- Succeeds only when 'reads' yields exactly one parse that consumes the
-- entire input; otherwise calls 'fail' with @\"readM: no parse\"@.
--
-- The constraint is 'MonadFail' rather than plain 'Monad' because
-- 'fail' is no longer a method of 'Monad' on current GHC versions.
readM :: (MonadFail m, Read a) => String -> m a
readM s = case reads s of
    [(x, "")] -> return x
    _         -> fail "readM: no parse"