-- | -- Module : Conllu.Parse -- Copyright : © 2018 bruno cuconato -- License : LPGL-3 -- -- Maintainer : bruno cuconato -- Stability : experimental -- Portability : non-portable -- -- Parsers for CoNLL-U format. the CoNLL-U format is based in the -- deprecated CoNLL format (defined -- [here](https://web.archive.org/web/20161105025307/http://ilk.uvt.nl/conll/)) -- and is defined [here](http://universaldependencies.org/format.html) module Conllu.Parse ( Parser -- * parsers , parseConlluWith , parseConllu -- * customizable parsers , ParserC(ParserC) , parserC -- * default parsers , rawSents , sentence , comment , word -- * CoNLL-U field parsers , emptyField , idW , form , lemma , upos , xpos , feats , deprel , deps , misc -- * utility parsers , commentPair , listPair , stringNot , stringWOSpaces , stringWSpaces -- * parser combinators , keyValue , maybeEmpty , orEmpty , listP ) where --- -- imports import Conllu.Type import qualified Conllu.DeprelTagset as D import qualified Conllu.UposTagset as U import Control.Monad (void, liftM2) import Data.Either import Data.Maybe import Data.Void (Void) import Text.Megaparsec (ParseError, Parsec, (), (<|>), between, eitherP, endBy1, eof, lookAhead, many, option, optional, parse, parseErrorPretty, sepBy, sepBy1, skipManyTill, some, takeWhileP, try, withRecovery) import Text.Megaparsec.Char import qualified Text.Megaparsec.Char.Lexer as L -- | Parser type synonym type Parser = Parsec Void String -- | Parser raw output type RawData t e = [Either (ParseError t e) Sent] -- | DEPREL field type synonym type DEPREL = Maybe (D.EP, Maybe String) --- -- conllu parsers rawSents :: Parser (RawData Char Void) -- | parse CoNLL-U sentences with recovery. rawSents = rawSentsC sentence rawSentsC :: Parser Sent -> Parser (RawData Char Void) -- | parse CoNLL-U sentences with recovery, using a custom parser. rawSentsC sent = between ws eof (e `endBy1` lineFeed) where e = withRecovery recover (Right <$> sent) recover err = Left err <$ skipManyTill anyChar -- if parser consumes the first newline but can't parse the -- second, it breaks; it can't consume the second one, because -- that one has to be consumed by the endBy1 (try $ lineFeed *> lookAhead lineFeed) lineFeed :: Parser () -- | parse a blank line. lineFeed = lexeme . void $ newline -- spaces shouldn't exist, but no -- problem being lax here sentence :: Parser Sent -- | the default sentence parser. sentence = sentenceC comment word sentenceC :: Parser Comment -> Parser (CW AW) -> Parser Sent -- | the customizable sentence parser. sentenceC c t = liftM2 Sent (many c) (some t) comment :: Parser Comment -- | parse a comment. comment = (symbol "#" "comment starter") *> commentPair <* lineFeed "comment content" word :: Parser (CW AW) -- | the default word parser. word = wordC idW form lemma upos xpos feats deprel deps misc wordC :: Parser ID -> Parser FORM -> Parser LEMMA -> Parser UPOS -> Parser XPOS -> Parser FEATS -> Parser DEPREL -> Parser DEPS -> Parser MISC -> Parser (CW AW) -- | the customizable token parser. wordC ixp fop lp upp xpp fsp drp dsp mp = do i <- ixp <* tab mf <- fop <* tab ml <- lp <* tab mup <- upp <* tab mxp <- xpp <* tab mfs <- fsp <* tab mdh <- dhp <* tab mdr <- drp <* tab ds <- dsp <* tab mm <- mp <* lineFeed return $ mkAW i mf ml mup mxp mfs (rel mdh mdr) ds mm where dhp = maybeEmpty ixp "HEAD" rel :: Maybe ID -> DEPREL -> Maybe Rel rel mdh mdr = do dh <- mdh (dr, sdr) <- mdr return $ Rel dh dr sdr Nothing emptyField :: Parser (Maybe a) -- | parse an empty field. emptyField = symbol "_" *> return Nothing "empty field" idW :: Parser ID -- | parse the ID field, which might be an integer, a range, or a -- decimal. idW = do ix <- index mix <- optional metaIndex "meta token ID" return $ case mix of Nothing -> SID ix Just (Left _, eix) -> MID ix eix Just (Right _, eix) -> EID ix eix where index :: Parser Index index = do ix <- some digitChar "ID" return (read ix :: Int) indexSep :: Parser (Either IxSep IxSep) indexSep = eitherP (char '-') (char '.') "meta separator" metaIndex :: Parser (Either IxSep IxSep, Index) metaIndex = do sep <- indexSep ix <- index return (sep, ix) form :: Parser FORM -- | parse the FORM field. form = orEmpty stringWSpaces "FORM" lemma :: Parser LEMMA -- | parse the LEMMA field. lemma = orEmpty stringWSpaces "LEMMA" upos :: Parser UPOS -- | parse the UPOS field. upos = maybeEmpty upos' "UPOS" where upos' :: Parser U.POS upos' = fmap mkUPOS stringWOSpaces xpos :: Parser XPOS -- | parse the XPOS field. xpos = maybeEmpty stringWOSpaces "XPOS" feats :: Parser FEATS -- | parse the FEATS field. feats = listP (feat `sepBy` symbol "|" "FEATS") where feat = do k <- lexeme (some alphaNumChar "feature key") ft <- optional $ between (symbol "[") (symbol "]") (some alphaNumChar) _ <- symbol "=" vs <- fvalue `sepBy1` symbol "," return $ Feat k vs ft fvalue = lexeme (some alphaNumChar "feature value") deprel :: Parser DEPREL -- | parse the DEPREL field. deprel = maybeEmpty deprel' dep :: Parser D.EP dep = fmap mkDEP (letters "DEPREL") deprel' :: Parser (D.EP, Maybe String) -- | parse a non-empty DEPREL field. deprel' = liftM2 (,) dep subdeprel where subdeprel :: Parser (Maybe String) subdeprel = optional (symbol ":" *> letters "DEPREL subtype") deps :: Parser DEPS -- | parse the DEPS field. deps = listP (eDep `sepBy` symbol "|" "DEPS") where eDep = do h <- idW "enhanced dependency HEAD" _ <- sep d <- dep "enhanced dependency DEPREL" restI <- optional (sep *> stringNot "\t| :" `sepBy` sep "enhanced dependency information") return $ Rel h d Nothing restI sep = symbol ":" misc :: Parser MISC -- | parse the MISC field. misc = orEmpty stringWSpaces "MISC" --- -- utility parsers commentPair :: Parser Comment -- | parse a comment pair. commentPair = keyValue "=" (stringNot "=\n\t") (option "" stringWSpaces) listPair :: String -> Parser a -> Parser b -> Parser [(a, b)] -- | parse a list of pairs. listPair sep p q = keyValue sep p q `sepBy1` symbol "|" stringNot :: String -> Parser String -- | parse any chars except the ones provided. stringNot s = lexeme . some $ satisfy (`notElem` s) stringWOSpaces :: Parser String -- | parse a string until a space, a tab, or a newline. stringWOSpaces = stringNot " \t\n" stringWSpaces :: Parser String -- | parse a string until a tab or a newline. stringWSpaces = stringNot "\t\n" letters :: Parser String -- | parse a string of letters. letters = lexeme $ some letterChar --- -- parser combinators keyValue :: String -> Parser a -> Parser b -> Parser (a, b) -- | parse a (key, value) pair. keyValue sep p q = do key <- p _ <- optional $ symbol sep value <- q return (key, value) -- | two combinators are needed for parsing the empty field (without -- lookahead). this has to do with the fact that if we do -- -- > form <|> emptyField -- -- we would parse "_" as a non-empty FORM field. but if we did -- -- > emptyField <|> form -- -- we would parse "_" in "_something" and then the parser would choke -- expecting a tab. maybeEmpty :: Parser a -> Parser (Maybe a) -- | a parser combinator for parsers that won't parse "_" (e.g., as -- 'lemma' would). maybeEmpty p = emptyField <|> fmap Just p orEmpty :: Parser String -> Parser (Maybe String) -- | a parser combinator for parsers that may parse "_". orEmpty p = do r <- p case r of "_" -> return Nothing _ -> return $ Just r listP :: Parser [a] -> Parser [a] -- | parse a list of values that may be an empty field. using a parser -- that returns a possibly empty list like 'sepBy' and many will -- return the correct result for the empty field ('_'), but will -- report it the same as any other syntax error. listP p = fromMaybe [] <$> maybeEmpty p --- -- lexing symbol :: String -> Parser String symbol = L.symbol ws lexeme :: Parser a -> Parser a lexeme = L.lexeme ws ws :: Parser () ws = void $ takeWhileP (Just "space") (== ' ') --- -- customizable parser data ParserC = ParserC { _commentP :: Parser Comment , _idP :: Parser ID , _formP :: Parser FORM , _lemmaP :: Parser LEMMA , _upostagP :: Parser UPOS , _xpostagP :: Parser XPOS , _featsP :: Parser FEATS , _deprelP :: Parser DEPREL , _depsP :: Parser DEPS , _miscP :: Parser MISC } deriving () customC :: ParserC customC = ParserC { _commentP = comment , _idP = idW , _formP = form , _lemmaP = lemma , _upostagP = upos , _xpostagP = xpos , _featsP = feats , _deprelP = deprel , _depsP = deps , _miscP = misc } parserC :: ParserC -> Parser Sent -- | defines a custom parser of sentences. if you only need to -- customize one field parser (e.g., to parse special comments or a -- special MISC field), you can do: -- -- @ -- parserC ParserC{_commentP = myCommentsParser } -- @ parserC p = let i = _idP p f = _formP p l = _lemmaP p up = _upostagP p xp = _xpostagP p fs = _featsP p dr = _deprelP p ds = _depsP p m = _miscP p c = _commentP p w = wordC i f l up xp fs dr ds m s = sentenceC c w in s --- -- parse parseConlluWith :: Parser Sent -- ^ the sentence parser to be used. -> FilePath -- ^ the source whose stream is being supplied in the -- next argument (may be "" for no file) -> String -- ^ stream to be parsed -> Either String Doc -- | parse a CoNLL-U document using a customized parser. parseConlluWith p fp s = case parse doc fp s of Left err -> Left $ parseErrorPretty err Right d -> let (ls, rs) = partitionEithers d in if null ls then Right rs else Left $ concatMap parseErrorPretty ls where doc = rawSentsC p parseConllu :: FilePath -> String -> Either String Doc -- | parse a CoNLL-U document using the default parser. parseConllu = parseConlluWith sentence