module Conllu.Parse
( Parser
, parseConlluWith
, parseConllu
, ParserC(ParserC)
, parserC
, rawSents
, sentence
, comment
, word
, emptyField
, idW
, form
, lemma
, upos
, xpos
, feats
, deprel
, deps
, misc
, commentPair
, listPair
, stringNot
, stringWOSpaces
, stringWSpaces
, keyValue
, maybeEmpty
, orEmpty
, listP )
where
import Conllu.Type
import qualified Conllu.DeprelTagset as D
import qualified Conllu.UposTagset as U
import Control.Monad (void, liftM2)
import Data.Either
import Data.Maybe
import Data.Void (Void)
import Text.Megaparsec
(ParseError, Parsec, (<?>), (<|>), between, eitherP, endBy1, eof,
lookAhead, many, option, optional, parse, parseErrorPretty, sepBy,
sepBy1, skipManyTill, some, takeWhileP, try, withRecovery)
import Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L
type Parser = Parsec Void String
type RawData t e = [Either (ParseError t e) Sent]
type DEPREL = Maybe (D.EP, Maybe String)
rawSents :: Parser (RawData Char Void)
rawSents = rawSentsC sentence
rawSentsC :: Parser Sent -> Parser (RawData Char Void)
rawSentsC sent = between ws eof (e `endBy1` lineFeed)
where
e = withRecovery recover (Right <$> sent)
recover err =
Left err <$
skipManyTill anyChar
(try $ lineFeed *> lookAhead lineFeed)
lineFeed :: Parser ()
lineFeed = lexeme . void $ newline
sentence :: Parser Sent
sentence = sentenceC comment word
sentenceC :: Parser Comment -> Parser (CW AW)
-> Parser Sent
sentenceC c t = liftM2 Sent (many c) (some t)
comment :: Parser Comment
comment =
(symbol "#" <?> "comment starter") *> commentPair <*
lineFeed <?> "comment content"
word :: Parser (CW AW)
word =
wordC idW form lemma upos xpos feats deprel deps misc
wordC ::
Parser ID
-> Parser FORM
-> Parser LEMMA
-> Parser UPOS
-> Parser XPOS
-> Parser FEATS
-> Parser DEPREL
-> Parser DEPS
-> Parser MISC
-> Parser (CW AW)
wordC ixp fop lp upp xpp fsp drp dsp mp = do
i <- ixp <* tab
mf <- fop <* tab
ml <- lp <* tab
mup <- upp <* tab
mxp <- xpp <* tab
mfs <- fsp <* tab
mdh <- dhp <* tab
mdr <- drp <* tab
ds <- dsp <* tab
mm <- mp <* lineFeed
return $ mkAW i mf ml mup mxp mfs (rel mdh mdr) ds mm
where
dhp = maybeEmpty ixp <?> "HEAD"
rel :: Maybe ID -> DEPREL -> Maybe Rel
rel mdh mdr = do
dh <- mdh
(dr, sdr) <- mdr
return $ Rel dh dr sdr Nothing
emptyField :: Parser (Maybe a)
emptyField = symbol "_" *> return Nothing <?> "empty field"
idW :: Parser ID
idW = do
ix <- index
mix <- optional metaIndex <?> "meta token ID"
return $
case mix of
Nothing -> SID ix
Just (Left _, eix) -> MID ix eix
Just (Right _, eix) -> EID ix eix
where
index :: Parser Index
index = do
ix <- some digitChar <?> "ID"
return (read ix :: Int)
indexSep :: Parser (Either IxSep IxSep)
indexSep = eitherP (char '-') (char '.') <?> "meta separator"
metaIndex :: Parser (Either IxSep IxSep, Index)
metaIndex = do
sep <- indexSep
ix <- index
return (sep, ix)
form :: Parser FORM
form = orEmpty stringWSpaces <?> "FORM"
lemma :: Parser LEMMA
lemma = orEmpty stringWSpaces <?> "LEMMA"
upos :: Parser UPOS
upos = maybeEmpty upos' <?> "UPOS"
where
upos' :: Parser U.POS
upos' = fmap mkUPOS stringWOSpaces
xpos :: Parser XPOS
xpos = maybeEmpty stringWOSpaces <?> "XPOS"
feats :: Parser FEATS
feats = listP (feat `sepBy` symbol "|" <?> "FEATS")
where
feat = do
k <- lexeme (some alphaNumChar <?> "feature key")
ft <-
optional $
between (symbol "[") (symbol "]") (some alphaNumChar)
_ <- symbol "="
vs <- fvalue `sepBy1` symbol ","
return $ Feat k vs ft
fvalue = lexeme (some alphaNumChar <?> "feature value")
deprel :: Parser DEPREL
deprel = maybeEmpty deprel'
dep :: Parser D.EP
dep = fmap mkDEP (letters <?> "DEPREL")
deprel' :: Parser (D.EP, Maybe String)
deprel' = liftM2 (,) dep subdeprel
where
subdeprel :: Parser (Maybe String)
subdeprel = optional (symbol ":" *> letters <?> "DEPREL subtype")
deps :: Parser DEPS
deps = listP (eDep `sepBy` symbol "|" <?> "DEPS")
where
eDep = do
h <- idW <?> "enhanced dependency HEAD"
_ <- sep
d <- dep <?> "enhanced dependency DEPREL"
restI <-
optional
(sep *>
stringNot "\t| :" `sepBy` sep <?>
"enhanced dependency information")
return $ Rel h d Nothing restI
sep = symbol ":"
misc :: Parser MISC
misc = orEmpty stringWSpaces <?> "MISC"
commentPair :: Parser Comment
commentPair =
keyValue "=" (stringNot "=\n\t") (option "" stringWSpaces)
listPair :: String -> Parser a -> Parser b -> Parser [(a, b)]
listPair sep p q = keyValue sep p q `sepBy1` symbol "|"
stringNot :: String -> Parser String
stringNot s = lexeme . some $ satisfy (`notElem` s)
stringWOSpaces :: Parser String
stringWOSpaces = stringNot " \t\n"
stringWSpaces :: Parser String
stringWSpaces = stringNot "\t\n"
letters :: Parser String
letters = lexeme $ some letterChar
keyValue :: String -> Parser a -> Parser b -> Parser (a, b)
keyValue sep p q = do
key <- p
_ <- optional $ symbol sep
value <- q
return (key, value)
maybeEmpty :: Parser a -> Parser (Maybe a)
maybeEmpty p = emptyField <|> fmap Just p
orEmpty :: Parser String -> Parser (Maybe String)
orEmpty p = do
r <- p
case r of
"_" -> return Nothing
_ -> return $ Just r
listP :: Parser [a] -> Parser [a]
listP p = fromMaybe [] <$> maybeEmpty p
symbol :: String -> Parser String
symbol = L.symbol ws
lexeme :: Parser a -> Parser a
lexeme = L.lexeme ws
ws :: Parser ()
ws = void $ takeWhileP (Just "space") (== ' ')
data ParserC = ParserC
{ _commentP :: Parser Comment
, _idP :: Parser ID
, _formP :: Parser FORM
, _lemmaP :: Parser LEMMA
, _upostagP :: Parser UPOS
, _xpostagP :: Parser XPOS
, _featsP :: Parser FEATS
, _deprelP :: Parser DEPREL
, _depsP :: Parser DEPS
, _miscP :: Parser MISC
} deriving ()
customC :: ParserC
customC = ParserC
{ _commentP = comment
, _idP = idW
, _formP = form
, _lemmaP = lemma
, _upostagP = upos
, _xpostagP = xpos
, _featsP = feats
, _deprelP = deprel
, _depsP = deps
, _miscP = misc
}
parserC :: ParserC -> Parser Sent
parserC p =
let i = _idP p
f = _formP p
l = _lemmaP p
up = _upostagP p
xp = _xpostagP p
fs = _featsP p
dr = _deprelP p
ds = _depsP p
m = _miscP p
c = _commentP p
w = wordC i f l up xp fs dr ds m
s = sentenceC c w
in s
parseConlluWith
:: Parser Sent
-> FilePath
-> String
-> Either String Doc
parseConlluWith p fp s =
case parse doc fp s of
Left err -> Left $ parseErrorPretty err
Right d ->
let (ls, rs) = partitionEithers d
in if null ls
then Right rs
else Left $ concatMap parseErrorPretty ls
where
doc = rawSentsC p
parseConllu :: FilePath -> String -> Either String Doc
parseConllu = parseConlluWith sentence