-- |
-- Module      :  Conllu.Parse
-- Copyright   :  © 2018 bruno cuconato
-- License     :  LPGL-3
--
-- Maintainer  :  bruno cuconato <bcclaro+hackage@gmail.com>
-- Stability   :  experimental
-- Portability :  non-portable
--
-- Parsers for CoNLL-U format.  the CoNLL-U format is based in the
-- deprecated CoNLL format (defined
-- [here](https://web.archive.org/web/20161105025307/http://ilk.uvt.nl/conll/))
-- and is defined [here](http://universaldependencies.org/format.html)

module Conllu.Parse
  ( Parser
  -- * parsers
  , parseConlluWith
  , parseConllu
  -- * customizable parsers
  , ParserC(ParserC)
  , parserC
  -- * default parsers
  , rawSents
  , sentence
  , comment
  , word
  -- * CoNLL-U field parsers
  , emptyField
  , idW
  , form
  , lemma
  , upos
  , xpos
  , feats
  , deprel
  , deps
  , misc
    -- * utility parsers
  , commentPair
  , listPair
  , stringNot
  , stringWOSpaces
  , stringWSpaces
    -- * parser combinators
  , keyValue
  , maybeEmpty
  , orEmpty
  , listP )
where

---
-- imports
import           Conllu.Type
import qualified Conllu.DeprelTagset as D
import qualified Conllu.UposTagset as U

import           Control.Monad (void, liftM2)
import           Data.Either
import           Data.Maybe
import           Data.Void (Void)

import qualified Text.Megaparsec as TM
import           Text.Megaparsec.Char
import qualified Text.Megaparsec.Char.Lexer as L

-- | Parser type synonym
type Parser = TM.Parsec Void String

-- | Parser raw output
type RawData t e = [Either (TM.ParseError t e) Sent]

-- | DEPREL field type synonym
type DEPREL = Maybe (D.EP, Maybe String)


---
-- conllu parsers
rawSents :: Parser (RawData String Void)
-- | parse CoNLL-U sentences with recovery.
rawSents :: Parser (RawData String Void)
rawSents = Parser Sent -> Parser (RawData String Void)
rawSentsC Parser Sent
sentence

rawSentsC :: Parser Sent -> Parser (RawData String Void)
-- | parse CoNLL-U sentences with recovery, using a custom parser.
rawSentsC :: Parser Sent -> Parser (RawData String Void)
rawSentsC Parser Sent
sent = ParsecT Void String Identity ()
-> ParsecT Void String Identity ()
-> Parser (RawData String Void)
-> Parser (RawData String Void)
forall (m :: * -> *) open close a.
Applicative m =>
m open -> m close -> m a -> m a
TM.between ParsecT Void String Identity ()
ws ParsecT Void String Identity ()
forall e s (m :: * -> *). MonadParsec e s m => m ()
TM.eof (ParsecT Void String Identity (Either (ParseError String Void) Sent)
-> ParsecT Void String Identity () -> Parser (RawData String Void)
forall (m :: * -> *) a sep. MonadPlus m => m a -> m sep -> m [a]
TM.endBy1 ParsecT Void String Identity (Either (ParseError String Void) Sent)
e ParsecT Void String Identity ()
lineFeed)
  where
    e :: ParsecT Void String Identity (Either (ParseError String Void) Sent)
e = (ParseError String Void
 -> ParsecT
      Void String Identity (Either (ParseError String Void) Sent))
-> ParsecT
     Void String Identity (Either (ParseError String Void) Sent)
-> ParsecT
     Void String Identity (Either (ParseError String Void) Sent)
forall e s (m :: * -> *) a.
MonadParsec e s m =>
(ParseError s e -> m a) -> m a -> m a
TM.withRecovery ParseError String Void
-> ParsecT
     Void String Identity (Either (ParseError String Void) Sent)
forall a b. a -> ParsecT Void String Identity (Either a b)
recover (Sent -> Either (ParseError String Void) Sent
forall a b. b -> Either a b
Right (Sent -> Either (ParseError String Void) Sent)
-> Parser Sent
-> ParsecT
     Void String Identity (Either (ParseError String Void) Sent)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser Sent
sent)
    recover :: a -> ParsecT Void String Identity (Either a b)
recover a
err =
      a -> Either a b
forall a b. a -> Either a b
Left a
err Either a b
-> ParsecT Void String Identity ()
-> ParsecT Void String Identity (Either a b)
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$
      ParsecT Void String Identity Char
-> ParsecT Void String Identity ()
-> ParsecT Void String Identity ()
forall (m :: * -> *) a end. MonadPlus m => m a -> m end -> m end
TM.skipManyTill ParsecT Void String Identity Char
forall e s (m :: * -> *). MonadParsec e s m => m (Token s)
TM.anySingle
      -- if parser consumes the first newline but can't parse the
      -- second, it breaks; it can't consume the second one, because
      -- that one has to be consumed by the endBy1
      (ParsecT Void String Identity () -> ParsecT Void String Identity ()
forall e s (m :: * -> *) a. MonadParsec e s m => m a -> m a
TM.try (ParsecT Void String Identity ()
 -> ParsecT Void String Identity ())
-> ParsecT Void String Identity ()
-> ParsecT Void String Identity ()
forall a b. (a -> b) -> a -> b
$ ParsecT Void String Identity ()
lineFeed ParsecT Void String Identity ()
-> ParsecT Void String Identity ()
-> ParsecT Void String Identity ()
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> ParsecT Void String Identity () -> ParsecT Void String Identity ()
forall e s (m :: * -> *) a. MonadParsec e s m => m a -> m a
TM.lookAhead ParsecT Void String Identity ()
lineFeed)

lineFeed :: Parser ()
-- | parse a blank line.
lineFeed :: ParsecT Void String Identity ()
lineFeed = ParsecT Void String Identity () -> ParsecT Void String Identity ()
forall a. Parser a -> Parser a
lexeme (ParsecT Void String Identity ()
 -> ParsecT Void String Identity ())
-> (ParsecT Void String Identity Char
    -> ParsecT Void String Identity ())
-> ParsecT Void String Identity Char
-> ParsecT Void String Identity ()
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ParsecT Void String Identity Char
-> ParsecT Void String Identity ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (ParsecT Void String Identity Char
 -> ParsecT Void String Identity ())
-> ParsecT Void String Identity Char
-> ParsecT Void String Identity ()
forall a b. (a -> b) -> a -> b
$ ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
newline -- Spaces shouldn't exist, but no problem being lax here

sentence :: Parser Sent
-- | the default sentence parser.
sentence :: Parser Sent
sentence = Parser Comment -> Parser (CW AW) -> Parser Sent
sentenceC Parser Comment
comment Parser (CW AW)
word

sentenceC :: Parser Comment -> Parser (CW AW) -> Parser Sent
-- | the customizable sentence parser.
sentenceC :: Parser Comment -> Parser (CW AW) -> Parser Sent
sentenceC Parser Comment
c Parser (CW AW)
t = ([Comment] -> [CW AW] -> Sent)
-> ParsecT Void String Identity [Comment]
-> ParsecT Void String Identity [CW AW]
-> Parser Sent
forall (m :: * -> *) a1 a2 r.
Monad m =>
(a1 -> a2 -> r) -> m a1 -> m a2 -> m r
liftM2 [Comment] -> [CW AW] -> Sent
Sent (Parser Comment -> ParsecT Void String Identity [Comment]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
TM.many Parser Comment
c) (Parser (CW AW) -> ParsecT Void String Identity [CW AW]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
TM.some Parser (CW AW)
t)

comment :: Parser Comment
-- | parse a comment.
comment :: Parser Comment
comment =
  (String -> Parser String
symbol String
"#" Parser String -> String -> Parser String
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"comment starter") Parser String -> Parser Comment -> Parser Comment
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser Comment
commentPair Parser Comment -> ParsecT Void String Identity () -> Parser Comment
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<*
  ParsecT Void String Identity ()
lineFeed Parser Comment -> String -> Parser Comment
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"comment content"

word :: Parser (CW AW)
-- | the default word parser.
word :: Parser (CW AW)
word =
  Parser ID
-> Parser FORM
-> Parser FORM
-> Parser UPOS
-> Parser FORM
-> Parser FEATS
-> Parser DEPREL
-> Parser DEPS
-> Parser FORM
-> Parser (CW AW)
wordC Parser ID
idW Parser FORM
form Parser FORM
lemma Parser UPOS
upos Parser FORM
xpos Parser FEATS
feats Parser DEPREL
deprel Parser DEPS
deps Parser FORM
misc

wordC ::
     Parser ID
  -> Parser FORM
  -> Parser LEMMA
  -> Parser UPOS
  -> Parser XPOS
  -> Parser FEATS
  -> Parser DEPREL
  -> Parser DEPS
  -> Parser MISC
  -> Parser (CW AW)
-- | the customizable token parser.
wordC :: Parser ID
-> Parser FORM
-> Parser FORM
-> Parser UPOS
-> Parser FORM
-> Parser FEATS
-> Parser DEPREL
-> Parser DEPS
-> Parser FORM
-> Parser (CW AW)
wordC Parser ID
ixp Parser FORM
fop Parser FORM
lp Parser UPOS
upp Parser FORM
xpp Parser FEATS
fsp Parser DEPREL
drp Parser DEPS
dsp Parser FORM
mp = do
  ID
i   <- Parser ID
ixp Parser ID -> ParsecT Void String Identity Char -> Parser ID
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  FORM
mf  <- Parser FORM
fop Parser FORM -> ParsecT Void String Identity Char -> Parser FORM
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  FORM
ml  <- Parser FORM
lp  Parser FORM -> ParsecT Void String Identity Char -> Parser FORM
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  UPOS
mup <- Parser UPOS
upp Parser UPOS -> ParsecT Void String Identity Char -> Parser UPOS
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  FORM
mxp <- Parser FORM
xpp Parser FORM -> ParsecT Void String Identity Char -> Parser FORM
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  FEATS
mfs <- Parser FEATS
fsp Parser FEATS -> ParsecT Void String Identity Char -> Parser FEATS
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  Maybe ID
mdh <- ParsecT Void String Identity (Maybe ID)
dhp ParsecT Void String Identity (Maybe ID)
-> ParsecT Void String Identity Char
-> ParsecT Void String Identity (Maybe ID)
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  DEPREL
mdr <- Parser DEPREL
drp Parser DEPREL -> ParsecT Void String Identity Char -> Parser DEPREL
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  DEPS
ds  <- Parser DEPS
dsp Parser DEPS -> ParsecT Void String Identity Char -> Parser DEPS
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
tab
  FORM
mm  <- Parser FORM
mp  Parser FORM -> ParsecT Void String Identity () -> Parser FORM
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* ParsecT Void String Identity ()
lineFeed
  CW AW -> Parser (CW AW)
forall (m :: * -> *) a. Monad m => a -> m a
return (CW AW -> Parser (CW AW)) -> CW AW -> Parser (CW AW)
forall a b. (a -> b) -> a -> b
$ ID
-> FORM
-> FORM
-> UPOS
-> FORM
-> FEATS
-> Maybe Rel
-> DEPS
-> FORM
-> CW AW
mkAW ID
i FORM
mf FORM
ml UPOS
mup FORM
mxp FEATS
mfs (Maybe ID -> DEPREL -> Maybe Rel
rel Maybe ID
mdh DEPREL
mdr) DEPS
ds FORM
mm
  where
    dhp :: ParsecT Void String Identity (Maybe ID)
dhp = Parser ID -> ParsecT Void String Identity (Maybe ID)
forall a. Parser a -> Parser (Maybe a)
maybeEmpty Parser ID
ixp ParsecT Void String Identity (Maybe ID)
-> String -> ParsecT Void String Identity (Maybe ID)
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"HEAD"
    rel :: Maybe ID -> DEPREL -> Maybe Rel
    rel :: Maybe ID -> DEPREL -> Maybe Rel
rel Maybe ID
mdh DEPREL
mdr = do
      ID
dh <- Maybe ID
mdh
      (EP
dr, FORM
sdr) <- DEPREL
mdr
      Rel -> Maybe Rel
forall (m :: * -> *) a. Monad m => a -> m a
return (Rel -> Maybe Rel) -> Rel -> Maybe Rel
forall a b. (a -> b) -> a -> b
$ ID -> EP -> FORM -> Maybe [String] -> Rel
Rel ID
dh EP
dr FORM
sdr Maybe [String]
forall a. Maybe a
Nothing

emptyField :: Parser (Maybe a)
-- | parse an empty field.
emptyField :: Parser (Maybe a)
emptyField = String -> Parser String
symbol String
"_" Parser String -> Parser (Maybe a) -> Parser (Maybe a)
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Maybe a -> Parser (Maybe a)
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe a
forall a. Maybe a
Nothing Parser (Maybe a) -> String -> Parser (Maybe a)
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"empty field (_)"

idW :: Parser ID
-- | parse the ID field, which might be an integer, a range, or a
-- decimal.
idW :: Parser ID
idW = do
  Index
ix <- Parser Index
index
  Maybe (Either Char Char, Index)
mix <- ParsecT Void String Identity (Either Char Char, Index)
-> ParsecT Void String Identity (Maybe (Either Char Char, Index))
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
TM.optional ParsecT Void String Identity (Either Char Char, Index)
metaIndex ParsecT Void String Identity (Maybe (Either Char Char, Index))
-> String
-> ParsecT Void String Identity (Maybe (Either Char Char, Index))
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"meta token ID"
  ID -> Parser ID
forall (m :: * -> *) a. Monad m => a -> m a
return (ID -> Parser ID) -> ID -> Parser ID
forall a b. (a -> b) -> a -> b
$
    case Maybe (Either Char Char, Index)
mix of
      Maybe (Either Char Char, Index)
Nothing             -> Index -> ID
SID Index
ix
      Just (Left Char
_, Index
eix)  -> Index -> Index -> ID
MID Index
ix Index
eix
      Just (Right Char
_, Index
eix) -> Index -> Index -> ID
EID Index
ix Index
eix
  where
    index :: Parser Index
    index :: Parser Index
index = do
      String
ix <- ParsecT Void String Identity Char -> Parser String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
TM.some ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
digitChar Parser String -> String -> Parser String
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"ID"
      Index -> Parser Index
forall (m :: * -> *) a. Monad m => a -> m a
return (String -> Index
forall a. Read a => String -> a
read String
ix :: Int)
    indexSep :: Parser (Either IxSep IxSep)
    indexSep :: Parser (Either Char Char)
indexSep = ParsecT Void String Identity Char
-> ParsecT Void String Identity Char -> Parser (Either Char Char)
forall (m :: * -> *) a b.
Alternative m =>
m a -> m b -> m (Either a b)
TM.eitherP (Token String -> ParsecT Void String Identity (Token String)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token String
'-') (Token String -> ParsecT Void String Identity (Token String)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token String
'.') Parser (Either Char Char) -> String -> Parser (Either Char Char)
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"meta separator"
    metaIndex :: Parser (Either IxSep IxSep, Index)
    metaIndex :: ParsecT Void String Identity (Either Char Char, Index)
metaIndex = do
      Either Char Char
sep <- Parser (Either Char Char)
indexSep
      Index
ix <- Parser Index
index
      (Either Char Char, Index)
-> ParsecT Void String Identity (Either Char Char, Index)
forall (m :: * -> *) a. Monad m => a -> m a
return (Either Char Char
sep, Index
ix)

form :: Parser FORM
-- | parse the FORM field.
form :: Parser FORM
form = Parser String -> Parser FORM
orEmpty Parser String
stringWSpaces Parser FORM -> String -> Parser FORM
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"FORM"

lemma :: Parser LEMMA
-- | parse the LEMMA field.
lemma :: Parser FORM
lemma = Parser String -> Parser FORM
orEmpty Parser String
stringWSpaces Parser FORM -> String -> Parser FORM
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"LEMMA"

upos :: Parser UPOS
-- | parse the UPOS field.
upos :: Parser UPOS
upos = Parser POS -> Parser UPOS
forall a. Parser a -> Parser (Maybe a)
maybeEmpty Parser POS
upos'
  where
    upos' :: Parser U.POS
    upos' :: Parser POS
upos' = (String -> POS) -> Parser String -> Parser POS
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> POS
mkUPOS (Parser String -> Parser POS) -> Parser String -> Parser POS
forall a b. (a -> b) -> a -> b
$ [Parser String] -> Parser String
forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
TM.choice ([Parser String] -> Parser String)
-> [Parser String] -> Parser String
forall a b. (a -> b) -> a -> b
$ (POS -> Parser String) -> [POS] -> [Parser String]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (String -> Parser String
forall e s (m :: * -> *).
(MonadParsec e s m, FoldCase (Tokens s)) =>
Tokens s -> m (Tokens s)
string' (String -> Parser String)
-> (POS -> String) -> POS -> Parser String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. POS -> String
forall a. Show a => a -> String
show) [POS
U.ADJ .. POS
U.X]

xpos :: Parser XPOS
-- | parse the XPOS field.
xpos :: Parser FORM
xpos = Parser String -> Parser FORM
forall a. Parser a -> Parser (Maybe a)
maybeEmpty Parser String
stringWOSpaces Parser FORM -> String -> Parser FORM
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"XPOS"

feats :: Parser FEATS
-- | parse the FEATS field.
feats :: Parser FEATS
feats = Parser FEATS -> Parser FEATS
forall a. Parser [a] -> Parser [a]
listP (ParsecT Void String Identity Feat
feat ParsecT Void String Identity Feat -> Parser String -> Parser FEATS
forall (m :: * -> *) a sep. MonadPlus m => m a -> m sep -> m [a]
`TM.sepBy` String -> Parser String
symbol String
"|" Parser FEATS -> String -> Parser FEATS
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"FEATS")
  where
    feat :: ParsecT Void String Identity Feat
feat = do
      String
k  <- Parser String -> Parser String
forall a. Parser a -> Parser a
lexeme (ParsecT Void String Identity Char -> Parser String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
TM.some ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
alphaNumChar Parser String -> String -> Parser String
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"feature key")
      FORM
ft <-
        Parser String -> Parser FORM
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
TM.optional (Parser String -> Parser FORM) -> Parser String -> Parser FORM
forall a b. (a -> b) -> a -> b
$
        Parser String -> Parser String -> Parser String -> Parser String
forall (m :: * -> *) open close a.
Applicative m =>
m open -> m close -> m a -> m a
TM.between (String -> Parser String
symbol String
"[") (String -> Parser String
symbol String
"]") (ParsecT Void String Identity Char -> Parser String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
TM.some ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
alphaNumChar)
      String
_  <- String -> Parser String
symbol String
"="
      [String]
vs <- Parser String
fvalue Parser String
-> Parser String -> ParsecT Void String Identity [String]
forall (m :: * -> *) a sep. MonadPlus m => m a -> m sep -> m [a]
`TM.sepBy1` String -> Parser String
symbol String
","
      Feat -> ParsecT Void String Identity Feat
forall (m :: * -> *) a. Monad m => a -> m a
return (Feat -> ParsecT Void String Identity Feat)
-> Feat -> ParsecT Void String Identity Feat
forall a b. (a -> b) -> a -> b
$ String -> [String] -> FORM -> Feat
Feat String
k [String]
vs FORM
ft
    fvalue :: Parser String
fvalue = Parser String -> Parser String
forall a. Parser a -> Parser a
lexeme (ParsecT Void String Identity Char -> Parser String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
TM.some ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
alphaNumChar Parser String -> String -> Parser String
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"feature value")

deprel :: Parser DEPREL
-- | parse the DEPREL field.
deprel :: Parser DEPREL
deprel = Parser (EP, FORM) -> Parser DEPREL
forall a. Parser a -> Parser (Maybe a)
maybeEmpty Parser (EP, FORM)
deprel'

dep :: Parser D.EP
dep :: Parser EP
dep = (String -> EP) -> Parser String -> Parser EP
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> EP
mkDEP (Parser String -> Parser EP) -> Parser String -> Parser EP
forall a b. (a -> b) -> a -> b
$ [Parser String] -> Parser String
forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
TM.choice ([Parser String] -> Parser String)
-> [Parser String] -> Parser String
forall a b. (a -> b) -> a -> b
$ (EP -> Parser String) -> [EP] -> [Parser String]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (String -> Parser String
forall e s (m :: * -> *).
(MonadParsec e s m, FoldCase (Tokens s)) =>
Tokens s -> m (Tokens s)
string' (String -> Parser String) -> (EP -> String) -> EP -> Parser String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EP -> String
forall a. Show a => a -> String
show) [EP
D.ACL .. EP
D.XCOMP]

deprel' :: Parser (D.EP, Maybe String)
-- | parse a non-empty DEPREL field.
deprel' :: Parser (EP, FORM)
deprel' = (EP -> FORM -> (EP, FORM))
-> Parser EP -> Parser FORM -> Parser (EP, FORM)
forall (m :: * -> *) a1 a2 r.
Monad m =>
(a1 -> a2 -> r) -> m a1 -> m a2 -> m r
liftM2 (,) Parser EP
dep Parser FORM
subdeprel
  where
    subdeprel :: Parser (Maybe String)
    subdeprel :: Parser FORM
subdeprel = Parser String -> Parser FORM
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
TM.optional (String -> Parser String
symbol String
":" Parser String -> Parser String -> Parser String
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> Parser String
letters Parser String -> String -> Parser String
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"DEPREL subtype")

deps :: Parser DEPS
-- | parse the DEPS field.
deps :: Parser DEPS
deps = Parser DEPS -> Parser DEPS
forall a. Parser [a] -> Parser [a]
listP (ParsecT Void String Identity Rel
eDep ParsecT Void String Identity Rel -> Parser String -> Parser DEPS
forall (m :: * -> *) a sep. MonadPlus m => m a -> m sep -> m [a]
`TM.sepBy` String -> Parser String
symbol String
"|" Parser DEPS -> String -> Parser DEPS
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"DEPS")
  where
    eDep :: ParsecT Void String Identity Rel
eDep = do
      ID
h <- Parser ID
idW Parser ID -> String -> Parser ID
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"enhanced dependency HEAD"
      String
_ <- Parser String
sep
      EP
d <- Parser EP
dep Parser EP -> String -> Parser EP
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"enhanced dependency DEPREL"
      Maybe [String]
restI <-
        ParsecT Void String Identity [String]
-> ParsecT Void String Identity (Maybe [String])
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
TM.optional
          (Parser String
sep Parser String
-> ParsecT Void String Identity [String]
-> ParsecT Void String Identity [String]
forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*>
           String -> Parser String
stringNot String
"\t| :" Parser String
-> Parser String -> ParsecT Void String Identity [String]
forall (m :: * -> *) a sep. MonadPlus m => m a -> m sep -> m [a]
`TM.sepBy` Parser String
sep ParsecT Void String Identity [String]
-> String -> ParsecT Void String Identity [String]
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?>
           String
"enhanced dependency information")
      Rel -> ParsecT Void String Identity Rel
forall (m :: * -> *) a. Monad m => a -> m a
return (Rel -> ParsecT Void String Identity Rel)
-> Rel -> ParsecT Void String Identity Rel
forall a b. (a -> b) -> a -> b
$ ID -> EP -> FORM -> Maybe [String] -> Rel
Rel ID
h EP
d FORM
forall a. Maybe a
Nothing Maybe [String]
restI
    sep :: Parser String
sep = String -> Parser String
symbol String
":"

misc :: Parser MISC
-- | parse the MISC field.
misc :: Parser FORM
misc = Parser String -> Parser FORM
orEmpty Parser String
stringWSpaces Parser FORM -> String -> Parser FORM
forall e s (m :: * -> *) a.
MonadParsec e s m =>
m a -> String -> m a
TM.<?> String
"MISC"

---
-- utility parsers
commentPair :: Parser Comment
-- | parse a comment pair.
commentPair :: Parser Comment
commentPair =
  String -> Parser String -> Parser String -> Parser Comment
forall a b. String -> Parser a -> Parser b -> Parser (a, b)
keyValue String
"=" (String -> Parser String
stringNot String
"=\n\t") (String -> Parser String -> Parser String
forall (m :: * -> *) a. Alternative m => a -> m a -> m a
TM.option String
"" Parser String
stringWSpaces)

listPair :: String -> Parser a -> Parser b -> Parser [(a, b)]
-- | parse a list of pairs.
listPair :: String -> Parser a -> Parser b -> Parser [(a, b)]
listPair String
sep Parser a
p Parser b
q = String -> Parser a -> Parser b -> Parser (a, b)
forall a b. String -> Parser a -> Parser b -> Parser (a, b)
keyValue String
sep Parser a
p Parser b
q Parser (a, b) -> Parser String -> Parser [(a, b)]
forall (m :: * -> *) a sep. MonadPlus m => m a -> m sep -> m [a]
`TM.sepBy1` String -> Parser String
symbol String
"|"

stringNot :: String -> Parser String
-- | parse any chars except the ones provided.
stringNot :: String -> Parser String
stringNot String
s = Parser String -> Parser String
forall a. Parser a -> Parser a
lexeme (Parser String -> Parser String) -> Parser String -> Parser String
forall a b. (a -> b) -> a -> b
$ FORM
-> (Token String -> Bool)
-> ParsecT Void String Identity (Tokens String)
forall e s (m :: * -> *).
MonadParsec e s m =>
FORM -> (Token s -> Bool) -> m (Tokens s)
TM.takeWhile1P FORM
forall a. Maybe a
Nothing (Char -> String -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` String
s)

stringWOSpaces :: Parser String
-- | parse a string until a space, a tab, or a newline.
stringWOSpaces :: Parser String
stringWOSpaces = String -> Parser String
stringNot String
" \t\n"

stringWSpaces :: Parser String
-- | parse a string until a tab or a newline.
stringWSpaces :: Parser String
stringWSpaces = String -> Parser String
stringNot String
"\t\n"

letters :: Parser String
-- | parse a string of letters.
letters :: Parser String
letters = Parser String -> Parser String
forall a. Parser a -> Parser a
lexeme (Parser String -> Parser String) -> Parser String -> Parser String
forall a b. (a -> b) -> a -> b
$ ParsecT Void String Identity Char -> Parser String
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
TM.some ParsecT Void String Identity Char
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
m (Token s)
letterChar

---
-- parser combinators
keyValue :: String -> Parser a -> Parser b -> Parser (a, b)
-- | parse a (key, value) pair.
keyValue :: String -> Parser a -> Parser b -> Parser (a, b)
keyValue String
sep Parser a
p Parser b
q = do
  a
key   <- Parser a
p
  FORM
_     <- Parser String -> Parser FORM
forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
TM.optional (Parser String -> Parser FORM) -> Parser String -> Parser FORM
forall a b. (a -> b) -> a -> b
$ String -> Parser String
symbol String
sep
  b
value <- Parser b
q
  (a, b) -> Parser (a, b)
forall (m :: * -> *) a. Monad m => a -> m a
return (a
key, b
value)


-- | two combinators are needed for parsing the empty field (without
-- lookahead). this has to do with the fact that if we do
--
-- > form <|> emptyField
--
-- we would parse "_" as a non-empty FORM field. but if we did
--
-- > emptyField <|> form
--
-- we would parse "_" in "_something" and then the parser would choke
-- expecting a tab.

maybeEmpty :: Parser a -> Parser (Maybe a)
-- | a parser combinator for parsers that won't parse "_" (e.g., as
-- 'lemma' would).
maybeEmpty :: Parser a -> Parser (Maybe a)
maybeEmpty Parser a
p = Parser (Maybe a)
forall a. Parser (Maybe a)
emptyField Parser (Maybe a) -> Parser (Maybe a) -> Parser (Maybe a)
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
TM.<|> (a -> Maybe a) -> Parser a -> Parser (Maybe a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap a -> Maybe a
forall a. a -> Maybe a
Just Parser a
p

orEmpty :: Parser String -> Parser (Maybe String)
-- | a parser combinator for parsers that may parse "_".
orEmpty :: Parser String -> Parser FORM
orEmpty Parser String
p = do
  String
r <- Parser String
p
  case String
r of
    String
"_" -> FORM -> Parser FORM
forall (m :: * -> *) a. Monad m => a -> m a
return FORM
forall a. Maybe a
Nothing
    String
_   -> FORM -> Parser FORM
forall (m :: * -> *) a. Monad m => a -> m a
return (FORM -> Parser FORM) -> FORM -> Parser FORM
forall a b. (a -> b) -> a -> b
$ String -> FORM
forall a. a -> Maybe a
Just String
r

listP :: Parser [a] -> Parser [a]
-- | parse a list of values that may be an empty field. using a parser
-- that returns a possibly empty list like 'sepBy' and many will
-- return the correct result for the empty field ('_'), but will
-- report it the same as any other syntax error.
listP :: Parser [a] -> Parser [a]
listP Parser [a]
p = [a] -> Maybe [a] -> [a]
forall a. a -> Maybe a -> a
fromMaybe [] (Maybe [a] -> [a])
-> ParsecT Void String Identity (Maybe [a]) -> Parser [a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser [a] -> ParsecT Void String Identity (Maybe [a])
forall a. Parser a -> Parser (Maybe a)
maybeEmpty Parser [a]
p

---
-- lexing
symbol :: String -> Parser String
symbol :: String -> Parser String
symbol = ParsecT Void String Identity ()
-> Tokens String -> ParsecT Void String Identity (Tokens String)
forall e s (m :: * -> *).
MonadParsec e s m =>
m () -> Tokens s -> m (Tokens s)
L.symbol ParsecT Void String Identity ()
ws

lexeme :: Parser a -> Parser a
lexeme :: Parser a -> Parser a
lexeme = ParsecT Void String Identity () -> Parser a -> Parser a
forall e s (m :: * -> *) a. MonadParsec e s m => m () -> m a -> m a
L.lexeme ParsecT Void String Identity ()
ws

ws :: Parser ()
ws :: ParsecT Void String Identity ()
ws = Parser String -> ParsecT Void String Identity ()
forall (f :: * -> *) a. Functor f => f a -> f ()
void (Parser String -> ParsecT Void String Identity ())
-> Parser String -> ParsecT Void String Identity ()
forall a b. (a -> b) -> a -> b
$ FORM
-> (Token String -> Bool)
-> ParsecT Void String Identity (Tokens String)
forall e s (m :: * -> *).
MonadParsec e s m =>
FORM -> (Token s -> Bool) -> m (Tokens s)
TM.takeWhileP (String -> FORM
forall a. a -> Maybe a
Just String
"space") (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
== Char
' ')

---
-- customizable parser
data ParserC = ParserC
  { ParserC -> Parser Comment
_commentP :: Parser Comment
  , ParserC -> Parser ID
_idP      :: Parser ID
  , ParserC -> Parser FORM
_formP    :: Parser FORM
  , ParserC -> Parser FORM
_lemmaP   :: Parser LEMMA
  , ParserC -> Parser UPOS
_upostagP :: Parser UPOS
  , ParserC -> Parser FORM
_xpostagP :: Parser XPOS
  , ParserC -> Parser FEATS
_featsP   :: Parser FEATS
  , ParserC -> Parser DEPREL
_deprelP  :: Parser DEPREL
  , ParserC -> Parser DEPS
_depsP    :: Parser DEPS
  , ParserC -> Parser FORM
_miscP    :: Parser MISC
  } deriving ()

customC :: ParserC
customC :: ParserC
customC = ParserC :: Parser Comment
-> Parser ID
-> Parser FORM
-> Parser FORM
-> Parser UPOS
-> Parser FORM
-> Parser FEATS
-> Parser DEPREL
-> Parser DEPS
-> Parser FORM
-> ParserC
ParserC
  { $sel:_commentP:ParserC :: Parser Comment
_commentP = Parser Comment
comment
  , $sel:_idP:ParserC :: Parser ID
_idP      = Parser ID
idW
  , $sel:_formP:ParserC :: Parser FORM
_formP    = Parser FORM
form
  , $sel:_lemmaP:ParserC :: Parser FORM
_lemmaP   = Parser FORM
lemma
  , $sel:_upostagP:ParserC :: Parser UPOS
_upostagP = Parser UPOS
upos
  , $sel:_xpostagP:ParserC :: Parser FORM
_xpostagP = Parser FORM
xpos
  , $sel:_featsP:ParserC :: Parser FEATS
_featsP   = Parser FEATS
feats
  , $sel:_deprelP:ParserC :: Parser DEPREL
_deprelP  = Parser DEPREL
deprel
  , $sel:_depsP:ParserC :: Parser DEPS
_depsP    = Parser DEPS
deps
  , $sel:_miscP:ParserC :: Parser FORM
_miscP    = Parser FORM
misc
  }

parserC :: ParserC -> Parser Sent
-- | defines a custom parser of sentences. if you only need to
-- customize one field parser (e.g., to parse special comments or a
-- special MISC field), you can do:
--
-- @
-- parserC ParserC{_commentP = myCommentsParser }
-- @
parserC :: ParserC -> Parser Sent
parserC ParserC
p =
  let i :: Parser ID
i  = ParserC -> Parser ID
_idP ParserC
p
      f :: Parser FORM
f  = ParserC -> Parser FORM
_formP ParserC
p
      l :: Parser FORM
l  = ParserC -> Parser FORM
_lemmaP ParserC
p
      up :: Parser UPOS
up = ParserC -> Parser UPOS
_upostagP ParserC
p
      xp :: Parser FORM
xp = ParserC -> Parser FORM
_xpostagP ParserC
p
      fs :: Parser FEATS
fs = ParserC -> Parser FEATS
_featsP ParserC
p
      dr :: Parser DEPREL
dr = ParserC -> Parser DEPREL
_deprelP ParserC
p
      ds :: Parser DEPS
ds = ParserC -> Parser DEPS
_depsP ParserC
p
      m :: Parser FORM
m  = ParserC -> Parser FORM
_miscP ParserC
p
      c :: Parser Comment
c  = ParserC -> Parser Comment
_commentP ParserC
p
      w :: Parser (CW AW)
w  = Parser ID
-> Parser FORM
-> Parser FORM
-> Parser UPOS
-> Parser FORM
-> Parser FEATS
-> Parser DEPREL
-> Parser DEPS
-> Parser FORM
-> Parser (CW AW)
wordC Parser ID
i Parser FORM
f Parser FORM
l Parser UPOS
up Parser FORM
xp Parser FEATS
fs Parser DEPREL
dr Parser DEPS
ds Parser FORM
m
      s :: Parser Sent
s  = Parser Comment -> Parser (CW AW) -> Parser Sent
sentenceC Parser Comment
c Parser (CW AW)
w
  in Parser Sent
s

---
-- parse
parseConlluWith
  :: Parser Sent -- ^ the sentence parser to be used.
  -> FilePath    -- ^ the source whose stream is being supplied in the
                 -- next argument (may be "" for no file)
  -> String      -- ^ stream to be parsed
  -> Either String Doc
-- | parse a CoNLL-U document using a customized parser.
parseConlluWith :: Parser Sent -> String -> String -> Either String Doc
parseConlluWith Parser Sent
p String
fp String
s =
  case Parser (RawData String Void)
-> String
-> String
-> Either (ParseErrorBundle String Void) (RawData String Void)
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
TM.parse Parser (RawData String Void)
doc String
fp String
s of
    Left ParseErrorBundle String Void
err -> String -> Either String Doc
forall a b. a -> Either a b
Left (String -> Either String Doc) -> String -> Either String Doc
forall a b. (a -> b) -> a -> b
$ ParseErrorBundle String Void -> String
forall s e.
(VisualStream s, TraversableStream s, ShowErrorComponent e) =>
ParseErrorBundle s e -> String
TM.errorBundlePretty ParseErrorBundle String Void
err
    Right RawData String Void
d  ->
      let ([ParseError String Void]
ls, Doc
rs) = RawData String Void -> ([ParseError String Void], Doc)
forall a b. [Either a b] -> ([a], [b])
partitionEithers RawData String Void
d
      in if [ParseError String Void] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [ParseError String Void]
ls
           then Doc -> Either String Doc
forall a b. b -> Either a b
Right Doc
rs
           else String -> Either String Doc
forall a b. a -> Either a b
Left (String -> Either String Doc) -> String -> Either String Doc
forall a b. (a -> b) -> a -> b
$ (ParseError String Void -> String)
-> [ParseError String Void] -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ParseError String Void -> String
forall s e.
(VisualStream s, ShowErrorComponent e) =>
ParseError s e -> String
TM.parseErrorPretty [ParseError String Void]
ls
  where
    doc :: Parser (RawData String Void)
doc = Parser Sent -> Parser (RawData String Void)
rawSentsC Parser Sent
p


parseConllu :: FilePath -> String -> Either String Doc
-- | parse a CoNLL-U document using the default parser.
parseConllu :: String -> String -> Either String Doc
parseConllu = Parser Sent -> String -> String -> Either String Doc
parseConlluWith Parser Sent
sentence