{-# LANGUAGE BlockArguments #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ViewPatterns #-}

{-| This module contains types and functions for working with the MDF
  dictionary format, used by programs such as [SIL Toolbox](https://software.sil.org/toolbox/).
  For more on the MDF format, refer to e.g.
  [Coward & Grimes (2000), /Making Dictionaries: A guide to lexicography and the Multi-Dictionary Formatter/](http://downloads.sil.org/legacy/shoebox/MDF_2000.pdf).
-}
module Brassica.MDF
       (
       -- * MDF files

         MDF(..)
       , MDFLanguage(..)
       , fieldLangs
       -- * Parsing

       , parseMDFRaw
       , parseMDFWithTokenisation
       -- ** Re-export

       , errorBundlePretty
       -- * Conversion

       , componentiseMDF
       , componentiseMDFWordsOnly
       , duplicateEtymologies
       ) where

import Control.Category ((>>>))
import Data.Char (isSpace)
import Data.Void (Void)

import qualified Data.Map as M
import Text.Megaparsec
import Text.Megaparsec.Char

import Brassica.SoundChange.Tokenise
import Brassica.SoundChange.Types (Grapheme, PWord)
import Data.Maybe (fromMaybe)

-- | An MDF (Multi-Dictionary Formatter) file, represented as a list

-- of (field marker, whitespace, field value) tuples. The field marker

-- is represented excluding its initial slash; whitespace after the

-- field marker is also stored, allowing the original MDF file to be

-- precisely recovered. Field values should includes all whitespace to

-- the next marker. All field values are stored as 'String's, with the

-- exception of 'Vernacular' fields, which have type @v@.

--

-- For instance, the following MDF file:

--

-- > \lx kapa

-- > \ps n

-- > \ge parent

-- > \se sakapa

-- > \ge father

--

-- Could be stored as:

--

-- > MDF [ ("lx", " ", Right "kapa\n")

-- >     , ("ps", " ", Left "n\n")

-- >     , ("ge", " ", Left "parent\n")

-- >     , ("se", " ", Right "sakapa\n")

-- >     , ("ge", " ", Left "father")

-- >     ]

newtype MDF v = MDF { forall v. MDF v -> [(String, String, Either String v)]
unMDF :: [(String, String, Either String v)] }
    deriving (Int -> MDF v -> ShowS
forall v. Show v => Int -> MDF v -> ShowS
forall v. Show v => [MDF v] -> ShowS
forall v. Show v => MDF v -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [MDF v] -> ShowS
$cshowList :: forall v. Show v => [MDF v] -> ShowS
show :: MDF v -> String
$cshow :: forall v. Show v => MDF v -> String
showsPrec :: Int -> MDF v -> ShowS
$cshowsPrec :: forall v. Show v => Int -> MDF v -> ShowS
Show, forall a b. a -> MDF b -> MDF a
forall a b. (a -> b) -> MDF a -> MDF b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
<$ :: forall a b. a -> MDF b -> MDF a
$c<$ :: forall a b. a -> MDF b -> MDF a
fmap :: forall a b. (a -> b) -> MDF a -> MDF b
$cfmap :: forall a b. (a -> b) -> MDF a -> MDF b
Functor)

type Parser = Parsec Void String

sc :: Parser String
sc :: Parser String
sc = forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap (forall a. a -> Maybe a -> a
fromMaybe String
"") forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) a. Alternative f => f a -> f (Maybe a)
optional forall a b. (a -> b) -> a -> b
$ forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P (forall a. a -> Maybe a
Just String
"white space") Char -> Bool
isSpace

parseToSlash :: Parser String
parseToSlash :: Parser String
parseToSlash = forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhileP (forall a. a -> Maybe a
Just String
"field value") (forall a. Eq a => a -> a -> Bool
/= Char
'\\')

entry :: Parser v -> Parser (String, String, Either String v)
entry :: forall v. Parser v -> Parser (String, String, Either String v)
entry Parser v
pv = do
    Token String
_ <- forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
'\\'
    String
marker <- forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P (forall a. a -> Maybe a
Just String
"field name") (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
isSpace)
    String
s <- Parser String
sc
    Either String v
value <- case forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup String
marker Map String MDFLanguage
fieldLangs of
        Just MDFLanguage
Vernacular -> forall a b. b -> Either a b
Right forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser v
pv
        Maybe MDFLanguage
_ -> forall a b. a -> Either a b
Left forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Parser String
parseToSlash
    forall (f :: * -> *) a. Applicative f => a -> f a
pure (String
marker, String
s, Either String v
value)

-- | Parse an MDF file to an 'MDF', storing the 'Vernacular' fields as 'String's.

parseMDFRaw :: String -> Either (ParseErrorBundle String Void) (MDF String)
parseMDFRaw :: String -> Either (ParseErrorBundle String Void) (MDF String)
parseMDFRaw = forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
runParser (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall v. [(String, String, Either String v)] -> MDF v
MDF forall a b. (a -> b) -> a -> b
$ Parser String
sc forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many (forall v. Parser v -> Parser (String, String, Either String v)
entry Parser String
parseToSlash) forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall e s (m :: * -> *). MonadParsec e s m => m ()
eof) String
""

-- | Parse an MDF file to an 'MDF', parsing the 'Vernacular' fields

-- into 'Component's in the process.

parseMDFWithTokenisation
    :: [Grapheme]
    -> String
    -> Either (ParseErrorBundle String Void) (MDF [Component PWord])
parseMDFWithTokenisation :: [String]
-> String
-> Either (ParseErrorBundle String Void) (MDF [Component [String]])
parseMDFWithTokenisation (forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) =
    forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
runParser (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap forall v. [(String, String, Either String v)] -> MDF v
MDF forall a b. (a -> b) -> a -> b
$ Parser String
sc forall (f :: * -> *) a b. Applicative f => f a -> f b -> f b
*> ParsecT
  Void
  String
  Identity
  [(String, String, Either String [Component [String]])]
p forall (f :: * -> *) a b. Applicative f => f a -> f b -> f a
<* forall e s (m :: * -> *). MonadParsec e s m => m ()
eof) String
""
  where
    p :: ParsecT
  Void
  String
  Identity
  [(String, String, Either String [Component [String]])]
p = forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many forall a b. (a -> b) -> a -> b
$ forall v. Parser v -> Parser (String, String, Either String v)
entry forall a b. (a -> b) -> a -> b
$ forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser forall a b. (a -> b) -> a -> b
$ String -> [String] -> ParsecT Void String Identity [String]
wordParser String
"\\" [String]
gs

-- | Convert an 'MDF' to a list of 'Component's representing the same

-- textual content. Vernacular field values are left as is; everything

-- else is treated as a 'Separator', so that it is not disturbed by

-- operations such as rule application or rendering to text.

componentiseMDF :: MDF [Component a] -> [Component a]
componentiseMDF :: forall a. MDF [Component a] -> [Component a]
componentiseMDF = forall v. MDF v -> [(String, String, Either String v)]
unMDF forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap \case
    (String
m, String
s, Left String
v) -> [forall a. String -> Component a
Separator (Char
'\\'forall a. a -> [a] -> [a]
:String
m forall a. [a] -> [a] -> [a]
++ String
s forall a. [a] -> [a] -> [a]
++ String
v)]
    (String
m, String
s, Right [Component a]
v) -> forall a. String -> Component a
Separator (Char
'\\'forall a. a -> [a] -> [a]
:String
m forall a. [a] -> [a] -> [a]
++ String
s) forall a. a -> [a] -> [a]
: [Component a]
v

-- | As with 'componentiseMDF', but the resulting 'Component's contain

-- the contents of 'Vernacular' fields only; all else is

-- discarded. The first parameter specifies the 'Separator' to insert

-- after each vernacular field.

componentiseMDFWordsOnly :: MDF [Component a] -> [Component a]
componentiseMDFWordsOnly :: forall a. MDF [Component a] -> [Component a]
componentiseMDFWordsOnly = forall v. MDF v -> [(String, String, Either String v)]
unMDF forall {k} (cat :: k -> k -> *) (a :: k) (b :: k) (c :: k).
Category cat =>
cat a b -> cat b c -> cat a c
>>> forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap \case
    (String
_, String
_, Right [Component a]
v) -> [Component a]
v
    (String, String, Either String [Component a])
_ -> []

-- | Add etymological fields to an 'MDF' by duplicating the values in

-- @\lx@, @\se@ and @\ge@ fields. e.g.:

--

-- > \lx kapa

-- > \ps n

-- > \ge parent

-- > \se sakapa

-- > \ge father

--

-- Would become:

--

-- > \lx kapa

-- > \ps n

-- > \ge parent

-- > \et kapa

-- > \eg parent

-- > \se sakapa

-- > \ge father

-- > \et sakapa

-- > \eg father

--

-- This can be helpful when applying sound changes to an MDF file: the

-- vernacular words can be copied as etymologies, and then the sound

-- changes can be applied leaving the etymologies as is.

duplicateEtymologies
    :: (v -> String)
    -- ^ Function to convert from vernacular field values to

    -- strings. Can also be used to preprocess the value of the

    -- resulting @\et@ fields, e.g. by prepending @*@ or similar.

    -> MDF v -> MDF v
duplicateEtymologies :: forall v. (v -> String) -> MDF v -> MDF v
duplicateEtymologies v -> String
typeset = forall v. [(String, String, Either String v)] -> MDF v
MDF forall b c a. (b -> c) -> (a -> b) -> a -> c
. Maybe v
-> Maybe String
-> [(String, String, Either String v)]
-> [(String, String, Either String v)]
go forall a. Maybe a
Nothing forall a. Maybe a
Nothing forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall v. MDF v -> [(String, String, Either String v)]
unMDF
  where
    mkEt :: Maybe v -> Maybe String -> [(String, String, Either String b)]
mkEt Maybe v
word Maybe String
gloss = forall {b}.
[(String, String, Either String b)]
-> [(String, String, Either String b)]
word' forall {b}. [(String, String, Either String b)]
gloss'
      where
        word' :: [(String, String, Either String b)]
-> [(String, String, Either String b)]
word' = case Maybe v
word of
            Just v
et -> ((String
"et", String
" ", forall a b. a -> Either a b
Left forall a b. (a -> b) -> a -> b
$ v -> String
typeset v
et) forall a. a -> [a] -> [a]
:)
            Maybe v
Nothing -> forall a. a -> a
id
        gloss' :: [(String, String, Either String b)]
gloss' = case Maybe String
gloss of
            Just String
eg -> [(String
"eg", String
" ", forall a b. a -> Either a b
Left String
eg)]
            Maybe String
Nothing -> []

    go :: Maybe v
-> Maybe String
-> [(String, String, Either String v)]
-> [(String, String, Either String v)]
go Maybe v
word Maybe String
gloss [] = forall {b}.
Maybe v -> Maybe String -> [(String, String, Either String b)]
mkEt Maybe v
word Maybe String
gloss
    go Maybe v
word Maybe String
_ (f :: (String, String, Either String v)
f@(String
"ge", String
_, Left String
gloss'):[(String, String, Either String v)]
fs)                  -- store gloss field for future etymology

        = (String, String, Either String v)
f forall a. a -> [a] -> [a]
: Maybe v
-> Maybe String
-> [(String, String, Either String v)]
-> [(String, String, Either String v)]
go Maybe v
word (forall a. a -> Maybe a
Just String
gloss') [(String, String, Either String v)]
fs
    go Maybe v
word Maybe String
gloss (f :: (String, String, Either String v)
f@(String
m, String
_, Right v
word'):[(String, String, Either String v)]
fs)                 -- add etymology & store word if word or subentry field reached

        | String
m forall a. Eq a => a -> a -> Bool
== String
"lx" Bool -> Bool -> Bool
|| String
m forall a. Eq a => a -> a -> Bool
== String
"se"
        = forall {b}.
Maybe v -> Maybe String -> [(String, String, Either String b)]
mkEt Maybe v
word Maybe String
gloss forall a. [a] -> [a] -> [a]
++ (String, String, Either String v)
f forall a. a -> [a] -> [a]
: Maybe v
-> Maybe String
-> [(String, String, Either String v)]
-> [(String, String, Either String v)]
go (forall a. a -> Maybe a
Just v
word') forall a. Maybe a
Nothing [(String, String, Either String v)]
fs
    go Maybe v
word Maybe String
gloss (f :: (String, String, Either String v)
f@(String
"dt", String
_, Either String v
_):[(String, String, Either String v)]
fs)                        -- add etymology if date (usually final field in entry) reached

        = forall {b}.
Maybe v -> Maybe String -> [(String, String, Either String b)]
mkEt Maybe v
word Maybe String
gloss forall a. [a] -> [a] -> [a]
++ (String, String, Either String v)
f forall a. a -> [a] -> [a]
: Maybe v
-> Maybe String
-> [(String, String, Either String v)]
-> [(String, String, Either String v)]
go forall a. Maybe a
Nothing forall a. Maybe a
Nothing [(String, String, Either String v)]
fs
    go Maybe v
word Maybe String
gloss ((String, String, Either String v)
f:[(String, String, Either String v)]
fs) = (String, String, Either String v)
f forall a. a -> [a] -> [a]
: Maybe v
-> Maybe String
-> [(String, String, Either String v)]
-> [(String, String, Either String v)]
go Maybe v
word Maybe String
gloss [(String, String, Either String v)]
fs
    

-- | The designated language of an MDF field.

data MDFLanguage = English | National | Regional | Vernacular | Other
    deriving (MDFLanguage -> MDFLanguage -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: MDFLanguage -> MDFLanguage -> Bool
$c/= :: MDFLanguage -> MDFLanguage -> Bool
== :: MDFLanguage -> MDFLanguage -> Bool
$c== :: MDFLanguage -> MDFLanguage -> Bool
Eq, Int -> MDFLanguage -> ShowS
[MDFLanguage] -> ShowS
MDFLanguage -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [MDFLanguage] -> ShowS
$cshowList :: [MDFLanguage] -> ShowS
show :: MDFLanguage -> String
$cshow :: MDFLanguage -> String
showsPrec :: Int -> MDFLanguage -> ShowS
$cshowsPrec :: Int -> MDFLanguage -> ShowS
Show)

-- | A 'M.Map' from the most common field markers to the language of

-- their values.

--

-- (Note: This is currently hardcoded in the source code, based on the

-- values in the MDF definitions from SIL Toolbox. There’s probably a

-- more principled way of defining this, but hardcoding should suffice

-- for now.)

fieldLangs :: M.Map String MDFLanguage
fieldLangs :: Map String MDFLanguage
fieldLangs = forall k a. Ord k => [(k, a)] -> Map k a
M.fromList
    [ (String
"1d" , MDFLanguage
Vernacular) , (String
"1e" , MDFLanguage
Vernacular) , (String
"1i" , MDFLanguage
Vernacular)
    , (String
"1p" , MDFLanguage
Vernacular) , (String
"1s" , MDFLanguage
Vernacular) , (String
"2d" , MDFLanguage
Vernacular)
    , (String
"2p" , MDFLanguage
Vernacular) , (String
"2s" , MDFLanguage
Vernacular) , (String
"3d" , MDFLanguage
Vernacular)
    , (String
"3p" , MDFLanguage
Vernacular) , (String
"3s" , MDFLanguage
Vernacular) , (String
"4d" , MDFLanguage
Vernacular)
    , (String
"4p" , MDFLanguage
Vernacular) , (String
"4s" , MDFLanguage
Vernacular) , (String
"a"  , MDFLanguage
Vernacular)
    , (String
"an" , MDFLanguage
Vernacular) , (String
"bb" , MDFLanguage
English)    , (String
"bw" , MDFLanguage
English)
    , (String
"ce" , MDFLanguage
English)    , (String
"cf" , MDFLanguage
Vernacular) , (String
"cn" , MDFLanguage
National)
    , (String
"cr" , MDFLanguage
National)   , (String
"de" , MDFLanguage
English)    , (String
"dn" , MDFLanguage
National)
    , (String
"dr" , MDFLanguage
Regional)   , (String
"dt" , MDFLanguage
Other)      , (String
"dv" , MDFLanguage
Vernacular)
    , (String
"ec" , MDFLanguage
English)    , (String
"ee" , MDFLanguage
English)    , (String
"eg" , MDFLanguage
English)
    , (String
"en" , MDFLanguage
National)   , (String
"er" , MDFLanguage
Regional)   , (String
"es" , MDFLanguage
English)
    , (String
"et" , MDFLanguage
Other)  {- defined as vernacular in SIL Toolbox, but by
                         definition it's really a different language -}
    , (String
"ev" , MDFLanguage
Vernacular) , (String
"ge" , MDFLanguage
English)
    , (String
"gn" , MDFLanguage
National)   , (String
"gr" , MDFLanguage
Regional)   , (String
"gv" , MDFLanguage
Vernacular)
    , (String
"hm" , MDFLanguage
English)    , (String
"is" , MDFLanguage
English)    , (String
"lc" , MDFLanguage
Vernacular)
    , (String
"le" , MDFLanguage
English)    , (String
"lf" , MDFLanguage
English)    , (String
"ln" , MDFLanguage
National)
    , (String
"lr" , MDFLanguage
Regional)   , (String
"lt" , MDFLanguage
English)    , (String
"lv" , MDFLanguage
Vernacular)
    , (String
"lx" , MDFLanguage
Vernacular) , (String
"mn" , MDFLanguage
Vernacular) , (String
"mr" , MDFLanguage
Vernacular)
    , (String
"na" , MDFLanguage
English)    , (String
"nd" , MDFLanguage
English)    , (String
"ng" , MDFLanguage
English)
    , (String
"np" , MDFLanguage
English)    , (String
"nq" , MDFLanguage
English)    , (String
"ns" , MDFLanguage
English)
    , (String
"nt" , MDFLanguage
English)    , (String
"oe" , MDFLanguage
English)    , (String
"on" , MDFLanguage
National)
    , (String
"or" , MDFLanguage
Regional)   , (String
"ov" , MDFLanguage
Vernacular) , (String
"pc" , MDFLanguage
English)
    , (String
"pd" , MDFLanguage
English)    , (String
"pde", MDFLanguage
English)    , (String
"pdl", MDFLanguage
English)
    , (String
"pdn", MDFLanguage
National)   , (String
"pdr", MDFLanguage
Regional)   , (String
"pdv", MDFLanguage
Vernacular)
    , (String
"ph" , MDFLanguage
Other)      , (String
"pl" , MDFLanguage
Vernacular) , (String
"pn" , MDFLanguage
National)
    , (String
"ps" , MDFLanguage
English)    , (String
"rd" , MDFLanguage
Vernacular) , (String
"re" , MDFLanguage
English)
    , (String
"rf" , MDFLanguage
English)    , (String
"rn" , MDFLanguage
National)   , (String
"rr" , MDFLanguage
Regional)
    , (String
"sc" , MDFLanguage
English)    , (String
"sd" , MDFLanguage
English)    , (String
"se" , MDFLanguage
Vernacular)
    , (String
"sg" , MDFLanguage
Vernacular) , (String
"sn" , MDFLanguage
English)    , (String
"so" , MDFLanguage
English)
    , (String
"st" , MDFLanguage
English)    , (String
"sy" , MDFLanguage
Vernacular) , (String
"tb" , MDFLanguage
English)
    , (String
"th" , MDFLanguage
Vernacular) , (String
"u"  , MDFLanguage
Vernacular) , (String
"ue" , MDFLanguage
English)
    , (String
"un" , MDFLanguage
National)   , (String
"ur" , MDFLanguage
Regional)   , (String
"uv" , MDFLanguage
Vernacular)
    , (String
"va" , MDFLanguage
Vernacular) , (String
"ve" , MDFLanguage
English)    , (String
"vn" , MDFLanguage
National)
    , (String
"vr" , MDFLanguage
Regional)   , (String
"we" , MDFLanguage
English)    , (String
"wn" , MDFLanguage
National)
    , (String
"wr" , MDFLanguage
Regional)   , (String
"xe" , MDFLanguage
English)    , (String
"xn" , MDFLanguage
National)
    , (String
"xr" , MDFLanguage
Regional)   , (String
"xv" , MDFLanguage
Vernacular)
    ]