{-# LANGUAGE DeriveAnyClass    #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE LambdaCase        #-}
{-# LANGUAGE TypeFamilies      #-}
{-# LANGUAGE ViewPatterns      #-}

-- |
-- Module      : Brassica.SoundChange.Tokenise
-- Copyright   : See LICENSE file
-- License     : BSD3
-- Maintainer  : Brad Neimann
--
-- This module provides functions to parse a Brassica words file into
-- its constituent 'Component's, and to tokenise the words in that
-- file into their constituent graphemes to form 'PWord's. It also
-- provides functions to reverse these processes.
module Brassica.SoundChange.Tokenise
       (
       -- * High-level interface
         tokeniseWord
       , Component(..)
       , getWords
       , splitMultipleResults
       , joinComponents
       , tokeniseWords
       , detokeniseWords'
       , detokeniseWords
       , findFirstCategoriesDecl
       , withFirstCategoriesDecl
       -- * Lower-level functions
       , wordParser
       , componentsParser
       , sortByDescendingLength
       ) where

import Data.Char (isSpace)
import Data.Function (on)
import Data.Functor.Identity
import Data.List (intersperse, sortBy)
import Data.Maybe (mapMaybe)
import Data.Ord (Down(..))
import Data.Void (Void)
import GHC.Generics (Generic)

import Control.DeepSeq (NFData)
import Text.Megaparsec hiding (State)
import Text.Megaparsec.Char

import Brassica.SoundChange.Types

-- | Represents a component of a Brassica words file. Each word in the
-- input has type @a@ (often 'PWord' or @['PWord']@).
data Component a
    = Word a            -- ^ An input word to which sound changes will be applied
    | Separator String  -- ^ A separator, e.g. whitespace
    | Gloss String      -- ^ A gloss (in Brassica syntax, between square brackets)
    deriving (Component a -> Component a -> Bool
(Component a -> Component a -> Bool)
-> (Component a -> Component a -> Bool) -> Eq (Component a)
forall a. Eq a => Component a -> Component a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: forall a. Eq a => Component a -> Component a -> Bool
== :: Component a -> Component a -> Bool
$c/= :: forall a. Eq a => Component a -> Component a -> Bool
/= :: Component a -> Component a -> Bool
Eq, Int -> Component a -> ShowS
[Component a] -> ShowS
Component a -> String
(Int -> Component a -> ShowS)
-> (Component a -> String)
-> ([Component a] -> ShowS)
-> Show (Component a)
forall a. Show a => Int -> Component a -> ShowS
forall a. Show a => [Component a] -> ShowS
forall a. Show a => Component a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: forall a. Show a => Int -> Component a -> ShowS
showsPrec :: Int -> Component a -> ShowS
$cshow :: forall a. Show a => Component a -> String
show :: Component a -> String
$cshowList :: forall a. Show a => [Component a] -> ShowS
showList :: [Component a] -> ShowS
Show, (forall a b. (a -> b) -> Component a -> Component b)
-> (forall a b. a -> Component b -> Component a)
-> Functor Component
forall a b. a -> Component b -> Component a
forall a b. (a -> b) -> Component a -> Component b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
$cfmap :: forall a b. (a -> b) -> Component a -> Component b
fmap :: forall a b. (a -> b) -> Component a -> Component b
$c<$ :: forall a b. a -> Component b -> Component a
<$ :: forall a b. a -> Component b -> Component a
Functor, (forall m. Monoid m => Component m -> m)
-> (forall m a. Monoid m => (a -> m) -> Component a -> m)
-> (forall m a. Monoid m => (a -> m) -> Component a -> m)
-> (forall a b. (a -> b -> b) -> b -> Component a -> b)
-> (forall a b. (a -> b -> b) -> b -> Component a -> b)
-> (forall b a. (b -> a -> b) -> b -> Component a -> b)
-> (forall b a. (b -> a -> b) -> b -> Component a -> b)
-> (forall a. (a -> a -> a) -> Component a -> a)
-> (forall a. (a -> a -> a) -> Component a -> a)
-> (forall a. Component a -> [a])
-> (forall a. Component a -> Bool)
-> (forall a. Component a -> Int)
-> (forall a. Eq a => a -> Component a -> Bool)
-> (forall a. Ord a => Component a -> a)
-> (forall a. Ord a => Component a -> a)
-> (forall a. Num a => Component a -> a)
-> (forall a. Num a => Component a -> a)
-> Foldable Component
forall a. Eq a => a -> Component a -> Bool
forall a. Num a => Component a -> a
forall a. Ord a => Component a -> a
forall m. Monoid m => Component m -> m
forall a. Component a -> Bool
forall a. Component a -> Int
forall a. Component a -> [a]
forall a. (a -> a -> a) -> Component a -> a
forall m a. Monoid m => (a -> m) -> Component a -> m
forall b a. (b -> a -> b) -> b -> Component a -> b
forall a b. (a -> b -> b) -> b -> Component a -> b
forall (t :: * -> *).
(forall m. Monoid m => t m -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. t a -> [a])
-> (forall a. t a -> Bool)
-> (forall a. t a -> Int)
-> (forall a. Eq a => a -> t a -> Bool)
-> (forall a. Ord a => t a -> a)
-> (forall a. Ord a => t a -> a)
-> (forall a. Num a => t a -> a)
-> (forall a. Num a => t a -> a)
-> Foldable t
$cfold :: forall m. Monoid m => Component m -> m
fold :: forall m. Monoid m => Component m -> m
$cfoldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
foldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
foldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldr1 :: forall a. (a -> a -> a) -> Component a -> a
foldr1 :: forall a. (a -> a -> a) -> Component a -> a
$cfoldl1 :: forall a. (a -> a -> a) -> Component a -> a
foldl1 :: forall a. (a -> a -> a) -> Component a -> a
$ctoList :: forall a. Component a -> [a]
toList :: forall a. Component a -> [a]
$cnull :: forall a. Component a -> Bool
null :: forall a. Component a -> Bool
$clength :: forall a. Component a -> Int
length :: forall a. Component a -> Int
$celem :: forall a. Eq a => a -> Component a -> Bool
elem :: forall a. Eq a => a -> Component a -> Bool
$cmaximum :: forall a. Ord a => Component a -> a
maximum :: forall a. Ord a => Component a -> a
$cminimum :: forall a. Ord a => Component a -> a
minimum :: forall a. Ord a => Component a -> a
$csum :: forall a. Num a => Component a -> a
sum :: forall a. Num a => Component a -> a
$cproduct :: forall a. Num a => Component a -> a
product :: forall a. Num a => Component a -> a
Foldable, Functor Component
Foldable Component
Functor Component
-> Foldable Component
-> (forall (f :: * -> *) a b.
    Applicative f =>
    (a -> f b) -> Component a -> f (Component b))
-> (forall (f :: * -> *) a.
    Applicative f =>
    Component (f a) -> f (Component a))
-> (forall (m :: * -> *) a b.
    Monad m =>
    (a -> m b) -> Component a -> m (Component b))
-> (forall (m :: * -> *) a.
    Monad m =>
    Component (m a) -> m (Component a))
-> Traversable Component
forall (t :: * -> *).
Functor t
-> Foldable t
-> (forall (f :: * -> *) a b.
    Applicative f =>
    (a -> f b) -> t a -> f (t b))
-> (forall (f :: * -> *) a. Applicative f => t (f a) -> f (t a))
-> (forall (m :: * -> *) a b.
    Monad m =>
    (a -> m b) -> t a -> m (t b))
-> (forall (m :: * -> *) a. Monad m => t (m a) -> m (t a))
-> Traversable t
forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
$ctraverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
traverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
$csequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
sequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
$cmapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
mapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
$csequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
sequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
Traversable, (forall x. Component a -> Rep (Component a) x)
-> (forall x. Rep (Component a) x -> Component a)
-> Generic (Component a)
forall x. Rep (Component a) x -> Component a
forall x. Component a -> Rep (Component a) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall a x. Rep (Component a) x -> Component a
forall a x. Component a -> Rep (Component a) x
$cfrom :: forall a x. Component a -> Rep (Component a) x
from :: forall x. Component a -> Rep (Component a) x
$cto :: forall a x. Rep (Component a) x -> Component a
to :: forall x. Rep (Component a) x -> Component a
Generic, Component a -> ()
(Component a -> ()) -> NFData (Component a)
forall a. NFData a => Component a -> ()
forall a. (a -> ()) -> NFData a
$crnf :: forall a. NFData a => Component a -> ()
rnf :: Component a -> ()
NFData)

-- | Flatten a nested list of 'Component's.
joinComponents :: [Component [Component a]] -> [Component a]
joinComponents :: forall a. [Component [Component a]] -> [Component a]
joinComponents = (Component [Component a] -> [Component a])
-> [Component [Component a]] -> [Component a]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Component [Component a] -> [Component a]
forall {a}. Component [Component a] -> [Component a]
go
  where
    go :: Component [Component a] -> [Component a]
go (Word [Component a]
cs) = [Component a]
cs
    go (Separator String
s) = [String -> Component a
forall a. String -> Component a
Separator String
s]
    go (Gloss String
s) = [String -> Component a
forall a. String -> Component a
Gloss String
s]

-- | Given a tokenised input string, return only the v'Word's within
-- it.
getWords :: [Component a] -> [a]
getWords :: forall a. [Component a] -> [a]
getWords = (Component a -> Maybe a) -> [Component a] -> [a]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe ((Component a -> Maybe a) -> [Component a] -> [a])
-> (Component a -> Maybe a) -> [Component a] -> [a]
forall a b. (a -> b) -> a -> b
$ \case
    Word a
a -> a -> Maybe a
forall a. a -> Maybe a
Just a
a
    Component a
_ -> Maybe a
forall a. Maybe a
Nothing

-- | Given a 'Component' containing multiple values in a v'Word',
-- split it apart into a list of 'Component's in which the given
-- 'String' is used as a 'Separator' between multiple results.
--
-- For instance:
--
-- >>> splitMultipleResults "/" (Word ["abc", "def", "ghi"])
-- [Word "abc", Separator "/", Word "def", Separator "/", Word "ghi"]
--
-- >>> splitMultipleResults " " (Word ["abc"])
-- [Word "abc"]
splitMultipleResults :: String -> Component [a] -> [Component a]
splitMultipleResults :: forall a. String -> Component [a] -> [Component a]
splitMultipleResults String
wh (Word [a]
as) = Component a -> [Component a] -> [Component a]
forall a. a -> [a] -> [a]
intersperse (String -> Component a
forall a. String -> Component a
Separator String
wh) ([Component a] -> [Component a]) -> [Component a] -> [Component a]
forall a b. (a -> b) -> a -> b
$ a -> Component a
forall a. a -> Component a
Word (a -> Component a) -> [a] -> [Component a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [a]
as
splitMultipleResults String
_ (Separator String
w) = [String -> Component a
forall a. String -> Component a
Separator String
w]
splitMultipleResults String
_ (Gloss String
g) = [String -> Component a
forall a. String -> Component a
Gloss String
g]

-- | Megaparsec parser for 'PWord's — see 'tokeniseWord' documentation
-- for details on the parsing strategy. For most usecases
-- 'tokeniseWord' should suffice; 'wordParser' itself is only really
-- useful in unusual situations (e.g. as part of a larger parser).
--
-- The first parameter gives a list of characters aside from
-- whitespace which should be excluded from words, i.e. the parser
-- will stop if any of them are found. The second gives a list of
-- multigraphs which might be expected, as with 'tokeniseWord'.
--
-- Note: the second parameter __must__ be already be sorted by descending length;
-- otherwise multigraphs will not be parsed correctly (i.e. greedily).
wordParser :: [Char] -> [String] -> ParsecT Void String Identity PWord
wordParser :: String -> [String] -> ParsecT Void String Identity [String]
wordParser String
excludes [String]
gs = ParsecT Void String Identity String
-> ParsecT Void String Identity [String]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
some (ParsecT Void String Identity String
 -> ParsecT Void String Identity [String])
-> ParsecT Void String Identity String
-> ParsecT Void String Identity [String]
forall a b. (a -> b) -> a -> b
$
    (String
"#" String
-> ParsecT Void String Identity (Token String)
-> ParsecT Void String Identity String
forall a b.
a
-> ParsecT Void String Identity b -> ParsecT Void String Identity a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Token String -> ParsecT Void String Identity (Token String)
forall e s (m :: * -> *).
MonadParsec e s m =>
Token s -> m (Token s)
single Char
Token String
'#')
    ParsecT Void String Identity String
-> ParsecT Void String Identity String
-> ParsecT Void String Identity String
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> [ParsecT Void String Identity String]
-> ParsecT Void String Identity String
forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
choice (String -> ParsecT Void String Identity String
Tokens String -> ParsecT Void String Identity (Tokens String)
forall e s (m :: * -> *).
MonadParsec e s m =>
Tokens s -> m (Tokens s)
chunk (String -> ParsecT Void String Identity String)
-> [String] -> [ParsecT Void String Identity String]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [String]
gs)
    ParsecT Void String Identity String
-> ParsecT Void String Identity String
-> ParsecT Void String Identity String
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (Char -> String
forall a. a -> [a]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char -> String)
-> ParsecT Void String Identity Char
-> ParsecT Void String Identity String
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Token String -> Bool)
-> ParsecT Void String Identity (Token String)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
exclude))
  where
    exclude :: Char -> Bool
exclude = Bool -> Bool -> Bool
(||) (Bool -> Bool -> Bool) -> (Char -> Bool) -> Char -> Bool -> Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Char -> Bool
isSpace (Char -> Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall a b. (Char -> a -> b) -> (Char -> a) -> Char -> b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (Char -> String -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
excludes)

-- | Megaparsec parser for 'Component's. Similarly to 'wordParser',
-- usually it’s easier to use 'tokeniseWords' instead.
componentsParser
    :: ParsecT Void String Identity a  -- ^ Parser for individual words (e.g. 'wordParser')
    -> ParsecT Void String Identity [Component a]
componentsParser :: forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser ParsecT Void String Identity a
p = ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity [Component a]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many (ParsecT Void String Identity (Component a)
 -> ParsecT Void String Identity [Component a])
-> ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity [Component a]
forall a b. (a -> b) -> a -> b
$
    (String -> Component a
forall a. String -> Component a
Separator (String -> Component a)
-> ParsecT Void String Identity String
-> ParsecT Void String Identity (Component a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Maybe String
-> (Token String -> Bool)
-> ParsecT Void String Identity (Tokens String)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P Maybe String
forall a. Maybe a
Nothing Char -> Bool
Token String -> Bool
isSpace) ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (String -> Component a
forall a. String -> Component a
Gloss (String -> Component a)
-> ParsecT Void String Identity String
-> ParsecT Void String Identity (Component a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Bool -> ParsecT Void String Identity String
forall {s} {m :: * -> *} {e}.
(Token s ~ Char, Tokens s ~ String, MonadParsec e s m) =>
Bool -> m String
gloss Bool
False) ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (a -> Component a
forall a. a -> Component a
Word (a -> Component a)
-> ParsecT Void String Identity a
-> ParsecT Void String Identity (Component a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Void String Identity a
p)
  where
    gloss :: Bool -> m String
gloss Bool
returnBracketed = do
        Char
_ <- Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token s
'['
        [String]
contents <- m String -> m [String]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many (m String -> m [String]) -> m String -> m [String]
forall a b. (a -> b) -> a -> b
$ Bool -> m String
gloss Bool
True m String -> m String -> m String
forall a. m a -> m a -> m a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Maybe String -> (Token s -> Bool) -> m (Tokens s)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P Maybe String
forall a. Maybe a
Nothing (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> String -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
"[]"))
        Char
_ <- Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token s
']'
        String -> m String
forall a. a -> m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (String -> m String) -> String -> m String
forall a b. (a -> b) -> a -> b
$ if Bool
returnBracketed
           then Char
'[' Char -> ShowS
forall a. a -> [a] -> [a]
: [String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"]"
           else [String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents

-- | Sort a list of lists by the length of the inner lists, in
-- descending order.
sortByDescendingLength :: [[a]] -> [[a]]
sortByDescendingLength :: forall a. [[a]] -> [[a]]
sortByDescendingLength = ([a] -> [a] -> Ordering) -> [[a]] -> [[a]]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (Down Int -> Down Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Down Int -> Down Int -> Ordering)
-> ([a] -> Down Int) -> [a] -> [a] -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` Int -> Down Int
forall a. a -> Down a
Down (Int -> Down Int) -> ([a] -> Int) -> [a] -> Down Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length)

-- | Tokenise a 'String' input word into a 'PWord' by splitting it up
-- into t'Grapheme's. A list of available multigraphs is supplied as
-- the first argument.
--
-- Note that this tokeniser is greedy: if one of the given
-- multigraphs is a prefix of another, the tokeniser will prefer the
-- longest if possible. If there are no matching multigraphs starting
-- at a particular character in the 'String', 'tokeniseWord' will
-- take that character as forming its own t'Grapheme'. For instance:
--
-- >>> tokeniseWord [] "cherish"
-- Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "s",GMulti "h"]
--
-- >>> tokeniseWord ["e","h","i","r","s","sh"] "cherish"
-- Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]
--
-- >>> tokeniseWord ["c","ch","e","h","i","r","s","sh"] "cherish"
-- Right [GMulti "ch",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]
--
-- The resulting 'PWord' can be converted back to a 'String' using
-- 'concatWithBoundary'. (However, it is not strictly speaking a true
-- inverse as it deletes word boundaries).
tokeniseWord :: [String] -> String -> Either (ParseErrorBundle String Void) PWord
tokeniseWord :: [String]
-> String -> Either (ParseErrorBundle String Void) [String]
tokeniseWord ([String] -> [String]
forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) = ParsecT Void String Identity [String]
-> String
-> String
-> Either (ParseErrorBundle String Void) [String]
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (String -> [String] -> ParsecT Void String Identity [String]
wordParser String
"[" [String]
gs) String
""

-- | Given a list of available multigraphs, tokenise an input words
-- file into a list of words and other 'Component's. This uses the
-- same tokenisation strategy as 'tokeniseWords', but also recognises
-- 'Gloss'es (in square brackets) and 'Separator's (as whitespace).
tokeniseWords :: [String] -> String -> Either (ParseErrorBundle String Void) [Component PWord]
tokeniseWords :: [String]
-> String
-> Either (ParseErrorBundle String Void) [Component [String]]
tokeniseWords ([String] -> [String]
forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) =
        Parsec Void String [Component [String]]
-> String
-> String
-> Either (ParseErrorBundle String Void) [Component [String]]
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (ParsecT Void String Identity [String]
-> Parsec Void String [Component [String]]
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser (ParsecT Void String Identity [String]
 -> Parsec Void String [Component [String]])
-> ParsecT Void String Identity [String]
-> Parsec Void String [Component [String]]
forall a b. (a -> b) -> a -> b
$ String -> [String] -> ParsecT Void String Identity [String]
wordParser String
"[" [String]
gs) String
""

-- | Inverse of 'tokeniseWords': given a function to convert v'Word's
-- to strings, converts a list of 'Component's to strings.
detokeniseWords' :: (a -> String) -> [Component a] -> String
detokeniseWords' :: forall a. (a -> String) -> [Component a] -> String
detokeniseWords' a -> String
f = (Component a -> String) -> [Component a] -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((Component a -> String) -> [Component a] -> String)
-> (Component a -> String) -> [Component a] -> String
forall a b. (a -> b) -> a -> b
$ \case
    Word a
gs -> a -> String
f a
gs
    Separator String
w -> String
w
    Gloss String
g -> Char
'['Char -> ShowS
forall a. a -> [a] -> [a]
:(String
g String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"]")

-- | Specialisation of 'detokeniseWords'' for 'PWord's, converting
-- words to strings using 'concatWithBoundary'.
detokeniseWords :: [Component PWord] -> String
detokeniseWords :: [Component [String]] -> String
detokeniseWords = ([String] -> String) -> [Component [String]] -> String
forall a. (a -> String) -> [Component a] -> String
detokeniseWords' [String] -> String
concatWithBoundary

-- | Given a list of sound changes, extract the list of multigraphs
-- defined in the first 'GraphemeList' of the 'SoundChanges'.
findFirstCategoriesDecl :: SoundChanges c GraphemeList -> [String]
findFirstCategoriesDecl :: forall (c :: LexemeType -> *).
SoundChanges c GraphemeList -> [String]
findFirstCategoriesDecl (DeclS (GraphemeList Bool
_ [String]
gs):[Statement c GraphemeList]
_) = [String]
gs
findFirstCategoriesDecl (Statement c GraphemeList
_:[Statement c GraphemeList]
ss) = [Statement c GraphemeList] -> [String]
forall (c :: LexemeType -> *).
SoundChanges c GraphemeList -> [String]
findFirstCategoriesDecl [Statement c GraphemeList]
ss
findFirstCategoriesDecl [] = []

-- | CPS'd form of 'findFirstCategoriesDecl'. Nice for doing things
-- like @'withFirstCategoriesDecl' 'tokeniseWords' changes words@ (to
-- tokenise using the graphemes from the first categories declaration)
-- and so on.
withFirstCategoriesDecl :: ([String] -> t) -> SoundChanges c GraphemeList -> t
withFirstCategoriesDecl :: forall t (c :: LexemeType -> *).
([String] -> t) -> SoundChanges c GraphemeList -> t
withFirstCategoriesDecl [String] -> t
tok SoundChanges c GraphemeList
ss = [String] -> t
tok (SoundChanges c GraphemeList -> [String]
forall (c :: LexemeType -> *).
SoundChanges c GraphemeList -> [String]
findFirstCategoriesDecl SoundChanges c GraphemeList
ss)