{-# LANGUAGE DeriveAnyClass    #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE LambdaCase        #-}
{-# LANGUAGE TypeFamilies      #-}
{-# LANGUAGE ViewPatterns      #-}

module Brassica.SoundChange.Tokenise
       ( 
       -- * Components
         Component(..)
       , getWords
       , splitMultipleResults
       -- * High-level interface
       , tokeniseWord
       , tokeniseWords
       , detokeniseWords'
       , detokeniseWords
       , concatWithBoundary
       , findFirstCategoriesDecl
       , withFirstCategoriesDecl
       -- * Lower-level functions
       , wordParser
       , componentsParser
       , sortByDescendingLength
       ) where

import Data.Char (isSpace)
import Data.Function (on)
import Data.Functor.Identity
import Data.List (intersperse, sortBy)
import Data.Maybe (mapMaybe)
import Data.Ord (Down(..))
import Data.Void (Void)
import GHC.Generics (Generic)

import Control.DeepSeq (NFData)
import Text.Megaparsec hiding (State)
import Text.Megaparsec.Char

import Brassica.SoundChange.Types

-- | Represents a component of a tokenised input string. v'Word's in
-- the input are represented as the type parameter @a@ — which for
-- this reason will usually, though not always, be 'PWord'.
data Component a = Word a | Separator String | Gloss String
    deriving (Component a -> Component a -> Bool
(Component a -> Component a -> Bool)
-> (Component a -> Component a -> Bool) -> Eq (Component a)
forall a. Eq a => Component a -> Component a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: forall a. Eq a => Component a -> Component a -> Bool
== :: Component a -> Component a -> Bool
$c/= :: forall a. Eq a => Component a -> Component a -> Bool
/= :: Component a -> Component a -> Bool
Eq, Int -> Component a -> ShowS
[Component a] -> ShowS
Component a -> String
(Int -> Component a -> ShowS)
-> (Component a -> String)
-> ([Component a] -> ShowS)
-> Show (Component a)
forall a. Show a => Int -> Component a -> ShowS
forall a. Show a => [Component a] -> ShowS
forall a. Show a => Component a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: forall a. Show a => Int -> Component a -> ShowS
showsPrec :: Int -> Component a -> ShowS
$cshow :: forall a. Show a => Component a -> String
show :: Component a -> String
$cshowList :: forall a. Show a => [Component a] -> ShowS
showList :: [Component a] -> ShowS
Show, (forall a b. (a -> b) -> Component a -> Component b)
-> (forall a b. a -> Component b -> Component a)
-> Functor Component
forall a b. a -> Component b -> Component a
forall a b. (a -> b) -> Component a -> Component b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
$cfmap :: forall a b. (a -> b) -> Component a -> Component b
fmap :: forall a b. (a -> b) -> Component a -> Component b
$c<$ :: forall a b. a -> Component b -> Component a
<$ :: forall a b. a -> Component b -> Component a
Functor, (forall m. Monoid m => Component m -> m)
-> (forall m a. Monoid m => (a -> m) -> Component a -> m)
-> (forall m a. Monoid m => (a -> m) -> Component a -> m)
-> (forall a b. (a -> b -> b) -> b -> Component a -> b)
-> (forall a b. (a -> b -> b) -> b -> Component a -> b)
-> (forall b a. (b -> a -> b) -> b -> Component a -> b)
-> (forall b a. (b -> a -> b) -> b -> Component a -> b)
-> (forall a. (a -> a -> a) -> Component a -> a)
-> (forall a. (a -> a -> a) -> Component a -> a)
-> (forall a. Component a -> [a])
-> (forall a. Component a -> Bool)
-> (forall a. Component a -> Int)
-> (forall a. Eq a => a -> Component a -> Bool)
-> (forall a. Ord a => Component a -> a)
-> (forall a. Ord a => Component a -> a)
-> (forall a. Num a => Component a -> a)
-> (forall a. Num a => Component a -> a)
-> Foldable Component
forall a. Eq a => a -> Component a -> Bool
forall a. Num a => Component a -> a
forall a. Ord a => Component a -> a
forall m. Monoid m => Component m -> m
forall a. Component a -> Bool
forall a. Component a -> Int
forall a. Component a -> [a]
forall a. (a -> a -> a) -> Component a -> a
forall m a. Monoid m => (a -> m) -> Component a -> m
forall b a. (b -> a -> b) -> b -> Component a -> b
forall a b. (a -> b -> b) -> b -> Component a -> b
forall (t :: * -> *).
(forall m. Monoid m => t m -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. t a -> [a])
-> (forall a. t a -> Bool)
-> (forall a. t a -> Int)
-> (forall a. Eq a => a -> t a -> Bool)
-> (forall a. Ord a => t a -> a)
-> (forall a. Ord a => t a -> a)
-> (forall a. Num a => t a -> a)
-> (forall a. Num a => t a -> a)
-> Foldable t
$cfold :: forall m. Monoid m => Component m -> m
fold :: forall m. Monoid m => Component m -> m
$cfoldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
foldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
foldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldr1 :: forall a. (a -> a -> a) -> Component a -> a
foldr1 :: forall a. (a -> a -> a) -> Component a -> a
$cfoldl1 :: forall a. (a -> a -> a) -> Component a -> a
foldl1 :: forall a. (a -> a -> a) -> Component a -> a
$ctoList :: forall a. Component a -> [a]
toList :: forall a. Component a -> [a]
$cnull :: forall a. Component a -> Bool
null :: forall a. Component a -> Bool
$clength :: forall a. Component a -> Int
length :: forall a. Component a -> Int
$celem :: forall a. Eq a => a -> Component a -> Bool
elem :: forall a. Eq a => a -> Component a -> Bool
$cmaximum :: forall a. Ord a => Component a -> a
maximum :: forall a. Ord a => Component a -> a
$cminimum :: forall a. Ord a => Component a -> a
minimum :: forall a. Ord a => Component a -> a
$csum :: forall a. Num a => Component a -> a
sum :: forall a. Num a => Component a -> a
$cproduct :: forall a. Num a => Component a -> a
product :: forall a. Num a => Component a -> a
Foldable, Functor Component
Foldable Component
(Functor Component, Foldable Component) =>
(forall (f :: * -> *) a b.
 Applicative f =>
 (a -> f b) -> Component a -> f (Component b))
-> (forall (f :: * -> *) a.
    Applicative f =>
    Component (f a) -> f (Component a))
-> (forall (m :: * -> *) a b.
    Monad m =>
    (a -> m b) -> Component a -> m (Component b))
-> (forall (m :: * -> *) a.
    Monad m =>
    Component (m a) -> m (Component a))
-> Traversable Component
forall (t :: * -> *).
(Functor t, Foldable t) =>
(forall (f :: * -> *) a b.
 Applicative f =>
 (a -> f b) -> t a -> f (t b))
-> (forall (f :: * -> *) a. Applicative f => t (f a) -> f (t a))
-> (forall (m :: * -> *) a b.
    Monad m =>
    (a -> m b) -> t a -> m (t b))
-> (forall (m :: * -> *) a. Monad m => t (m a) -> m (t a))
-> Traversable t
forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
$ctraverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
traverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
$csequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
sequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
$cmapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
mapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
$csequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
sequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
Traversable, (forall x. Component a -> Rep (Component a) x)
-> (forall x. Rep (Component a) x -> Component a)
-> Generic (Component a)
forall x. Rep (Component a) x -> Component a
forall x. Component a -> Rep (Component a) x
forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall a x. Rep (Component a) x -> Component a
forall a x. Component a -> Rep (Component a) x
$cfrom :: forall a x. Component a -> Rep (Component a) x
from :: forall x. Component a -> Rep (Component a) x
$cto :: forall a x. Rep (Component a) x -> Component a
to :: forall x. Rep (Component a) x -> Component a
Generic, Component a -> ()
(Component a -> ()) -> NFData (Component a)
forall a. NFData a => Component a -> ()
forall a. (a -> ()) -> NFData a
$crnf :: forall a. NFData a => Component a -> ()
rnf :: Component a -> ()
NFData)

-- | Given a tokenised input string, return only the v'Word's within
-- it.
getWords :: [Component a] -> [a]
getWords :: forall a. [Component a] -> [a]
getWords = (Component a -> Maybe a) -> [Component a] -> [a]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe ((Component a -> Maybe a) -> [Component a] -> [a])
-> (Component a -> Maybe a) -> [Component a] -> [a]
forall a b. (a -> b) -> a -> b
$ \case
    Word a
a -> a -> Maybe a
forall a. a -> Maybe a
Just a
a
    Component a
_ -> Maybe a
forall a. Maybe a
Nothing

-- | Given a 'Component' containing multiple values in a v'Word',
-- split it apart into a list of 'Component's in which the given
-- 'String' is used as a 'Separator' between multiple results.
--
-- For instance:
--
-- >>> splitMultipleResults " " (Word ["abc", "def", "ghi"])
-- [Word "abc", Separator " ", Word "def", Separator " ", Word "ghi"]
--
-- >>> splitMultipleResults " " (Word ["abc"])
-- [Word "abc"]
splitMultipleResults :: String -> Component [a] -> [Component a]
splitMultipleResults :: forall a. String -> Component [a] -> [Component a]
splitMultipleResults String
wh (Word [a]
as) = Component a -> [Component a] -> [Component a]
forall a. a -> [a] -> [a]
intersperse (String -> Component a
forall a. String -> Component a
Separator String
wh) ([Component a] -> [Component a]) -> [Component a] -> [Component a]
forall a b. (a -> b) -> a -> b
$ a -> Component a
forall a. a -> Component a
Word (a -> Component a) -> [a] -> [Component a]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [a]
as
splitMultipleResults String
_ (Separator String
w) = [String -> Component a
forall a. String -> Component a
Separator String
w]
splitMultipleResults String
_ (Gloss String
g) = [String -> Component a
forall a. String -> Component a
Gloss String
g]
    
-- | Megaparsec parser for 'PWord's — see 'tokeniseWord' documentation
-- for details on the parsing strategy and the meaning of the second
-- parameter. For most usecases 'tokeniseWord' should suffice;
-- 'wordParser' itself is only really useful in unusual situations
-- (e.g. as part of a larger parser). The first parameter gives a list
-- of characters (aside from whitespace) which should be excluded from
-- words, i.e. the parser will stop if any of them are found. The second
-- gives a list of multigraphs which might be expected.
--
-- Note: the second parameter __must__ be 'sortByDescendingLength'-ed;
-- otherwise multigraphs will not be parsed correctly.
wordParser :: [Char] -> [String] -> ParsecT Void String Identity PWord
wordParser :: String -> [String] -> ParsecT Void String Identity PWord
wordParser String
excludes [String]
gs = ParsecT Void String Identity Grapheme
-> ParsecT Void String Identity PWord
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
some (ParsecT Void String Identity Grapheme
 -> ParsecT Void String Identity PWord)
-> ParsecT Void String Identity Grapheme
-> ParsecT Void String Identity PWord
forall a b. (a -> b) -> a -> b
$
    (Grapheme
GBoundary Grapheme
-> ParsecT Void String Identity (Token String)
-> ParsecT Void String Identity Grapheme
forall a b.
a
-> ParsecT Void String Identity b -> ParsecT Void String Identity a
forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ Token String -> ParsecT Void String Identity (Token String)
forall e s (m :: * -> *).
MonadParsec e s m =>
Token s -> m (Token s)
single Char
Token String
'#')
    ParsecT Void String Identity Grapheme
-> ParsecT Void String Identity Grapheme
-> ParsecT Void String Identity Grapheme
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> [ParsecT Void String Identity Grapheme]
-> ParsecT Void String Identity Grapheme
forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
choice ((String -> Grapheme)
-> ParsecT Void String Identity String
-> ParsecT Void String Identity Grapheme
forall a b.
(a -> b)
-> ParsecT Void String Identity a -> ParsecT Void String Identity b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Grapheme
GMulti (ParsecT Void String Identity String
 -> ParsecT Void String Identity Grapheme)
-> (String -> ParsecT Void String Identity String)
-> String
-> ParsecT Void String Identity Grapheme
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> ParsecT Void String Identity String
Tokens String -> ParsecT Void String Identity (Tokens String)
forall e s (m :: * -> *).
MonadParsec e s m =>
Tokens s -> m (Tokens s)
chunk (String -> ParsecT Void String Identity Grapheme)
-> [String] -> [ParsecT Void String Identity Grapheme]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [String]
gs)
    ParsecT Void String Identity Grapheme
-> ParsecT Void String Identity Grapheme
-> ParsecT Void String Identity Grapheme
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (String -> Grapheme
GMulti (String -> Grapheme) -> (Char -> String) -> Char -> Grapheme
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> String
forall a. a -> [a]
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Char -> Grapheme)
-> ParsecT Void String Identity Char
-> ParsecT Void String Identity Grapheme
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> (Token String -> Bool)
-> ParsecT Void String Identity (Token String)
forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
exclude))
  where
    exclude :: Char -> Bool
exclude = Bool -> Bool -> Bool
(||) (Bool -> Bool -> Bool) -> (Char -> Bool) -> Char -> Bool -> Bool
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Char -> Bool
isSpace (Char -> Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall a b. (Char -> a -> b) -> (Char -> a) -> Char -> b
forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (Char -> String -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
excludes)

-- | Megaparsec parser for 'Component's. Similarly to 'wordParser',
-- usually it’s easier to use 'tokeniseWords' instead.
componentsParser
    :: ParsecT Void String Identity a
    -> ParsecT Void String Identity [Component a]
componentsParser :: forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser ParsecT Void String Identity a
p = ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity [Component a]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many (ParsecT Void String Identity (Component a)
 -> ParsecT Void String Identity [Component a])
-> ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity [Component a]
forall a b. (a -> b) -> a -> b
$
    (String -> Component a
forall a. String -> Component a
Separator (String -> Component a)
-> ParsecT Void String Identity String
-> ParsecT Void String Identity (Component a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Maybe String
-> (Token String -> Bool)
-> ParsecT Void String Identity (Tokens String)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P Maybe String
forall a. Maybe a
Nothing Char -> Bool
Token String -> Bool
isSpace) ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (String -> Component a
forall a. String -> Component a
Gloss (String -> Component a)
-> ParsecT Void String Identity String
-> ParsecT Void String Identity (Component a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Bool -> ParsecT Void String Identity String
forall {s} {m :: * -> *} {e}.
(Token s ~ Char, Tokens s ~ String, MonadParsec e s m) =>
Bool -> m String
gloss Bool
False) ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
-> ParsecT Void String Identity (Component a)
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity a -> ParsecT Void String Identity a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (a -> Component a
forall a. a -> Component a
Word (a -> Component a)
-> ParsecT Void String Identity a
-> ParsecT Void String Identity (Component a)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Void String Identity a
p)
  where
    gloss :: Bool -> m String
gloss Bool
returnBracketed = do
        Char
_ <- Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token s
'['
        [String]
contents <- m String -> m [String]
forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many (m String -> m [String]) -> m String -> m [String]
forall a b. (a -> b) -> a -> b
$ Bool -> m String
gloss Bool
True m String -> m String -> m String
forall a. m a -> m a -> m a
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> Maybe String -> (Token s -> Bool) -> m (Tokens s)
forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P Maybe String
forall a. Maybe a
Nothing (Bool -> Bool
not (Bool -> Bool) -> (Char -> Bool) -> Char -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> String -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
"[]"))
        Char
_ <- Token s -> m (Token s)
forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
Token s
']'
        String -> m String
forall a. a -> m a
forall (f :: * -> *) a. Applicative f => a -> f a
pure (String -> m String) -> String -> m String
forall a b. (a -> b) -> a -> b
$ if Bool
returnBracketed
           then Char
'[' Char -> ShowS
forall a. a -> [a] -> [a]
: [String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"]"
           else [String] -> String
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents

sortByDescendingLength :: [[a]] -> [[a]]
sortByDescendingLength :: forall a. [[a]] -> [[a]]
sortByDescendingLength = ([a] -> [a] -> Ordering) -> [[a]] -> [[a]]
forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (Down Int -> Down Int -> Ordering
forall a. Ord a => a -> a -> Ordering
compare (Down Int -> Down Int -> Ordering)
-> ([a] -> Down Int) -> [a] -> [a] -> Ordering
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` Int -> Down Int
forall a. a -> Down a
Down (Int -> Down Int) -> ([a] -> Int) -> [a] -> Down Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [a] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length)

-- | Tokenise a 'String' input word into a 'PWord' by splitting it up
-- into t'Grapheme's. A list of available multigraphs is supplied as
-- the first argument.
--
-- Note that this tokeniser is greedy: if one of the given
-- multigraphs is a prefix of another, the tokeniser will prefer the
-- longest if possible. If there are no matching multigraphs starting
-- at a particular character in the 'String', 'tokeniseWord' will
-- treat that character as its own t'Grapheme'. For instance:
--
-- >>> tokeniseWord [] "cherish"
-- Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "s",GMulti "h"]
-- 
-- >>> tokeniseWord ["e","h","i","r","s","sh"] "cherish"
-- Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]
-- 
-- >>> tokeniseWord ["c","ch","e","h","i","r","s","sh"] "cherish"
-- Right [GMulti "ch",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]
tokeniseWord :: [String] -> String -> Either (ParseErrorBundle String Void) PWord
tokeniseWord :: [String] -> String -> Either (ParseErrorBundle String Void) PWord
tokeniseWord ([String] -> [String]
forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) = ParsecT Void String Identity PWord
-> String -> String -> Either (ParseErrorBundle String Void) PWord
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (String -> [String] -> ParsecT Void String Identity PWord
wordParser String
"[" [String]
gs) String
""

-- | Given a list of available multigraphs, tokenise an input string
-- into a list of words and other 'Component's. This uses the same
-- tokenisation strategy as 'tokeniseWords', but also recognises
-- 'Gloss'es (in square brackets) and 'Separator's (in the form of
-- whitespace).
tokeniseWords :: [String] -> String -> Either (ParseErrorBundle String Void) [Component PWord]
tokeniseWords :: [String]
-> String
-> Either (ParseErrorBundle String Void) [Component PWord]
tokeniseWords ([String] -> [String]
forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) =
        Parsec Void String [Component PWord]
-> String
-> String
-> Either (ParseErrorBundle String Void) [Component PWord]
forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (ParsecT Void String Identity PWord
-> Parsec Void String [Component PWord]
forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser (ParsecT Void String Identity PWord
 -> Parsec Void String [Component PWord])
-> ParsecT Void String Identity PWord
-> Parsec Void String [Component PWord]
forall a b. (a -> b) -> a -> b
$ String -> [String] -> ParsecT Void String Identity PWord
wordParser String
"[" [String]
gs) String
""

-- | Given a function to convert 'Word's to strings, converts a list
-- of 'Component's to strings.
detokeniseWords' :: (a -> String) -> [Component a] -> String
detokeniseWords' :: forall a. (a -> String) -> [Component a] -> String
detokeniseWords' a -> String
f = (Component a -> String) -> [Component a] -> String
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((Component a -> String) -> [Component a] -> String)
-> (Component a -> String) -> [Component a] -> String
forall a b. (a -> b) -> a -> b
$ \case
    Word a
gs -> a -> String
f a
gs
    Separator String
w -> String
w
    Gloss String
g -> Char
'['Char -> ShowS
forall a. a -> [a] -> [a]
:(String
g String -> ShowS
forall a. [a] -> [a] -> [a]
++ String
"]")

-- | Specialisation of 'detokeniseWords'' for 'PWord's, converting
-- words to strings using 'concatWithBoundary'.
detokeniseWords :: [Component PWord] -> String
detokeniseWords :: [Component PWord] -> String
detokeniseWords = (PWord -> String) -> [Component PWord] -> String
forall a. (a -> String) -> [Component a] -> String
detokeniseWords' PWord -> String
concatWithBoundary

-- | Given a list of sound changes, extract the list of multigraphs
-- defined in the first categories declaration of the 'SoundChange's.
findFirstCategoriesDecl :: SoundChanges c [Grapheme] -> [String]
findFirstCategoriesDecl :: forall (c :: LexemeType -> *). SoundChanges c PWord -> [String]
findFirstCategoriesDecl (DirectiveS PWord
gs:[Statement c PWord]
_) =
    (Grapheme -> Maybe String) -> PWord -> [String]
forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe
        (\case Grapheme
GBoundary -> Maybe String
forall a. Maybe a
Nothing; GMulti String
m -> String -> Maybe String
forall a. a -> Maybe a
Just String
m)
        PWord
gs
findFirstCategoriesDecl (Statement c PWord
_:[Statement c PWord]
ss) = [Statement c PWord] -> [String]
forall (c :: LexemeType -> *). SoundChanges c PWord -> [String]
findFirstCategoriesDecl [Statement c PWord]
ss
findFirstCategoriesDecl [] = []

-- | CPS'd form of 'findFirstCategoriesDecl'. Nice for doing things
-- like @'withFirstCategoriesDecl' 'tokeniseWords' changes words@ (to
-- tokenise using the graphemes from the first categories declaration)
-- and so on.
withFirstCategoriesDecl :: ([String] -> t) -> SoundChanges c [Grapheme] -> t
withFirstCategoriesDecl :: forall t (c :: LexemeType -> *).
([String] -> t) -> SoundChanges c PWord -> t
withFirstCategoriesDecl [String] -> t
tok SoundChanges c PWord
ss = [String] -> t
tok (SoundChanges c PWord -> [String]
forall (c :: LexemeType -> *). SoundChanges c PWord -> [String]
findFirstCategoriesDecl SoundChanges c PWord
ss)