{-# LANGUAGE DeriveAnyClass    #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE LambdaCase        #-}
{-# LANGUAGE TypeFamilies      #-}
{-# LANGUAGE ViewPatterns      #-}

module Brassica.SoundChange.Tokenise
       ( 
       -- * Components
         Component(..)
       , getWords
       , splitMultipleResults
       -- * High-level interface
       , tokeniseWord
       , tokeniseWords
       , detokeniseWords'
       , detokeniseWords
       , findFirstCategoriesDecl
       , withFirstCategoriesDecl
       -- * Lower-level functions
       , wordParser
       , componentsParser
       , sortByDescendingLength
       ) where

import Data.Char (isSpace)
import Data.Function (on)
import Data.Functor.Identity
import Data.List (intersperse, sortBy)
import Data.Maybe (mapMaybe)
import Data.Ord (Down(..))
import Data.Void (Void)
import GHC.Generics (Generic)

import Control.DeepSeq (NFData)
import Text.Megaparsec hiding (State)
import Text.Megaparsec.Char

import Brassica.SoundChange.Types

-- | Represents a component of a tokenised input string. v'Word's in
-- the input are represented as the type parameter @a@ — which for
-- this reason will usually, though not always, be 'PWord'.
data Component a = Word a | Separator String | Gloss String
    deriving (Component a -> Component a -> Bool
forall a. Eq a => Component a -> Component a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Component a -> Component a -> Bool
$c/= :: forall a. Eq a => Component a -> Component a -> Bool
== :: Component a -> Component a -> Bool
$c== :: forall a. Eq a => Component a -> Component a -> Bool
Eq, Int -> Component a -> ShowS
forall a. Show a => Int -> Component a -> ShowS
forall a. Show a => [Component a] -> ShowS
forall a. Show a => Component a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Component a] -> ShowS
$cshowList :: forall a. Show a => [Component a] -> ShowS
show :: Component a -> String
$cshow :: forall a. Show a => Component a -> String
showsPrec :: Int -> Component a -> ShowS
$cshowsPrec :: forall a. Show a => Int -> Component a -> ShowS
Show, forall a b. a -> Component b -> Component a
forall a b. (a -> b) -> Component a -> Component b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
<$ :: forall a b. a -> Component b -> Component a
$c<$ :: forall a b. a -> Component b -> Component a
fmap :: forall a b. (a -> b) -> Component a -> Component b
$cfmap :: forall a b. (a -> b) -> Component a -> Component b
Functor, forall a. Eq a => a -> Component a -> Bool
forall a. Num a => Component a -> a
forall a. Ord a => Component a -> a
forall m. Monoid m => Component m -> m
forall a. Component a -> Bool
forall a. Component a -> Int
forall a. Component a -> [a]
forall a. (a -> a -> a) -> Component a -> a
forall m a. Monoid m => (a -> m) -> Component a -> m
forall b a. (b -> a -> b) -> b -> Component a -> b
forall a b. (a -> b -> b) -> b -> Component a -> b
forall (t :: * -> *).
(forall m. Monoid m => t m -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. t a -> [a])
-> (forall a. t a -> Bool)
-> (forall a. t a -> Int)
-> (forall a. Eq a => a -> t a -> Bool)
-> (forall a. Ord a => t a -> a)
-> (forall a. Ord a => t a -> a)
-> (forall a. Num a => t a -> a)
-> (forall a. Num a => t a -> a)
-> Foldable t
product :: forall a. Num a => Component a -> a
$cproduct :: forall a. Num a => Component a -> a
sum :: forall a. Num a => Component a -> a
$csum :: forall a. Num a => Component a -> a
minimum :: forall a. Ord a => Component a -> a
$cminimum :: forall a. Ord a => Component a -> a
maximum :: forall a. Ord a => Component a -> a
$cmaximum :: forall a. Ord a => Component a -> a
elem :: forall a. Eq a => a -> Component a -> Bool
$celem :: forall a. Eq a => a -> Component a -> Bool
length :: forall a. Component a -> Int
$clength :: forall a. Component a -> Int
null :: forall a. Component a -> Bool
$cnull :: forall a. Component a -> Bool
toList :: forall a. Component a -> [a]
$ctoList :: forall a. Component a -> [a]
foldl1 :: forall a. (a -> a -> a) -> Component a -> a
$cfoldl1 :: forall a. (a -> a -> a) -> Component a -> a
foldr1 :: forall a. (a -> a -> a) -> Component a -> a
$cfoldr1 :: forall a. (a -> a -> a) -> Component a -> a
foldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
foldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
fold :: forall m. Monoid m => Component m -> m
$cfold :: forall m. Monoid m => Component m -> m
Foldable, Functor Component
Foldable Component
forall (t :: * -> *).
Functor t
-> Foldable t
-> (forall (f :: * -> *) a b.
    Applicative f =>
    (a -> f b) -> t a -> f (t b))
-> (forall (f :: * -> *) a. Applicative f => t (f a) -> f (t a))
-> (forall (m :: * -> *) a b.
    Monad m =>
    (a -> m b) -> t a -> m (t b))
-> (forall (m :: * -> *) a. Monad m => t (m a) -> m (t a))
-> Traversable t
forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
sequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
$csequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
mapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
$cmapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
sequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
$csequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
traverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
$ctraverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
Traversable, forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall a x. Rep (Component a) x -> Component a
forall a x. Component a -> Rep (Component a) x
$cto :: forall a x. Rep (Component a) x -> Component a
$cfrom :: forall a x. Component a -> Rep (Component a) x
Generic, forall a. NFData a => Component a -> ()
forall a. (a -> ()) -> NFData a
rnf :: Component a -> ()
$crnf :: forall a. NFData a => Component a -> ()
NFData)

-- | Given a tokenised input string, return only the v'Word's within
-- it.
getWords :: [Component a] -> [a]
getWords :: forall a. [Component a] -> [a]
getWords = forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe forall a b. (a -> b) -> a -> b
$ \case
    Word a
a -> forall a. a -> Maybe a
Just a
a
    Component a
_ -> forall a. Maybe a
Nothing

-- | Given a 'Component' containing multiple values in a v'Word',
-- split it apart into a list of 'Component's in which the given
-- 'String' is used as a 'Separator' between multiple results.
--
-- For instance:
--
-- >>> splitMultipleResults " " (Word ["abc", "def", "ghi"])
-- [Word "abc", Separator " ", Word "def", Separator " ", Word "ghi"]
--
-- >>> splitMultipleResults " " (Word ["abc"])
-- [Word "abc"]
splitMultipleResults :: String -> Component [a] -> [Component a]
splitMultipleResults :: forall a. String -> Component [a] -> [Component a]
splitMultipleResults String
wh (Word [a]
as) = forall a. a -> [a] -> [a]
intersperse (forall a. String -> Component a
Separator String
wh) forall a b. (a -> b) -> a -> b
$ forall a. a -> Component a
Word forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [a]
as
splitMultipleResults String
_ (Separator String
w) = [forall a. String -> Component a
Separator String
w]
splitMultipleResults String
_ (Gloss String
g) = [forall a. String -> Component a
Gloss String
g]
    
-- | Megaparsec parser for 'PWord's — see 'tokeniseWord' documentation
-- for details on the parsing strategy and the meaning of the second
-- parameter. For most usecases 'tokeniseWord' should suffice;
-- 'wordParser' itself is only really useful in unusual situations
-- (e.g. as part of a larger parser). The first parameter gives a list
-- of characters (aside from whitespace) which should be excluded from
-- words, i.e. the parser will stop if any of them are found.
--
-- Note: the second parameter __must__ be 'sortByDescendingLength'-ed;
-- otherwise digraphs will not be parsed correctly.
wordParser :: [Char] -> [Grapheme] -> ParsecT Void String Identity PWord
wordParser :: String -> [String] -> ParsecT Void String Identity [String]
wordParser String
excludes [String]
gs = forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
some forall a b. (a -> b) -> a -> b
$ forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
choice (forall e s (m :: * -> *).
MonadParsec e s m =>
Tokens s -> m (Tokens s)
chunk forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [String]
gs) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (forall (f :: * -> *) a. Applicative f => a -> f a
pure forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
exclude))
  where
    exclude :: Char -> Bool
exclude = Bool -> Bool -> Bool
(||) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Char -> Bool
isSpace forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
excludes)

-- | Megaparsec parser for 'Component's. Similarly to 'wordParser',
-- usually it’s easier to use 'tokeniseWords' instead.
componentsParser
    :: ParsecT Void String Identity a
    -> ParsecT Void String Identity [Component a]
componentsParser :: forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser ParsecT Void String Identity a
p = forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many forall a b. (a -> b) -> a -> b
$
    (forall a. String -> Component a
Separator forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P forall a. Maybe a
Nothing Char -> Bool
isSpace) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (forall a. String -> Component a
Gloss forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall {s} {e} {m :: * -> *}.
(Tokens s ~ String, Token s ~ Char, MonadParsec e s m) =>
Bool -> m String
gloss Bool
False) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (forall a. a -> Component a
Word forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Void String Identity a
p)
  where
    gloss :: Bool -> m String
gloss Bool
returnBracketed = do
        Token s
_ <- forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
'['
        [String]
contents <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many forall a b. (a -> b) -> a -> b
$ Bool -> m String
gloss Bool
True forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P forall a. Maybe a
Nothing (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. (forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
"[]"))
        Token s
_ <- forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
']'
        forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ if Bool
returnBracketed
           then Char
'[' forall a. a -> [a] -> [a]
: forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents forall a. [a] -> [a] -> [a]
++ String
"]"
           else forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents

sortByDescendingLength :: [[a]] -> [[a]]
sortByDescendingLength :: forall a. [[a]] -> [[a]]
sortByDescendingLength = forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a. Ord a => a -> a -> Ordering
compare forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. Foldable t => t a -> Int
length)

-- | Tokenise a 'String' input word into a 'PWord' by splitting it up
-- into t'Grapheme's. A list of available t'Grapheme's is supplied as
-- an argument.
--
-- Note that this tokeniser is greedy: if one of the given
-- t'Grapheme's is a prefix of another, the tokeniser will prefer the
-- longest if possible. If there are no matching t'Grapheme's starting
-- at a particular character in the 'String', 'tokeniseWord' will
-- treat that character as its own t'Grapheme'. For instance:
--
-- >>> tokeniseWord [] "cherish"
-- Right ["c","h","e","r","i","s","h"]
-- 
-- >>> tokeniseWord ["e","h","i","r","s","sh"] "cherish"
-- Right ["c","h","e","r","i","sh"]
-- 
-- >>> tokeniseWord ["c","ch","e","h","i","r","s","sh"] "cherish"
-- Right ["ch","e","r","i","sh"]
tokeniseWord :: [Grapheme] -> String -> Either (ParseErrorBundle String Void) PWord
tokeniseWord :: [String]
-> String -> Either (ParseErrorBundle String Void) [String]
tokeniseWord (forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) = forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (String -> [String] -> ParsecT Void String Identity [String]
wordParser String
"[" [String]
gs) String
""

-- | Given a list of available t'Grapheme's, tokenise an input string
-- into a list of words and other 'Component's. This uses the same
-- tokenisation strategy as 'tokeniseWords', but also recognises
-- 'Gloss'es (in square brackets) and 'Separator's (in the form of
-- whitespace).
tokeniseWords :: [Grapheme] -> String -> Either (ParseErrorBundle String Void) [Component PWord]
tokeniseWords :: [String]
-> String
-> Either (ParseErrorBundle String Void) [Component [String]]
tokeniseWords (forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) =
    forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser forall a b. (a -> b) -> a -> b
$ String -> [String] -> ParsecT Void String Identity [String]
wordParser String
"[" [String]
gs) String
""

-- | Given a function to convert 'Word's to strings, converts a list
-- of 'Component's to strings.
detokeniseWords' :: (a -> String) -> [Component a] -> String
detokeniseWords' :: forall a. (a -> String) -> [Component a] -> String
detokeniseWords' a -> String
f = forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap forall a b. (a -> b) -> a -> b
$ \case
    Word a
gs -> a -> String
f a
gs
    Separator String
w -> String
w
    Gloss String
g -> Char
'['forall a. a -> [a] -> [a]
:(String
g forall a. [a] -> [a] -> [a]
++ String
"]")

-- | Specialisation of 'detokeniseWords'' for 'PWord's, converting
-- words to strings using 'concat'.
detokeniseWords :: [Component PWord] -> String
detokeniseWords :: [Component [String]] -> String
detokeniseWords = forall a. (a -> String) -> [Component a] -> String
detokeniseWords' forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat

-- | Given a list of sound changes, extract the list of graphemes
-- defined in the first categories declaration of the 'SoundChange's.
findFirstCategoriesDecl :: SoundChanges -> [Grapheme]
findFirstCategoriesDecl :: SoundChanges -> [String]
findFirstCategoriesDecl (CategoriesDeclS (CategoriesDecl [String]
gs):SoundChanges
_) = [String]
gs
findFirstCategoriesDecl (Statement
_:SoundChanges
ss) = SoundChanges -> [String]
findFirstCategoriesDecl SoundChanges
ss
findFirstCategoriesDecl [] = []

-- | CPS'd form of 'findFirstCategoriesDecl'. Nice for doing things
-- like @'withFirstCategoriesDecl' 'tokeniseWords' changes words@ (to
-- tokenise using the graphemes from the first categories declaration)
-- and so on.
withFirstCategoriesDecl :: ([Grapheme] -> t) -> SoundChanges -> t
withFirstCategoriesDecl :: forall t. ([String] -> t) -> SoundChanges -> t
withFirstCategoriesDecl [String] -> t
tok SoundChanges
ss = [String] -> t
tok (SoundChanges -> [String]
findFirstCategoriesDecl SoundChanges
ss)