{-# LANGUAGE DeriveAnyClass    #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE DeriveTraversable #-}
{-# LANGUAGE LambdaCase        #-}
{-# LANGUAGE TypeFamilies      #-}
{-# LANGUAGE ViewPatterns      #-}

module Brassica.SoundChange.Tokenise
       ( 
       -- * Components
         Component(..)
       , getWords
       , splitMultipleResults
       -- * High-level interface
       , tokeniseWord
       , tokeniseWords
       , detokeniseWords'
       , detokeniseWords
       , concatWithBoundary
       , findFirstCategoriesDecl
       , withFirstCategoriesDecl
       -- * Lower-level functions
       , wordParser
       , componentsParser
       , sortByDescendingLength
       ) where

import Data.Char (isSpace)
import Data.Function (on)
import Data.Functor.Identity
import Data.List (intersperse, sortBy)
import Data.Maybe (mapMaybe)
import Data.Ord (Down(..))
import Data.Void (Void)
import GHC.Generics (Generic)

import Control.DeepSeq (NFData)
import Text.Megaparsec hiding (State)
import Text.Megaparsec.Char

import Brassica.SoundChange.Types

-- | Represents a component of a tokenised input string. v'Word's in
-- the input are represented as the type parameter @a@ — which for
-- this reason will usually, though not always, be 'PWord'.
data Component a = Word a | Separator String | Gloss String
    deriving (Component a -> Component a -> Bool
forall a. Eq a => Component a -> Component a -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Component a -> Component a -> Bool
$c/= :: forall a. Eq a => Component a -> Component a -> Bool
== :: Component a -> Component a -> Bool
$c== :: forall a. Eq a => Component a -> Component a -> Bool
Eq, Int -> Component a -> ShowS
forall a. Show a => Int -> Component a -> ShowS
forall a. Show a => [Component a] -> ShowS
forall a. Show a => Component a -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Component a] -> ShowS
$cshowList :: forall a. Show a => [Component a] -> ShowS
show :: Component a -> String
$cshow :: forall a. Show a => Component a -> String
showsPrec :: Int -> Component a -> ShowS
$cshowsPrec :: forall a. Show a => Int -> Component a -> ShowS
Show, forall a b. a -> Component b -> Component a
forall a b. (a -> b) -> Component a -> Component b
forall (f :: * -> *).
(forall a b. (a -> b) -> f a -> f b)
-> (forall a b. a -> f b -> f a) -> Functor f
<$ :: forall a b. a -> Component b -> Component a
$c<$ :: forall a b. a -> Component b -> Component a
fmap :: forall a b. (a -> b) -> Component a -> Component b
$cfmap :: forall a b. (a -> b) -> Component a -> Component b
Functor, forall a. Eq a => a -> Component a -> Bool
forall a. Num a => Component a -> a
forall a. Ord a => Component a -> a
forall m. Monoid m => Component m -> m
forall a. Component a -> Bool
forall a. Component a -> Int
forall a. Component a -> [a]
forall a. (a -> a -> a) -> Component a -> a
forall m a. Monoid m => (a -> m) -> Component a -> m
forall b a. (b -> a -> b) -> b -> Component a -> b
forall a b. (a -> b -> b) -> b -> Component a -> b
forall (t :: * -> *).
(forall m. Monoid m => t m -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall m a. Monoid m => (a -> m) -> t a -> m)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall a b. (a -> b -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall b a. (b -> a -> b) -> b -> t a -> b)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. (a -> a -> a) -> t a -> a)
-> (forall a. t a -> [a])
-> (forall a. t a -> Bool)
-> (forall a. t a -> Int)
-> (forall a. Eq a => a -> t a -> Bool)
-> (forall a. Ord a => t a -> a)
-> (forall a. Ord a => t a -> a)
-> (forall a. Num a => t a -> a)
-> (forall a. Num a => t a -> a)
-> Foldable t
product :: forall a. Num a => Component a -> a
$cproduct :: forall a. Num a => Component a -> a
sum :: forall a. Num a => Component a -> a
$csum :: forall a. Num a => Component a -> a
minimum :: forall a. Ord a => Component a -> a
$cminimum :: forall a. Ord a => Component a -> a
maximum :: forall a. Ord a => Component a -> a
$cmaximum :: forall a. Ord a => Component a -> a
elem :: forall a. Eq a => a -> Component a -> Bool
$celem :: forall a. Eq a => a -> Component a -> Bool
length :: forall a. Component a -> Int
$clength :: forall a. Component a -> Int
null :: forall a. Component a -> Bool
$cnull :: forall a. Component a -> Bool
toList :: forall a. Component a -> [a]
$ctoList :: forall a. Component a -> [a]
foldl1 :: forall a. (a -> a -> a) -> Component a -> a
$cfoldl1 :: forall a. (a -> a -> a) -> Component a -> a
foldr1 :: forall a. (a -> a -> a) -> Component a -> a
$cfoldr1 :: forall a. (a -> a -> a) -> Component a -> a
foldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldl' :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
$cfoldl :: forall b a. (b -> a -> b) -> b -> Component a -> b
foldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldr' :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
$cfoldr :: forall a b. (a -> b -> b) -> b -> Component a -> b
foldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldMap' :: forall m a. Monoid m => (a -> m) -> Component a -> m
foldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
$cfoldMap :: forall m a. Monoid m => (a -> m) -> Component a -> m
fold :: forall m. Monoid m => Component m -> m
$cfold :: forall m. Monoid m => Component m -> m
Foldable, Functor Component
Foldable Component
forall (t :: * -> *).
Functor t
-> Foldable t
-> (forall (f :: * -> *) a b.
    Applicative f =>
    (a -> f b) -> t a -> f (t b))
-> (forall (f :: * -> *) a. Applicative f => t (f a) -> f (t a))
-> (forall (m :: * -> *) a b.
    Monad m =>
    (a -> m b) -> t a -> m (t b))
-> (forall (m :: * -> *) a. Monad m => t (m a) -> m (t a))
-> Traversable t
forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
sequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
$csequence :: forall (m :: * -> *) a.
Monad m =>
Component (m a) -> m (Component a)
mapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
$cmapM :: forall (m :: * -> *) a b.
Monad m =>
(a -> m b) -> Component a -> m (Component b)
sequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
$csequenceA :: forall (f :: * -> *) a.
Applicative f =>
Component (f a) -> f (Component a)
traverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
$ctraverse :: forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Component a -> f (Component b)
Traversable, forall a.
(forall x. a -> Rep a x) -> (forall x. Rep a x -> a) -> Generic a
forall a x. Rep (Component a) x -> Component a
forall a x. Component a -> Rep (Component a) x
$cto :: forall a x. Rep (Component a) x -> Component a
$cfrom :: forall a x. Component a -> Rep (Component a) x
Generic, forall a. NFData a => Component a -> ()
forall a. (a -> ()) -> NFData a
rnf :: Component a -> ()
$crnf :: forall a. NFData a => Component a -> ()
NFData)

-- | Given a tokenised input string, return only the v'Word's within
-- it.
getWords :: [Component a] -> [a]
getWords :: forall a. [Component a] -> [a]
getWords = forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe forall a b. (a -> b) -> a -> b
$ \case
    Word a
a -> forall a. a -> Maybe a
Just a
a
    Component a
_ -> forall a. Maybe a
Nothing

-- | Given a 'Component' containing multiple values in a v'Word',
-- split it apart into a list of 'Component's in which the given
-- 'String' is used as a 'Separator' between multiple results.
--
-- For instance:
--
-- >>> splitMultipleResults " " (Word ["abc", "def", "ghi"])
-- [Word "abc", Separator " ", Word "def", Separator " ", Word "ghi"]
--
-- >>> splitMultipleResults " " (Word ["abc"])
-- [Word "abc"]
splitMultipleResults :: String -> Component [a] -> [Component a]
splitMultipleResults :: forall a. String -> Component [a] -> [Component a]
splitMultipleResults String
wh (Word [a]
as) = forall a. a -> [a] -> [a]
intersperse (forall a. String -> Component a
Separator String
wh) forall a b. (a -> b) -> a -> b
$ forall a. a -> Component a
Word forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [a]
as
splitMultipleResults String
_ (Separator String
w) = [forall a. String -> Component a
Separator String
w]
splitMultipleResults String
_ (Gloss String
g) = [forall a. String -> Component a
Gloss String
g]
    
-- | Megaparsec parser for 'PWord's — see 'tokeniseWord' documentation
-- for details on the parsing strategy and the meaning of the second
-- parameter. For most usecases 'tokeniseWord' should suffice;
-- 'wordParser' itself is only really useful in unusual situations
-- (e.g. as part of a larger parser). The first parameter gives a list
-- of characters (aside from whitespace) which should be excluded from
-- words, i.e. the parser will stop if any of them are found. The second
-- gives a list of multigraphs which might be expected.
--
-- Note: the second parameter __must__ be 'sortByDescendingLength'-ed;
-- otherwise multigraphs will not be parsed correctly.
wordParser :: [Char] -> [String] -> ParsecT Void String Identity PWord
wordParser :: String -> [String] -> ParsecT Void String Identity PWord
wordParser String
excludes [String]
gs = forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
some forall a b. (a -> b) -> a -> b
$
    (Grapheme
GBoundary forall (f :: * -> *) a b. Functor f => a -> f b -> f a
<$ forall e s (m :: * -> *).
MonadParsec e s m =>
Token s -> m (Token s)
single Char
'#')
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall (f :: * -> *) (m :: * -> *) a.
(Foldable f, Alternative m) =>
f (m a) -> m a
choice (forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap String -> Grapheme
GMulti forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall e s (m :: * -> *).
MonadParsec e s m =>
Tokens s -> m (Tokens s)
chunk forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> [String]
gs)
    forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> (String -> Grapheme
GMulti forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (f :: * -> *) a. Applicative f => a -> f a
pure forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall e s (m :: * -> *).
MonadParsec e s m =>
(Token s -> Bool) -> m (Token s)
satisfy (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. Char -> Bool
exclude))
  where
    exclude :: Char -> Bool
exclude = Bool -> Bool -> Bool
(||) forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Char -> Bool
isSpace forall (f :: * -> *) a b. Applicative f => f (a -> b) -> f a -> f b
<*> (forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
excludes)

-- | Megaparsec parser for 'Component's. Similarly to 'wordParser',
-- usually it’s easier to use 'tokeniseWords' instead.
componentsParser
    :: ParsecT Void String Identity a
    -> ParsecT Void String Identity [Component a]
componentsParser :: forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser ParsecT Void String Identity a
p = forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many forall a b. (a -> b) -> a -> b
$
    (forall a. String -> Component a
Separator forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P forall a. Maybe a
Nothing Char -> Bool
isSpace) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (forall a. String -> Component a
Gloss forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> forall {s} {e} {m :: * -> *}.
(Tokens s ~ String, Token s ~ Char, MonadParsec e s m) =>
Bool -> m String
gloss Bool
False) forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|>
    (forall a. a -> Component a
Word forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> ParsecT Void String Identity a
p)
  where
    gloss :: Bool -> m String
gloss Bool
returnBracketed = do
        Token s
_ <- forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
'['
        [String]
contents <- forall (m :: * -> *) a. MonadPlus m => m a -> m [a]
many forall a b. (a -> b) -> a -> b
$ Bool -> m String
gloss Bool
True forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> forall e s (m :: * -> *).
MonadParsec e s m =>
Maybe String -> (Token s -> Bool) -> m (Tokens s)
takeWhile1P forall a. Maybe a
Nothing (Bool -> Bool
not forall b c a. (b -> c) -> (a -> b) -> a -> c
. (forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem` String
"[]"))
        Token s
_ <- forall e s (m :: * -> *).
(MonadParsec e s m, Token s ~ Char) =>
Token s -> m (Token s)
char Char
']'
        forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ if Bool
returnBracketed
           then Char
'[' forall a. a -> [a] -> [a]
: forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents forall a. [a] -> [a] -> [a]
++ String
"]"
           else forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [String]
contents

sortByDescendingLength :: [[a]] -> [[a]]
sortByDescendingLength :: forall a. [[a]] -> [[a]]
sortByDescendingLength = forall a. (a -> a -> Ordering) -> [a] -> [a]
sortBy (forall a. Ord a => a -> a -> Ordering
compare forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on` forall a. a -> Down a
Down forall b c a. (b -> c) -> (a -> b) -> a -> c
. forall (t :: * -> *) a. Foldable t => t a -> Int
length)

-- | Tokenise a 'String' input word into a 'PWord' by splitting it up
-- into t'Grapheme's. A list of available multigraphs is supplied as
-- the first argument.
--
-- Note that this tokeniser is greedy: if one of the given
-- multigraphs is a prefix of another, the tokeniser will prefer the
-- longest if possible. If there are no matching multigraphs starting
-- at a particular character in the 'String', 'tokeniseWord' will
-- treat that character as its own t'Grapheme'. For instance:
--
-- >>> tokeniseWord [] "cherish"
-- Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "s",GMulti "h"]
-- 
-- >>> tokeniseWord ["e","h","i","r","s","sh"] "cherish"
-- Right [GMulti "c",GMulti "h",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]
-- 
-- >>> tokeniseWord ["c","ch","e","h","i","r","s","sh"] "cherish"
-- Right [GMulti "ch",GMulti "e",GMulti "r",GMulti "i",GMulti "sh"]
tokeniseWord :: [String] -> String -> Either (ParseErrorBundle String Void) PWord
tokeniseWord :: [String] -> String -> Either (ParseErrorBundle String Void) PWord
tokeniseWord (forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) = forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (String -> [String] -> ParsecT Void String Identity PWord
wordParser String
"[" [String]
gs) String
""

-- | Given a list of available multigraphs, tokenise an input string
-- into a list of words and other 'Component's. This uses the same
-- tokenisation strategy as 'tokeniseWords', but also recognises
-- 'Gloss'es (in square brackets) and 'Separator's (in the form of
-- whitespace).
tokeniseWords :: [String] -> String -> Either (ParseErrorBundle String Void) [Component PWord]
tokeniseWords :: [String]
-> String
-> Either (ParseErrorBundle String Void) [Component PWord]
tokeniseWords (forall a. [[a]] -> [[a]]
sortByDescendingLength -> [String]
gs) =
        forall e s a.
Parsec e s a -> String -> s -> Either (ParseErrorBundle s e) a
parse (forall a.
ParsecT Void String Identity a
-> ParsecT Void String Identity [Component a]
componentsParser forall a b. (a -> b) -> a -> b
$ String -> [String] -> ParsecT Void String Identity PWord
wordParser String
"[" [String]
gs) String
""

-- | Given a function to convert 'Word's to strings, converts a list
-- of 'Component's to strings.
detokeniseWords' :: (a -> String) -> [Component a] -> String
detokeniseWords' :: forall a. (a -> String) -> [Component a] -> String
detokeniseWords' a -> String
f = forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap forall a b. (a -> b) -> a -> b
$ \case
    Word a
gs -> a -> String
f a
gs
    Separator String
w -> String
w
    Gloss String
g -> Char
'['forall a. a -> [a] -> [a]
:(String
g forall a. [a] -> [a] -> [a]
++ String
"]")

-- | Specialisation of 'detokeniseWords'' for 'PWord's, converting
-- words to strings using 'concatWithBoundary'.
detokeniseWords :: [Component PWord] -> String
detokeniseWords :: [Component PWord] -> String
detokeniseWords = forall a. (a -> String) -> [Component a] -> String
detokeniseWords' PWord -> String
concatWithBoundary

-- | Given a list of sound changes, extract the list of multigraphs
-- defined in the first categories declaration of the 'SoundChange's.
findFirstCategoriesDecl :: SoundChanges -> [String]
findFirstCategoriesDecl :: SoundChanges -> [String]
findFirstCategoriesDecl (CategoriesDeclS (CategoriesDecl PWord
gs):SoundChanges
_) =
    forall a b. (a -> Maybe b) -> [a] -> [b]
mapMaybe
        (\case Grapheme
GBoundary -> forall a. Maybe a
Nothing; GMulti String
m -> forall a. a -> Maybe a
Just String
m)
        PWord
gs
findFirstCategoriesDecl (Statement
_:SoundChanges
ss) = SoundChanges -> [String]
findFirstCategoriesDecl SoundChanges
ss
findFirstCategoriesDecl [] = []

-- | CPS'd form of 'findFirstCategoriesDecl'. Nice for doing things
-- like @'withFirstCategoriesDecl' 'tokeniseWords' changes words@ (to
-- tokenise using the graphemes from the first categories declaration)
-- and so on.
withFirstCategoriesDecl :: ([String] -> t) -> SoundChanges -> t
withFirstCategoriesDecl :: forall t. ([String] -> t) -> SoundChanges -> t
withFirstCategoriesDecl [String] -> t
tok SoundChanges
ss = [String] -> t
tok (SoundChanges -> [String]
findFirstCategoriesDecl SoundChanges
ss)