{- |
  Module        : Text.Tokenizer.Split
  Copyright     : (c) Lev Dvorkin, 2022
  License       : MIT
  Maintainer    : lev_135@mail.ru
  Stability     : Experimental

  This provides simple tokenizing algorithm
-}
module Text.Tokenizer.Split (
    TokenizeMap (..), singleTokMap, insert, makeTokenizeMap,
    TokenizeError (..), tokenize
  ) where

import Data.Map (Map)

import qualified Data.Map as M
import Data.IntMap (IntMap)
import qualified Data.IntMap as IM
import qualified Data.Set as S
import Data.Bifunctor (Bifunctor(..))
import Control.Monad.Trans.State (State, evalState, gets)
import Data.Maybe (fromMaybe)
import Data.Foldable (foldrM)
import Control.Monad (guard)
import Data.Coerce (coerce)
import qualified Text.Tokenizer.BlackWhiteSet as BWS
import Control.Applicative (Alternative(..))

import Text.Tokenizer.Types
  (Token(..), Count(..), Repeatable(..), RToken(..), TokId, Alt(..), makeRToken)
import Text.Tokenizer.BlackWhiteSet (BlackWhiteSet(..))

-- | Simple lens for modifying 'tokId' field

modifyId :: (TokId -> TokId) -> RToken c -> RToken c
modifyId :: (TokId -> TokId) -> RToken c -> RToken c
modifyId TokId -> TokId
f tok :: RToken c
tok@RToken {TokId
$sel:tokId:RToken :: forall c. RToken c -> TokId
tokId :: TokId
tokId} = RToken c
tok {$sel:tokId:RToken :: TokId
tokId = TokId -> TokId
f TokId
tokId}

-- | Auxillary structure for tokenizing. Should be used as opaque type,

-- initializing by 'makeTokenizeMap' and concatenating by 'Semigroup' instance.

data TokenizeMap k c = TokenizeMap {
    TokenizeMap k c -> TokId
tokCount    :: Int,
    TokenizeMap k c -> Map c [RToken c]
charTokMap  :: Map c [RToken c],
    TokenizeMap k c -> [RToken c]
blackToks   :: [RToken c],
    TokenizeMap k c -> IntMap k
tokNames    :: IntMap k
  } deriving (TokId -> TokenizeMap k c -> ShowS
[TokenizeMap k c] -> ShowS
TokenizeMap k c -> String
(TokId -> TokenizeMap k c -> ShowS)
-> (TokenizeMap k c -> String)
-> ([TokenizeMap k c] -> ShowS)
-> Show (TokenizeMap k c)
forall a.
(TokId -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
forall k c. (Show c, Show k) => TokId -> TokenizeMap k c -> ShowS
forall k c. (Show c, Show k) => [TokenizeMap k c] -> ShowS
forall k c. (Show c, Show k) => TokenizeMap k c -> String
showList :: [TokenizeMap k c] -> ShowS
$cshowList :: forall k c. (Show c, Show k) => [TokenizeMap k c] -> ShowS
show :: TokenizeMap k c -> String
$cshow :: forall k c. (Show c, Show k) => TokenizeMap k c -> String
showsPrec :: TokId -> TokenizeMap k c -> ShowS
$cshowsPrec :: forall k c. (Show c, Show k) => TokId -> TokenizeMap k c -> ShowS
Show)

instance Ord c => Semigroup (TokenizeMap k c) where
  TokenizeMap TokId
tokCount' Map c [RToken c]
tokMap' [RToken c]
blackToks' IntMap k
tokNames'
    <> :: TokenizeMap k c -> TokenizeMap k c -> TokenizeMap k c
<> TokenizeMap TokId
tokCount'' Map c [RToken c]
tokMap'' [RToken c]
blackToks'' IntMap k
tokNames'' =
      TokenizeMap :: forall k c.
TokId
-> Map c [RToken c] -> [RToken c] -> IntMap k -> TokenizeMap k c
TokenizeMap
        { $sel:tokCount:TokenizeMap :: TokId
tokCount = TokId
tokCount' TokId -> TokId -> TokId
forall a. Num a => a -> a -> a
+ TokId
tokCount'',
          $sel:charTokMap:TokenizeMap :: Map c [RToken c]
charTokMap = ([RToken c] -> [RToken c] -> [RToken c])
-> Map c [RToken c] -> Map c [RToken c] -> Map c [RToken c]
forall k a. Ord k => (a -> a -> a) -> Map k a -> Map k a -> Map k a
M.unionWith [RToken c] -> [RToken c] -> [RToken c]
forall a. Semigroup a => a -> a -> a
(<>) Map c [RToken c]
tokMap' Map c [RToken c]
tokMap''',
          $sel:blackToks:TokenizeMap :: [RToken c]
blackToks = [RToken c]
blackToks' [RToken c] -> [RToken c] -> [RToken c]
forall a. Semigroup a => a -> a -> a
<> [RToken c]
blackToks''',
          $sel:tokNames:TokenizeMap :: IntMap k
tokNames = IntMap k
tokNames' IntMap k -> IntMap k -> IntMap k
forall a. Semigroup a => a -> a -> a
<> IntMap k
tokNames'''
        }
      where
        tokMap''' :: Map c [RToken c]
tokMap''' = (RToken c -> RToken c) -> [RToken c] -> [RToken c]
forall a b. (a -> b) -> [a] -> [b]
map ((TokId -> TokId) -> RToken c -> RToken c
forall c. (TokId -> TokId) -> RToken c -> RToken c
modifyId (TokId -> TokId -> TokId
forall a. Num a => a -> a -> a
+ TokId
tokCount')) ([RToken c] -> [RToken c]) -> Map c [RToken c] -> Map c [RToken c]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> Map c [RToken c]
tokMap''
        blackToks''' :: [RToken c]
blackToks''' = (RToken c -> RToken c) -> [RToken c] -> [RToken c]
forall a b. (a -> b) -> [a] -> [b]
map ((TokId -> TokId) -> RToken c -> RToken c
forall c. (TokId -> TokId) -> RToken c -> RToken c
modifyId (TokId -> TokId -> TokId
forall a. Num a => a -> a -> a
+ TokId
tokCount')) [RToken c]
blackToks''
        tokNames''' :: IntMap k
tokNames''' = (TokId -> TokId) -> IntMap k -> IntMap k
forall a. (TokId -> TokId) -> IntMap a -> IntMap a
IM.mapKeysMonotonic (TokId -> TokId -> TokId
forall a. Num a => a -> a -> a
+ TokId
tokCount') IntMap k
tokNames''

instance Ord c => Monoid (TokenizeMap k c) where
  mempty :: TokenizeMap k c
mempty = TokId
-> Map c [RToken c] -> [RToken c] -> IntMap k -> TokenizeMap k c
forall k c.
TokId
-> Map c [RToken c] -> [RToken c] -> IntMap k -> TokenizeMap k c
TokenizeMap TokId
0 Map c [RToken c]
forall a. Monoid a => a
mempty [RToken c]
forall a. Monoid a => a
mempty IntMap k
forall a. Monoid a => a
mempty

-- | Make a 'TokenizeMap' with one element

singleTokMap :: Ord c => Token k c -> TokenizeMap k c
singleTokMap :: Token k c -> TokenizeMap k c
singleTokMap tok :: Token k c
tok@Token {k
$sel:name:Token :: forall k c. Token k c -> k
name :: k
name, [Repeatable c]
$sel:body:Token :: forall k c. Token k c -> [Repeatable c]
body :: [Repeatable c]
body} =
  TokenizeMap :: forall k c.
TokId
-> Map c [RToken c] -> [RToken c] -> IntMap k -> TokenizeMap k c
TokenizeMap
    { $sel:tokCount:TokenizeMap :: TokId
tokCount = TokId
1,
      $sel:charTokMap:TokenizeMap :: Map c [RToken c]
charTokMap = case BlackWhiteSet c
bws of
        BlackSet Set c
_ -> Map c [RToken c]
forall a. Monoid a => a
mempty
        WhiteSet Set c
s -> [(c, [RToken c])] -> Map c [RToken c]
forall k a. Eq k => [(k, a)] -> Map k a
M.fromAscList ([(c, [RToken c])] -> Map c [RToken c])
-> (Set c -> [(c, [RToken c])]) -> Set c -> Map c [RToken c]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (c -> (c, [RToken c])) -> [c] -> [(c, [RToken c])]
forall a b. (a -> b) -> [a] -> [b]
map (, [RToken c
rtok]) ([c] -> [(c, [RToken c])])
-> (Set c -> [c]) -> Set c -> [(c, [RToken c])]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Set c -> [c]
forall a. Set a -> [a]
S.toList (Set c -> Map c [RToken c]) -> Set c -> Map c [RToken c]
forall a b. (a -> b) -> a -> b
$ Set c
s,
      $sel:blackToks:TokenizeMap :: [RToken c]
blackToks = case BlackWhiteSet c
bws of
        BlackSet Set c
_ -> [RToken c
rtok]
        WhiteSet Set c
_ -> [RToken c]
forall a. Monoid a => a
mempty,
      -- tokMap = M.fromAscList $ map (,[rtok]) $ toList $ head body,

      $sel:tokNames:TokenizeMap :: IntMap k
tokNames = TokId -> k -> IntMap k
forall a. TokId -> a -> IntMap a
IM.singleton TokId
tokId k
name
    }
  where
    tokId :: TokId
tokId = TokId
0
    bws :: BlackWhiteSet c
bws = Repeatable c -> BlackWhiteSet c
forall c. Repeatable c -> BlackWhiteSet c
getBWS (Repeatable c -> BlackWhiteSet c)
-> Repeatable c -> BlackWhiteSet c
forall a b. (a -> b) -> a -> b
$ [Repeatable c] -> Repeatable c
forall a. [a] -> a
head [Repeatable c]
body
    rtok :: RToken c
rtok = TokId -> Token k c -> RToken c
forall k c. TokId -> Token k c -> RToken c
makeRToken TokId
tokId Token k c
tok

-- | Insert 'Token' into 'TokenizeMap'

insert :: Ord c => Token k c -> TokenizeMap k c -> TokenizeMap k c
insert :: Token k c -> TokenizeMap k c -> TokenizeMap k c
insert Token k c
tok = (TokenizeMap k c -> TokenizeMap k c -> TokenizeMap k c
forall a. Semigroup a => a -> a -> a
<> Token k c -> TokenizeMap k c
forall c k. Ord c => Token k c -> TokenizeMap k c
singleTokMap Token k c
tok)

-- | Create auxillary Map for tokenizing. Should be called once for initializing

makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c
makeTokenizeMap :: [Token k c] -> TokenizeMap k c
makeTokenizeMap = (Token k c -> TokenizeMap k c -> TokenizeMap k c)
-> TokenizeMap k c -> [Token k c] -> TokenizeMap k c
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr Token k c -> TokenizeMap k c -> TokenizeMap k c
forall c k.
Ord c =>
Token k c -> TokenizeMap k c -> TokenizeMap k c
insert TokenizeMap k c
forall a. Monoid a => a
mempty

-- | Error during tokenizing

--

-- Everywhere @[(k, [c])]@ type is used, the list of pairs with name of token

-- and part of string, matched by it is stored

data TokenizeError k c
  = NoWayTokenize
      Int
      -- ^ Position of the first character that can not be tokenized

      [(k, [c])]
      -- ^ Part of string successfully tokenized (the longest of all attempts)

  | TwoWaysTokenize
      Int
      -- ^ Length of uniquely tokenized prefix

      [(k, [c])]
      -- ^ First tokenize way

      [(k, [c])]
      -- ^ Second tokenize way

  deriving (TokId -> TokenizeError k c -> ShowS
[TokenizeError k c] -> ShowS
TokenizeError k c -> String
(TokId -> TokenizeError k c -> ShowS)
-> (TokenizeError k c -> String)
-> ([TokenizeError k c] -> ShowS)
-> Show (TokenizeError k c)
forall a.
(TokId -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
forall k c. (Show k, Show c) => TokId -> TokenizeError k c -> ShowS
forall k c. (Show k, Show c) => [TokenizeError k c] -> ShowS
forall k c. (Show k, Show c) => TokenizeError k c -> String
showList :: [TokenizeError k c] -> ShowS
$cshowList :: forall k c. (Show k, Show c) => [TokenizeError k c] -> ShowS
show :: TokenizeError k c -> String
$cshow :: forall k c. (Show k, Show c) => TokenizeError k c -> String
showsPrec :: TokId -> TokenizeError k c -> ShowS
$cshowsPrec :: forall k c. (Show k, Show c) => TokId -> TokenizeError k c -> ShowS
Show, TokenizeError k c -> TokenizeError k c -> Bool
(TokenizeError k c -> TokenizeError k c -> Bool)
-> (TokenizeError k c -> TokenizeError k c -> Bool)
-> Eq (TokenizeError k c)
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
forall k c.
(Eq k, Eq c) =>
TokenizeError k c -> TokenizeError k c -> Bool
/= :: TokenizeError k c -> TokenizeError k c -> Bool
$c/= :: forall k c.
(Eq k, Eq c) =>
TokenizeError k c -> TokenizeError k c -> Bool
== :: TokenizeError k c -> TokenizeError k c -> Bool
$c== :: forall k c.
(Eq k, Eq c) =>
TokenizeError k c -> TokenizeError k c -> Bool
Eq)

mapTokErrKey :: (k -> k') -> TokenizeError k c -> TokenizeError k' c
mapTokErrKey :: (k -> k') -> TokenizeError k c -> TokenizeError k' c
mapTokErrKey k -> k'
f (NoWayTokenize TokId
pos [(k, [c])]
toks) =
  TokId -> [(k', [c])] -> TokenizeError k' c
forall k c. TokId -> [(k, [c])] -> TokenizeError k c
NoWayTokenize TokId
pos (((k, [c]) -> (k', [c])) -> [(k, [c])] -> [(k', [c])]
forall a b. (a -> b) -> [a] -> [b]
map ((k -> k') -> (k, [c]) -> (k', [c])
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first k -> k'
f) [(k, [c])]
toks)
mapTokErrKey k -> k'
f (TwoWaysTokenize TokId
pos [(k, [c])]
toks [(k, [c])]
toks') =
  TokId -> [(k', [c])] -> [(k', [c])] -> TokenizeError k' c
forall k c. TokId -> [(k, [c])] -> [(k, [c])] -> TokenizeError k c
TwoWaysTokenize TokId
pos (((k, [c]) -> (k', [c])) -> [(k, [c])] -> [(k', [c])]
forall a b. (a -> b) -> [a] -> [b]
map ((k -> k') -> (k, [c]) -> (k', [c])
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first k -> k'
f) [(k, [c])]
toks) (((k, [c]) -> (k', [c])) -> [(k, [c])] -> [(k', [c])]
forall a b. (a -> b) -> [a] -> [b]
map ((k -> k') -> (k, [c]) -> (k', [c])
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first k -> k'
f) [(k, [c])]
toks')

-- | Split list of symbols on tokens.

tokenize :: forall k c. Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])]
tokenize :: TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])]
tokenize TokenizeMap {Map c [RToken c]
charTokMap :: Map c [RToken c]
$sel:charTokMap:TokenizeMap :: forall k c. TokenizeMap k c -> Map c [RToken c]
charTokMap, [RToken c]
blackToks :: [RToken c]
$sel:blackToks:TokenizeMap :: forall k c. TokenizeMap k c -> [RToken c]
blackToks, IntMap k
tokNames :: IntMap k
$sel:tokNames:TokenizeMap :: forall k c. TokenizeMap k c -> IntMap k
tokNames} [c]
cs =
    (TokenizeError TokId c -> TokenizeError k c)
-> ([(TokId, [c])] -> [(k, [c])])
-> Either (TokenizeError TokId c) [(TokId, [c])]
-> Either (TokenizeError k c) [(k, [c])]
forall (p :: * -> * -> *) a b c d.
Bifunctor p =>
(a -> b) -> (c -> d) -> p a c -> p b d
bimap TokenizeError TokId c -> TokenizeError k c
nameTokErr [(TokId, [c])] -> [(k, [c])]
nameTokRes (Either (TokenizeError TokId c) [(TokId, [c])]
 -> Either (TokenizeError k c) [(k, [c])])
-> Either (TokenizeError TokId c) [(TokId, [c])]
-> Either (TokenizeError k c) [(k, [c])]
forall a b. (a -> b) -> a -> b
$ (State
   (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
   (Either (TokenizeError TokId c) [(TokId, [c])])
 -> IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
 -> Either (TokenizeError TokId c) [(TokId, [c])])
-> IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall a b c. (a -> b -> c) -> b -> a -> c
flip State
  (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
  (Either (TokenizeError TokId c) [(TokId, [c])])
-> IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall s a. State s a -> s -> a
evalState IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
forall a. Monoid a => a
mempty (State
   (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
   (Either (TokenizeError TokId c) [(TokId, [c])])
 -> Either (TokenizeError TokId c) [(TokId, [c])])
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall a b. (a -> b) -> a -> b
$ TokId
-> [c]
-> [c]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
h TokId
0 [] [c]
cs
  where
    nameTokErr :: TokenizeError TokId c -> TokenizeError k c
    nameTokErr :: TokenizeError TokId c -> TokenizeError k c
nameTokErr = (TokId -> k) -> TokenizeError TokId c -> TokenizeError k c
forall k k' c. (k -> k') -> TokenizeError k c -> TokenizeError k' c
mapTokErrKey (IntMap k
tokNames IntMap k -> TokId -> k
forall a. IntMap a -> TokId -> a
IM.!)
    nameTokRes :: [(TokId, [c])] -> [(k, [c])]
    nameTokRes :: [(TokId, [c])] -> [(k, [c])]
nameTokRes = ((TokId, [c]) -> (k, [c])) -> [(TokId, [c])] -> [(k, [c])]
forall a b. (a -> b) -> [a] -> [b]
map (((TokId, [c]) -> (k, [c])) -> [(TokId, [c])] -> [(k, [c])])
-> ((TokId, [c]) -> (k, [c])) -> [(TokId, [c])] -> [(k, [c])]
forall a b. (a -> b) -> a -> b
$ (TokId -> k) -> (TokId, [c]) -> (k, [c])
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first (IntMap k
tokNames IntMap k -> TokId -> k
forall a. IntMap a -> TokId -> a
IM.!)
    -- input string is split in two parts: (reversed) @prevs@ and @nexts@

    -- @pos == length prevs@

    -- prevs are assumed to be already processed

    -- returns unique possible first token's result at the @pos@ position

    h :: Int -> [c] -> [c] -> State (IntMap (Res c)) (Res c)
    h :: TokId
-> [c]
-> [c]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
h TokId
_ [c]
_ [] = Either (TokenizeError TokId c) [(TokId, [c])]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either (TokenizeError TokId c) [(TokId, [c])]
 -> State
      (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
      (Either (TokenizeError TokId c) [(TokId, [c])]))
-> Either (TokenizeError TokId c) [(TokId, [c])]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall a b. (a -> b) -> a -> b
$ [(TokId, [c])] -> Either (TokenizeError TokId c) [(TokId, [c])]
forall a b. b -> Either a b
Right []
    h TokId
pos [c]
prevs nexts :: [c]
nexts@(c
cur : [c]
_) = do
      -- get memorized result

      Maybe (Either (TokenizeError TokId c) [(TokId, [c])])
mres <- (IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
 -> Maybe (Either (TokenizeError TokId c) [(TokId, [c])]))
-> StateT
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     Identity
     (Maybe (Either (TokenizeError TokId c) [(TokId, [c])]))
forall (m :: * -> *) s a. Monad m => (s -> a) -> StateT s m a
gets ((IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
  -> Maybe (Either (TokenizeError TokId c) [(TokId, [c])]))
 -> StateT
      (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
      Identity
      (Maybe (Either (TokenizeError TokId c) [(TokId, [c])])))
-> (IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
    -> Maybe (Either (TokenizeError TokId c) [(TokId, [c])]))
-> StateT
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     Identity
     (Maybe (Either (TokenizeError TokId c) [(TokId, [c])]))
forall a b. (a -> b) -> a -> b
$ TokId
-> IntMap (Either (TokenizeError TokId c) [(TokId, [c])])
-> Maybe (Either (TokenizeError TokId c) [(TokId, [c])])
forall a. TokId -> IntMap a -> Maybe a
IM.lookup TokId
pos
      State
  (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
  (Either (TokenizeError TokId c) [(TokId, [c])])
-> (Either (TokenizeError TokId c) [(TokId, [c])]
    -> State
         (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
         (Either (TokenizeError TokId c) [(TokId, [c])]))
-> Maybe (Either (TokenizeError TokId c) [(TokId, [c])])
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall b a. b -> (a -> b) -> Maybe a -> b
maybe State
  (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
  (Either (TokenizeError TokId c) [(TokId, [c])])
acceptedToks Either (TokenizeError TokId c) [(TokId, [c])]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall (f :: * -> *) a. Applicative f => a -> f a
pure Maybe (Either (TokenizeError TokId c) [(TokId, [c])])
mres
      where
        -- List of all tokens to be considered at current position

        allToks :: [RToken c]
        allToks :: [RToken c]
allToks = [RToken c]
blackToks [RToken c] -> [RToken c] -> [RToken c]
forall a. Semigroup a => a -> a -> a
<> [RToken c] -> Maybe [RToken c] -> [RToken c]
forall a. a -> Maybe a -> a
fromMaybe [] (c -> Map c [RToken c] -> Maybe [RToken c]
forall k a. Ord k => k -> Map k a -> Maybe a
M.lookup c
cur Map c [RToken c]
charTokMap)

        acceptedToks :: State (IntMap (Res c)) (Res c)
        acceptedToks :: State
  (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
  (Either (TokenizeError TokId c) [(TokId, [c])])
acceptedToks =
          ((TokId, [c], [c])
 -> Either (TokenizeError TokId c) [(TokId, [c])]
 -> State
      (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
      (Either (TokenizeError TokId c) [(TokId, [c])]))
-> Either (TokenizeError TokId c) [(TokId, [c])]
-> [(TokId, [c], [c])]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall (t :: * -> *) (m :: * -> *) a b.
(Foldable t, Monad m) =>
(a -> b -> m b) -> b -> t a -> m b
foldrM
            ( \(TokId
tokId, [c]
curs, [c]
nexts') Either (TokenizeError TokId c) [(TokId, [c])]
res'' -> do
                let curTok :: (TokId, [c])
curTok = (TokId
tokId, [c]
curs)
                Either (TokenizeError TokId c) [(TokId, [c])]
res' <- (TokId, [c])
-> Either (TokenizeError TokId c) [(TokId, [c])]
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall c. (TokId, [c]) -> Res c -> Res c
addTok (TokId, [c])
curTok (Either (TokenizeError TokId c) [(TokId, [c])]
 -> Either (TokenizeError TokId c) [(TokId, [c])])
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$>
                  TokId
-> [c]
-> [c]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
h (TokId
pos TokId -> TokId -> TokId
forall a. Num a => a -> a -> a
+ [c] -> TokId
forall (t :: * -> *) a. Foldable t => t a -> TokId
length [c]
curs) ([c] -> [c]
forall a. [a] -> [a]
reverse [c]
curs [c] -> [c] -> [c]
forall a. Semigroup a => a -> a -> a
<> [c]
prevs) [c]
nexts'
                Either (TokenizeError TokId c) [(TokId, [c])]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (Either (TokenizeError TokId c) [(TokId, [c])]
 -> State
      (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
      (Either (TokenizeError TokId c) [(TokId, [c])]))
-> Either (TokenizeError TokId c) [(TokId, [c])]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall a b. (a -> b) -> a -> b
$ case (Either (TokenizeError TokId c) [(TokId, [c])]
res', Either (TokenizeError TokId c) [(TokId, [c])]
res'') of
                  (Left TwoWaysTokenize {}, Either (TokenizeError TokId c) [(TokId, [c])]
_) -> Either (TokenizeError TokId c) [(TokId, [c])]
res'
                  (Either (TokenizeError TokId c) [(TokId, [c])]
_, Left TwoWaysTokenize {}) -> Either (TokenizeError TokId c) [(TokId, [c])]
res''
                  (Left NoWayTokenize {}, Right [(TokId, [c])]
_) -> Either (TokenizeError TokId c) [(TokId, [c])]
res''
                  (Right [(TokId, [c])]
_, Left NoWayTokenize {}) -> Either (TokenizeError TokId c) [(TokId, [c])]
res'
                  (Left (NoWayTokenize TokId
l' [(TokId, [c])]
_), Left (NoWayTokenize TokId
l'' [(TokId, [c])]
_)) ->
                    if TokId
l' TokId -> TokId -> Bool
forall a. Ord a => a -> a -> Bool
> TokId
l'' then Either (TokenizeError TokId c) [(TokId, [c])]
res' else Either (TokenizeError TokId c) [(TokId, [c])]
res''
                  (Right [(TokId, [c])]
toks', Right [(TokId, [c])]
toks'') ->
                    TokenizeError TokId c
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall a b. a -> Either a b
Left (TokenizeError TokId c
 -> Either (TokenizeError TokId c) [(TokId, [c])])
-> TokenizeError TokId c
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall a b. (a -> b) -> a -> b
$ TokId -> [(TokId, [c])] -> [(TokId, [c])] -> TokenizeError TokId c
forall k c. TokId -> [(k, [c])] -> [(k, [c])] -> TokenizeError k c
TwoWaysTokenize TokId
pos [(TokId, [c])]
toks' [(TokId, [c])]
toks''
            )
            ((TokenizeError TokId c
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall a b. a -> Either a b
Left (TokenizeError TokId c
 -> Either (TokenizeError TokId c) [(TokId, [c])])
-> TokenizeError TokId c
-> Either (TokenizeError TokId c) [(TokId, [c])]
forall a b. (a -> b) -> a -> b
$ TokId -> [(TokId, [c])] -> TokenizeError TokId c
forall k c. TokId -> [(k, [c])] -> TokenizeError k c
NoWayTokenize TokId
pos []) :: Res c)
            ([(TokId, [c], [c])]
 -> State
      (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
      (Either (TokenizeError TokId c) [(TokId, [c])]))
-> [(TokId, [c], [c])]
-> State
     (IntMap (Either (TokenizeError TokId c) [(TokId, [c])]))
     (Either (TokenizeError TokId c) [(TokId, [c])])
forall a b. (a -> b) -> a -> b
$ (RToken c -> [(TokId, [c], [c])])
-> [RToken c] -> [(TokId, [c], [c])]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((RToken c -> Alt (TokId, [c], [c]))
-> RToken c -> [(TokId, [c], [c])]
coerce ((RToken c -> Alt (TokId, [c], [c]))
 -> RToken c -> [(TokId, [c], [c])])
-> (RToken c -> Alt (TokId, [c], [c]))
-> RToken c
-> [(TokId, [c], [c])]
forall a b. (a -> b) -> a -> b
$ [c] -> [c] -> RToken c -> Alt (TokId, [c], [c])
forall c. Ord c => [c] -> [c] -> RToken c -> Alt (TokId, [c], [c])
accepts [c]
prevs [c]
nexts) [RToken c]
allToks

addTok :: (TokId, [c]) -> Res c -> Res c
addTok :: (TokId, [c]) -> Res c -> Res c
addTok (TokId, [c])
tok = \case
  Left (NoWayTokenize TokId
pos [(TokId, [c])]
toks) ->
    TokenizeError TokId c -> Res c
forall a b. a -> Either a b
Left (TokenizeError TokId c -> Res c) -> TokenizeError TokId c -> Res c
forall a b. (a -> b) -> a -> b
$ TokId -> [(TokId, [c])] -> TokenizeError TokId c
forall k c. TokId -> [(k, [c])] -> TokenizeError k c
NoWayTokenize TokId
pos ((TokId, [c])
tok (TokId, [c]) -> [(TokId, [c])] -> [(TokId, [c])]
forall a. a -> [a] -> [a]
: [(TokId, [c])]
toks)
  Left (TwoWaysTokenize TokId
len [(TokId, [c])]
toks [(TokId, [c])]
toks') ->
    TokenizeError TokId c -> Res c
forall a b. a -> Either a b
Left (TokenizeError TokId c -> Res c) -> TokenizeError TokId c -> Res c
forall a b. (a -> b) -> a -> b
$ TokId -> [(TokId, [c])] -> [(TokId, [c])] -> TokenizeError TokId c
forall k c. TokId -> [(k, [c])] -> [(k, [c])] -> TokenizeError k c
TwoWaysTokenize TokId
len ((TokId, [c])
tok (TokId, [c]) -> [(TokId, [c])] -> [(TokId, [c])]
forall a. a -> [a] -> [a]
: [(TokId, [c])]
toks) ((TokId, [c])
tok (TokId, [c]) -> [(TokId, [c])] -> [(TokId, [c])]
forall a. a -> [a] -> [a]
: [(TokId, [c])]
toks')
  Right [(TokId, [c])]
rs -> [(TokId, [c])] -> Res c
forall a b. b -> Either a b
Right ([(TokId, [c])] -> Res c) -> [(TokId, [c])] -> Res c
forall a b. (a -> b) -> a -> b
$ (TokId, [c])
tok (TokId, [c]) -> [(TokId, [c])] -> [(TokId, [c])]
forall a. a -> [a] -> [a]
: [(TokId, [c])]
rs

accepts :: Ord c => [c] -> [c] -> RToken c -> Alt (TokId, [c], [c])
accepts :: [c] -> [c] -> RToken c -> Alt (TokId, [c], [c])
accepts [c]
rprevs [c]
nexts RToken {TokId
tokId :: TokId
$sel:tokId:RToken :: forall c. RToken c -> TokId
tokId, [Repeatable c]
$sel:rbehind:RToken :: forall c. RToken c -> [Repeatable c]
rbehind :: [Repeatable c]
rbehind, [Repeatable c]
$sel:body:RToken :: forall c. RToken c -> [Repeatable c]
body :: [Repeatable c]
body, [Repeatable c]
$sel:ahead:RToken :: forall c. RToken c -> [Repeatable c]
ahead :: [Repeatable c]
ahead} = do
  Bool -> Alt ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Alt ()) -> Bool -> Alt ()
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Bool -> Bool)
-> (Alt ([c], Rem c) -> Bool) -> Alt ([c], Rem c) -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Alt ([c], Rem c) -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (Alt ([c], Rem c) -> Bool) -> Alt ([c], Rem c) -> Bool
forall a b. (a -> b) -> a -> b
$ [Repeatable c] -> [c] -> Alt ([c], Rem c)
forall c. Ord c => [Repeatable c] -> [c] -> Alt ([c], Rem c)
check [Repeatable c]
rbehind [c]
rprevs
  ([c]
curs, Rem c
rem) <- [Repeatable c] -> [c] -> Alt ([c], Rem c)
forall c. Ord c => [Repeatable c] -> [c] -> Alt ([c], Rem c)
check [Repeatable c]
body [c]
nexts
  case Rem c
rem of
    RemRepeatable [Repeatable c]
_ -> Alt (TokId, [c], [c])
forall (f :: * -> *) a. Alternative f => f a
empty -- not enough symbols for token's body

    RemString [c]
cs' -> do
      Bool -> Alt ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Alt ()) -> Bool -> Alt ()
forall a b. (a -> b) -> a -> b
$ Bool -> Bool
not (Bool -> Bool)
-> (Alt ([c], Rem c) -> Bool) -> Alt ([c], Rem c) -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Alt ([c], Rem c) -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null (Alt ([c], Rem c) -> Bool) -> Alt ([c], Rem c) -> Bool
forall a b. (a -> b) -> a -> b
$ [Repeatable c] -> [c] -> Alt ([c], Rem c)
forall c. Ord c => [Repeatable c] -> [c] -> Alt ([c], Rem c)
check [Repeatable c]
ahead [c]
cs'
      (TokId, [c], [c]) -> Alt (TokId, [c], [c])
forall (f :: * -> *) a. Applicative f => a -> f a
pure (TokId
tokId, [c]
curs, [c]
cs')

data Rem c = RemRepeatable [Repeatable c] | RemString [c]
  deriving (Rem c -> Rem c -> Bool
(Rem c -> Rem c -> Bool) -> (Rem c -> Rem c -> Bool) -> Eq (Rem c)
forall c. Eq c => Rem c -> Rem c -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: Rem c -> Rem c -> Bool
$c/= :: forall c. Eq c => Rem c -> Rem c -> Bool
== :: Rem c -> Rem c -> Bool
$c== :: forall c. Eq c => Rem c -> Rem c -> Bool
Eq, Eq (Rem c)
Eq (Rem c)
-> (Rem c -> Rem c -> Ordering)
-> (Rem c -> Rem c -> Bool)
-> (Rem c -> Rem c -> Bool)
-> (Rem c -> Rem c -> Bool)
-> (Rem c -> Rem c -> Bool)
-> (Rem c -> Rem c -> Rem c)
-> (Rem c -> Rem c -> Rem c)
-> Ord (Rem c)
Rem c -> Rem c -> Bool
Rem c -> Rem c -> Ordering
Rem c -> Rem c -> Rem c
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
forall c. Ord c => Eq (Rem c)
forall c. Ord c => Rem c -> Rem c -> Bool
forall c. Ord c => Rem c -> Rem c -> Ordering
forall c. Ord c => Rem c -> Rem c -> Rem c
min :: Rem c -> Rem c -> Rem c
$cmin :: forall c. Ord c => Rem c -> Rem c -> Rem c
max :: Rem c -> Rem c -> Rem c
$cmax :: forall c. Ord c => Rem c -> Rem c -> Rem c
>= :: Rem c -> Rem c -> Bool
$c>= :: forall c. Ord c => Rem c -> Rem c -> Bool
> :: Rem c -> Rem c -> Bool
$c> :: forall c. Ord c => Rem c -> Rem c -> Bool
<= :: Rem c -> Rem c -> Bool
$c<= :: forall c. Ord c => Rem c -> Rem c -> Bool
< :: Rem c -> Rem c -> Bool
$c< :: forall c. Ord c => Rem c -> Rem c -> Bool
compare :: Rem c -> Rem c -> Ordering
$ccompare :: forall c. Ord c => Rem c -> Rem c -> Ordering
$cp1Ord :: forall c. Ord c => Eq (Rem c)
Ord, TokId -> Rem c -> ShowS
[Rem c] -> ShowS
Rem c -> String
(TokId -> Rem c -> ShowS)
-> (Rem c -> String) -> ([Rem c] -> ShowS) -> Show (Rem c)
forall c. Show c => TokId -> Rem c -> ShowS
forall c. Show c => [Rem c] -> ShowS
forall c. Show c => Rem c -> String
forall a.
(TokId -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [Rem c] -> ShowS
$cshowList :: forall c. Show c => [Rem c] -> ShowS
show :: Rem c -> String
$cshow :: forall c. Show c => Rem c -> String
showsPrec :: TokId -> Rem c -> ShowS
$cshowsPrec :: forall c. Show c => TokId -> Rem c -> ShowS
Show)

check :: Ord c => [Repeatable c] -> [c] -> Alt ([c], Rem c)
check :: [Repeatable c] -> [c] -> Alt ([c], Rem c)
check [] [c]
cs = ([c], Rem c) -> Alt ([c], Rem c)
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([], [c] -> Rem c
forall c. [c] -> Rem c
RemString [c]
cs)
check [Repeatable c]
rs [] = ([c], Rem c) -> Alt ([c], Rem c)
forall (f :: * -> *) a. Applicative f => a -> f a
pure ([], [Repeatable c] -> Rem c
forall c. [Repeatable c] -> Rem c
RemRepeatable [Repeatable c]
rs)
check rs0 :: [Repeatable c]
rs0@(Repeatable Count
cnt BlackWhiteSet c
bws : [Repeatable c]
rs) (c
c : [c]
cs) = do
  Bool -> Alt ()
forall (f :: * -> *). Alternative f => Bool -> f ()
guard (Bool -> Alt ()) -> Bool -> Alt ()
forall a b. (a -> b) -> a -> b
$ c
c c -> BlackWhiteSet c -> Bool
forall c. Ord c => c -> BlackWhiteSet c -> Bool
`BWS.member` BlackWhiteSet c
bws
  ([c] -> [c]) -> ([c], Rem c) -> ([c], Rem c)
forall (p :: * -> * -> *) a b c.
Bifunctor p =>
(a -> b) -> p a c -> p b c
first (c
c c -> [c] -> [c]
forall a. a -> [a] -> [a]
:) (([c], Rem c) -> ([c], Rem c))
-> Alt ([c], Rem c) -> Alt ([c], Rem c)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> case Count
cnt of
    Count
One -> [Repeatable c] -> [c] -> Alt ([c], Rem c)
forall c. Ord c => [Repeatable c] -> [c] -> Alt ([c], Rem c)
check [Repeatable c]
rs [c]
cs
    Count
Some -> [Repeatable c] -> [c] -> Alt ([c], Rem c)
forall c. Ord c => [Repeatable c] -> [c] -> Alt ([c], Rem c)
check [Repeatable c]
rs [c]
cs Alt ([c], Rem c) -> Alt ([c], Rem c) -> Alt ([c], Rem c)
forall (f :: * -> *) a. Alternative f => f a -> f a -> f a
<|> [Repeatable c] -> [c] -> Alt ([c], Rem c)
forall c. Ord c => [Repeatable c] -> [c] -> Alt ([c], Rem c)
check [Repeatable c]
rs0 [c]
cs

type Res c = Either (TokenizeError TokId c) [(TokId, [c])]