-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Check uniqueness and tokenize safely
--   
--   Provides reasonably fast uniqueness checking for a set of tokens
--   specified using a subset of regular expressions. See README for more info.
--   
--   <i>WARNING: this package has not been tested enough yet. Bugs are
--   very likely.</i>
@package tokenizer
@version 0.1.0.0


-- | This module contains an auxiliary set structure for efficiently
--   storing small sets of symbols and their complements
module Text.Tokenizer.BlackWhiteSet

-- | Select some "white set" of available elements or "black set" of
--   forbidden ones
data BlackWhiteSet c
BlackSet :: Set c -> BlackWhiteSet c
WhiteSet :: Set c -> BlackWhiteSet c

-- | Make a <a>BlackWhiteSet</a> containing only one symbol
singleton :: c -> BlackWhiteSet c

-- | Intersect two <a>BlackWhiteSet</a>s.
intersection :: Ord c => BlackWhiteSet c -> BlackWhiteSet c -> BlackWhiteSet c

-- | Check if <a>BlackWhiteSet</a> is empty
--   
--   NB! The number of all possible elements is assumed to be very large,
--   so a <a>BlackSet</a> is never considered empty
isEmpty :: BlackWhiteSet c -> Bool

-- | Check if symbol is a member of a <a>BlackWhiteSet</a>
--   
--   <pre>
--   &gt;&gt;&gt; member 'a' (WhiteSet (S.fromList ['a', 'b']))
--   True
--   </pre>
--   
--   <pre>
--   &gt;&gt;&gt; member 'a' (BlackSet (S.fromList ['a', 'b']))
--   False
--   </pre>
member :: Ord c => c -> BlackWhiteSet c -> Bool
instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.BlackWhiteSet.BlackWhiteSet c)
instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.BlackWhiteSet.BlackWhiteSet c)
instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.BlackWhiteSet.BlackWhiteSet c)


-- | This module contains common types used by uniqueness checking and
--   tokenizing algorithms
module Text.Tokenizer.Types

-- | Newtype wrapper around the list monad, used as a collection of alternatives
newtype Alt a
Alt :: [a] -> Alt a

-- | Number of symbols acceptable by <a>Repeatable</a>
data Count
One :: Count
Some :: Count

-- | <a>BlackWhiteSet</a> that can be repeated.
data Repeatable c
Repeatable :: Count -> BlackWhiteSet c -> Repeatable c
[$sel:getCnt:Repeatable] :: Repeatable c -> Count
[$sel:getBWS:Repeatable] :: Repeatable c -> BlackWhiteSet c

-- | Token id type synonym.
type TokId = Int

-- | Token with name of type <tt>k</tt> (used for uniqueness error messages
--   and tokenizing output) over char type <tt>c</tt>.
data Token k c
Token :: k -> [BlackWhiteSet c] -> [Repeatable c] -> Token k c

-- | the name of token
[$sel:name:Token] :: Token k c -> k

-- | restrictions on symbols before/after matchable part
--   
--   NB! they are assumed to be satisfied if there are no symbols
--   before/after matched part respectively

-- | restrictions on symbols before/after matchable part
--   
--   NB! they are assumed to be satisfied if there are no symbols
--   before/after matched part respectively
[$sel:behind:Token, $sel:ahead:Token] :: Token k c -> [BlackWhiteSet c]

-- | matchable sequences of char sets with possible repetitions
[$sel:body:Token] :: Token k c -> [Repeatable c]

-- | Type for internal use. Contains an autogenerated
--   <a>$sel:tokId:RToken</a>; the restrictions behind the token are reversed
data RToken c
RToken :: TokId -> [Repeatable c] -> [Repeatable c] -> RToken c

-- | unique token's id (generated automatically)
[$sel:tokId:RToken] :: RToken c -> TokId

-- | constraints on symbols behind/ahead of matchable part

-- | constraints on symbols behind/ahead of matchable part
[$sel:rbehind:RToken, $sel:ahead:RToken] :: RToken c -> [Repeatable c]

-- | matchable part of string
[$sel:body:RToken] :: RToken c -> [Repeatable c]

-- | Construct an <a>RToken</a> from <a>Token</a> and its id
makeRToken :: TokId -> Token k c -> RToken c
instance Data.Traversable.Traversable Text.Tokenizer.Types.Alt
instance Data.Foldable.Foldable Text.Tokenizer.Types.Alt
instance GHC.Base.Alternative Text.Tokenizer.Types.Alt
instance GHC.Base.Monad Text.Tokenizer.Types.Alt
instance GHC.Base.Applicative Text.Tokenizer.Types.Alt
instance GHC.Base.Functor Text.Tokenizer.Types.Alt
instance GHC.Show.Show a => GHC.Show.Show (Text.Tokenizer.Types.Alt a)
instance GHC.Classes.Ord a => GHC.Classes.Ord (Text.Tokenizer.Types.Alt a)
instance GHC.Classes.Eq a => GHC.Classes.Eq (Text.Tokenizer.Types.Alt a)
instance GHC.Show.Show Text.Tokenizer.Types.Count
instance GHC.Classes.Ord Text.Tokenizer.Types.Count
instance GHC.Classes.Eq Text.Tokenizer.Types.Count
instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Types.Repeatable c)
instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Types.Repeatable c)
instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Types.Repeatable c)
instance (GHC.Show.Show k, GHC.Show.Show c) => GHC.Show.Show (Text.Tokenizer.Types.Token k c)
instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Types.RToken c)
instance GHC.Classes.Eq (Text.Tokenizer.Types.RToken c)
instance GHC.Classes.Ord (Text.Tokenizer.Types.RToken c)


-- | This module provides a simple tokenizing algorithm
module Text.Tokenizer.Split

-- | Auxiliary structure for tokenizing. Should be used as an opaque type,
--   initialized by <a>makeTokenizeMap</a> and concatenated via its
--   <a>Semigroup</a> instance.
data TokenizeMap k c
TokenizeMap :: Int -> Map c [RToken c] -> [RToken c] -> IntMap k -> TokenizeMap k c
[$sel:tokCount:TokenizeMap] :: TokenizeMap k c -> Int
[$sel:charTokMap:TokenizeMap] :: TokenizeMap k c -> Map c [RToken c]
[$sel:blackToks:TokenizeMap] :: TokenizeMap k c -> [RToken c]
[$sel:tokNames:TokenizeMap] :: TokenizeMap k c -> IntMap k

-- | Make a <a>TokenizeMap</a> with one element
singleTokMap :: Ord c => Token k c -> TokenizeMap k c

-- | Insert <a>Token</a> into <a>TokenizeMap</a>
insert :: Ord c => Token k c -> TokenizeMap k c -> TokenizeMap k c

-- | Create the auxiliary map for tokenizing. Should be called once for
--   initialization
makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c

-- | Error during tokenizing
--   
--   Wherever the <tt>[(k, [c])]</tt> type is used, it stores a list of pairs,
--   each holding the name of a token and the part of the string matched by it
data TokenizeError k c
NoWayTokenize :: Int -> [(k, [c])] -> TokenizeError k c
TwoWaysTokenize :: Int -> [(k, [c])] -> [(k, [c])] -> TokenizeError k c

-- | Split a list of symbols into tokens.
tokenize :: forall k c. Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])]
instance (GHC.Show.Show c, GHC.Show.Show k) => GHC.Show.Show (Text.Tokenizer.Split.TokenizeMap k c)
instance (GHC.Classes.Eq k, GHC.Classes.Eq c) => GHC.Classes.Eq (Text.Tokenizer.Split.TokenizeError k c)
instance (GHC.Show.Show k, GHC.Show.Show c) => GHC.Show.Show (Text.Tokenizer.Split.TokenizeError k c)
instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Split.Rem c)
instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Split.Rem c)
instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Split.Rem c)
instance GHC.Classes.Ord c => GHC.Base.Semigroup (Text.Tokenizer.Split.TokenizeMap k c)
instance GHC.Classes.Ord c => GHC.Base.Monoid (Text.Tokenizer.Split.TokenizeMap k c)


-- | This module contains an implementation of a uniqueness checking
--   algorithm based on the Sardinas-Patterson algorithm
module Text.Tokenizer.Uniqueness
data Rem c

-- | Remainder of the first list. May be empty if there is no remainder
Rem1 :: [Repeatable c] -> Rem c

-- | Remainder of the second list. Always nonempty
Rem2 :: [Repeatable c] -> Rem c
data MergeRes c
MergeRes :: [Repeatable c] -> Rem c -> MergeRes c
[$sel:merged:MergeRes] :: MergeRes c -> [Repeatable c]
[$sel:mergeRem:MergeRes] :: MergeRes c -> Rem c
mergeReps :: Ord c => [Repeatable c] -> [Repeatable c] -> Alt (MergeRes c)
mergedList :: MergeRes c -> [Repeatable c]
remList :: MergeRes c -> [Repeatable c]
rem1 :: MergeRes c -> [Repeatable c]
rem2 :: MergeRes c -> [Repeatable c]

-- | Dangling suffix
data Suff c
Suff :: [Repeatable c] -> [Repeatable c] -> [Repeatable c] -> Suff c

-- | Symbols behind the suffix. Note that only <tt>maxBehind</tt> symbols
--   are preserved
[$sel:srbeh:Suff] :: Suff c -> [Repeatable c]

-- | Symbols from the suffix's body
[$sel:scur:Suff] :: Suff c -> [Repeatable c]

-- | Symbols ahead of the suffix
[$sel:sahead:Suff] :: Suff c -> [Repeatable c]

-- | Result of division.
--   
--   It looks like
--   
--   <pre>
--         rtoks       |       lastTok
--   --------|---------|-----------------------|~~~~~
--       rprefToks        |
--   -----|-----|---------|
--   suff (remaining part):
--                 behind |     current        | ahead
--                 -------|====================|~~~~~
--   </pre>
data Div c
Div :: [(TokId, Int)] -> (TokId, Int) -> [(TokId, Int)] -> [Repeatable c] -> Suff c -> Div c

-- | Tokens in main sequence, except last one
[$sel:rtoks:Div] :: Div c -> [(TokId, Int)]

-- | Last token in main sequence
[$sel:lastTok:Div] :: Div c -> (TokId, Int)

-- | Tokens in the alternative sequence
[$sel:rprefToks:Div] :: Div c -> [(TokId, Int)]

-- | Processed symbols
[$sel:processed:Div] :: Div c -> [Repeatable c]

-- | Remaining suffix
[$sel:suff:Div] :: Div c -> Suff c
initDiv :: RToken c -> Div c
stepDiv :: Ord c => Int -> Div c -> RToken c -> Alt (Div c)

-- | Two ways of tokenizing a string, demonstrating non-uniqueness
data ConflictTokens k c
ConflictTokens :: [(k, [BlackWhiteSet c])] -> ConflictTokens k c
[$sel:tokList1:ConflictTokens, $sel:tokList2:ConflictTokens] :: ConflictTokens k c -> [(k, [BlackWhiteSet c])]

-- | Check that there is no list of symbols that can be decomposed in two
--   ways into tokens from the given list
checkUniqueTokenizing :: forall k c. Ord c => [Token k c] -> Either (ConflictTokens k c) ()
instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Uniqueness.Suff c)
instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Uniqueness.Suff c)
instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Uniqueness.Suff c)
instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Uniqueness.Div c)
instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Uniqueness.Div c)
instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Uniqueness.Div c)
instance (GHC.Classes.Ord k, GHC.Classes.Ord c) => GHC.Classes.Ord (Text.Tokenizer.Uniqueness.ConflictTokens k c)
instance (GHC.Classes.Eq k, GHC.Classes.Eq c) => GHC.Classes.Eq (Text.Tokenizer.Uniqueness.ConflictTokens k c)
instance (GHC.Show.Show k, GHC.Show.Show c) => GHC.Show.Show (Text.Tokenizer.Uniqueness.ConflictTokens k c)


-- | This module reexports everything you need from the package
module Text.Tokenizer

-- | Select some "white set" of available elements or "black set" of
--   forbidden ones
data BlackWhiteSet c
BlackSet :: Set c -> BlackWhiteSet c
WhiteSet :: Set c -> BlackWhiteSet c

-- | Number of symbols acceptable by <a>Repeatable</a>
data Count
One :: Count
Some :: Count

-- | <a>BlackWhiteSet</a> that can be repeated.
data Repeatable c
Repeatable :: Count -> BlackWhiteSet c -> Repeatable c
[$sel:getCnt:Repeatable] :: Repeatable c -> Count
[$sel:getBWS:Repeatable] :: Repeatable c -> BlackWhiteSet c

-- | Token with name of type <tt>k</tt> (used for uniqueness error messages
--   and tokenizing output) over char type <tt>c</tt>.
data Token k c
Token :: k -> [BlackWhiteSet c] -> [Repeatable c] -> Token k c

-- | the name of token
[$sel:name:Token] :: Token k c -> k

-- | restrictions on symbols before/after matchable part
--   
--   NB! they are assumed to be satisfied if there are no symbols
--   before/after matched part respectively

-- | restrictions on symbols before/after matchable part
--   
--   NB! they are assumed to be satisfied if there are no symbols
--   before/after matched part respectively
[$sel:behind:Token, $sel:ahead:Token] :: Token k c -> [BlackWhiteSet c]

-- | matchable sequences of char sets with possible repetitions
[$sel:body:Token] :: Token k c -> [Repeatable c]

-- | Two ways of tokenizing a string, demonstrating non-uniqueness
data ConflictTokens k c
ConflictTokens :: [(k, [BlackWhiteSet c])] -> ConflictTokens k c
[$sel:tokList1:ConflictTokens, $sel:tokList2:ConflictTokens] :: ConflictTokens k c -> [(k, [BlackWhiteSet c])]

-- | Check that there is no list of symbols that can be decomposed in two
--   ways into tokens from the given list
checkUniqueTokenizing :: forall k c. Ord c => [Token k c] -> Either (ConflictTokens k c) ()

-- | Auxiliary structure for tokenizing. Should be used as an opaque type,
--   initialized by <a>makeTokenizeMap</a> and concatenated via its
--   <a>Semigroup</a> instance.
data TokenizeMap k c

-- | Create the auxiliary map for tokenizing. Should be called once for
--   initialization
makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c

-- | Error during tokenizing
--   
--   Wherever the <tt>[(k, [c])]</tt> type is used, it stores a list of pairs,
--   each holding the name of a token and the part of the string matched by it
data TokenizeError k c
NoWayTokenize :: Int -> [(k, [c])] -> TokenizeError k c
TwoWaysTokenize :: Int -> [(k, [c])] -> [(k, [c])] -> TokenizeError k c

-- | Split a list of symbols into tokens.
tokenize :: forall k c. Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])]