-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Check uniqueness and tokenize safely -- -- Provide fast enough uniqueness checking for a set of tokens specified by -- a subset of regular expressions. See README for more info. -- -- WARNING this package is not tested enough for the moment. Bugs are -- very likely here. @package tokenizer @version 0.1.0.0 -- | This module contains an auxiliary set structure to efficiently store -- small sets of symbols and their complements module Text.Tokenizer.BlackWhiteSet -- | Select some "white set" of available elements or "black set" of -- forbidden ones data BlackWhiteSet c BlackSet :: Set c -> BlackWhiteSet c WhiteSet :: Set c -> BlackWhiteSet c -- | Make a BlackWhiteSet containing only one symbol singleton :: c -> BlackWhiteSet c -- | Intersect two BlackWhiteSets. intersection :: Ord c => BlackWhiteSet c -> BlackWhiteSet c -> BlackWhiteSet c -- | Check if BlackWhiteSet is empty -- -- NB! the number of all elements is assumed to be too large, so BlackSet -- is never supposed to be empty isEmpty :: BlackWhiteSet c -> Bool -- | Check if symbol is a member of a BlackWhiteSet -- --
--   >>> member 'a' (WhiteSet (S.fromList ['a', 'b']))
--   True
--   
-- --
--   >>> member 'a' (BlackSet (S.fromList ['a', 'b']))
--   False
--   
member :: Ord c => c -> BlackWhiteSet c -> Bool instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.BlackWhiteSet.BlackWhiteSet c) instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.BlackWhiteSet.BlackWhiteSet c) instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.BlackWhiteSet.BlackWhiteSet c) -- | This module contains common types used by uniqueness checking and -- tokenizing algorithms module Text.Tokenizer.Types -- | Newtype wrapper around the list monad, used as a collection of alternatives newtype Alt a Alt :: [a] -> Alt a -- | Number of symbols accepted by Repeatable data Count One :: Count Some :: Count -- | BlackWhiteSet that can be repeated. data Repeatable c Repeatable :: Count -> BlackWhiteSet c -> Repeatable c [$sel:getCnt:Repeatable] :: Repeatable c -> Count [$sel:getBWS:Repeatable] :: Repeatable c -> BlackWhiteSet c -- | Token id type synonym. type TokId = Int -- | Token with name of type k (used for uniqueness error messages -- and tokenizing output) over char type c. data Token k c Token :: k -> [BlackWhiteSet c] -> [Repeatable c] -> Token k c -- | the name of token [$sel:name:Token] :: Token k c -> k -- | restrictions on symbols before/after matchable part -- -- NB! they are assumed to be satisfied if there are no symbols -- before/after matched part respectively -- | restrictions on symbols before/after matchable part -- -- NB! they are assumed to be satisfied if there are no symbols -- before/after matched part respectively [$sel:behind:Token, $sel:ahead:Token] :: Token k c -> [BlackWhiteSet c] -- | matchable sequences of char sets with possible repetitions [$sel:body:Token] :: Token k c -> [Repeatable c] -- | Type for internal needs. 
Contains autogenerated -- $sel:tokId:RToken and restrictions behind token are reversed data RToken c RToken :: TokId -> [Repeatable c] -> [Repeatable c] -> RToken c -- | unique token's id (generated automatically) [$sel:tokId:RToken] :: RToken c -> TokId -- | constraints on symbols behind/ahead of matchable part -- | constraints on symbols behind/ahead of matchable part [$sel:rbehind:RToken, $sel:ahead:RToken] :: RToken c -> [Repeatable c] -- | matchable part of string [$sel:body:RToken] :: RToken c -> [Repeatable c] -- | Construct an RToken from Token and its id makeRToken :: TokId -> Token k c -> RToken c instance Data.Traversable.Traversable Text.Tokenizer.Types.Alt instance Data.Foldable.Foldable Text.Tokenizer.Types.Alt instance GHC.Base.Alternative Text.Tokenizer.Types.Alt instance GHC.Base.Monad Text.Tokenizer.Types.Alt instance GHC.Base.Applicative Text.Tokenizer.Types.Alt instance GHC.Base.Functor Text.Tokenizer.Types.Alt instance GHC.Show.Show a => GHC.Show.Show (Text.Tokenizer.Types.Alt a) instance GHC.Classes.Ord a => GHC.Classes.Ord (Text.Tokenizer.Types.Alt a) instance GHC.Classes.Eq a => GHC.Classes.Eq (Text.Tokenizer.Types.Alt a) instance GHC.Show.Show Text.Tokenizer.Types.Count instance GHC.Classes.Ord Text.Tokenizer.Types.Count instance GHC.Classes.Eq Text.Tokenizer.Types.Count instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Types.Repeatable c) instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Types.Repeatable c) instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Types.Repeatable c) instance (GHC.Show.Show k, GHC.Show.Show c) => GHC.Show.Show (Text.Tokenizer.Types.Token k c) instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Types.RToken c) instance GHC.Classes.Eq (Text.Tokenizer.Types.RToken c) instance GHC.Classes.Ord (Text.Tokenizer.Types.RToken c) -- | This module provides a simple tokenizing algorithm module Text.Tokenizer.Split -- | Auxiliary structure for tokenizing. 
Should be used as an opaque type, -- initialized by makeTokenizeMap and concatenated via the -- Semigroup instance. data TokenizeMap k c TokenizeMap :: Int -> Map c [RToken c] -> [RToken c] -> IntMap k -> TokenizeMap k c [$sel:tokCount:TokenizeMap] :: TokenizeMap k c -> Int [$sel:charTokMap:TokenizeMap] :: TokenizeMap k c -> Map c [RToken c] [$sel:blackToks:TokenizeMap] :: TokenizeMap k c -> [RToken c] [$sel:tokNames:TokenizeMap] :: TokenizeMap k c -> IntMap k -- | Make a TokenizeMap with one element singleTokMap :: Ord c => Token k c -> TokenizeMap k c -- | Insert Token into TokenizeMap insert :: Ord c => Token k c -> TokenizeMap k c -> TokenizeMap k c -- | Create auxiliary Map for tokenizing. Should be called once for -- initializing makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c -- | Error during tokenizing -- -- Wherever the [(k, [c])] type is used, it stores a list of pairs of a -- token name and the part of the string matched by it data TokenizeError k c NoWayTokenize :: Int -> [(k, [c])] -> TokenizeError k c TwoWaysTokenize :: Int -> [(k, [c])] -> [(k, [c])] -> TokenizeError k c -- | Split a list of symbols into tokens. tokenize :: forall k c. 
Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])] instance (GHC.Show.Show c, GHC.Show.Show k) => GHC.Show.Show (Text.Tokenizer.Split.TokenizeMap k c) instance (GHC.Classes.Eq k, GHC.Classes.Eq c) => GHC.Classes.Eq (Text.Tokenizer.Split.TokenizeError k c) instance (GHC.Show.Show k, GHC.Show.Show c) => GHC.Show.Show (Text.Tokenizer.Split.TokenizeError k c) instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Split.Rem c) instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Split.Rem c) instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Split.Rem c) instance GHC.Classes.Ord c => GHC.Base.Semigroup (Text.Tokenizer.Split.TokenizeMap k c) instance GHC.Classes.Ord c => GHC.Base.Monoid (Text.Tokenizer.Split.TokenizeMap k c) -- | This module contains an implementation of the uniqueness checking algorithm -- based on the Sardinas-Patterson algorithm module Text.Tokenizer.Uniqueness data Rem c -- | First list remainder. May be empty if there is no rem Rem1 :: [Repeatable c] -> Rem c -- | Second list remainder. Is always nonempty Rem2 :: [Repeatable c] -> Rem c data MergeRes c MergeRes :: [Repeatable c] -> Rem c -> MergeRes c [$sel:merged:MergeRes] :: MergeRes c -> [Repeatable c] [$sel:mergeRem:MergeRes] :: MergeRes c -> Rem c mergeReps :: Ord c => [Repeatable c] -> [Repeatable c] -> Alt (MergeRes c) mergedList :: MergeRes c -> [Repeatable c] remList :: MergeRes c -> [Repeatable c] rem1 :: MergeRes c -> [Repeatable c] rem2 :: MergeRes c -> [Repeatable c] -- | Dangling suffix data Suff c Suff :: [Repeatable c] -> [Repeatable c] -> [Repeatable c] -> Suff c -- | Symbols behind suffix. Note that only maxBehind symbols are -- preserved [$sel:srbeh:Suff] :: Suff c -> [Repeatable c] -- | Symbols from the suffix's body [$sel:scur:Suff] :: Suff c -> [Repeatable c] -- | Symbols ahead of suffix [$sel:sahead:Suff] :: Suff c -> [Repeatable c] -- | Result of division. -- -- It looks like -- --
--         rtoks       |       lastTok
--   --------|---------|-----------------------|~~~~~
--       rprefToks        |
--   -----|-----|---------|
--   suff (remained part):
--                 behind |     current        | ahead
--                 -------|====================|~~~~~
--   
data Div c Div :: [(TokId, Int)] -> (TokId, Int) -> [(TokId, Int)] -> [Repeatable c] -> Suff c -> Div c -- | Tokens in main sequence, except last one [$sel:rtoks:Div] :: Div c -> [(TokId, Int)] -- | Last token in main sequence [$sel:lastTok:Div] :: Div c -> (TokId, Int) -- | Tokens in alternative sequence [$sel:rprefToks:Div] :: Div c -> [(TokId, Int)] -- | Processed symbols [$sel:processed:Div] :: Div c -> [Repeatable c] -- | Remaining suffix [$sel:suff:Div] :: Div c -> Suff c initDiv :: RToken c -> Div c stepDiv :: Ord c => Int -> Div c -> RToken c -> Alt (Div c) -- | Two ways of tokenizing a string, demonstrating non-uniqueness data ConflictTokens k c ConflictTokens :: [(k, [BlackWhiteSet c])] -> ConflictTokens k c [$sel:tokList1:ConflictTokens, $sel:tokList2:ConflictTokens] :: ConflictTokens k c -> [(k, [BlackWhiteSet c])] -- | Check that there is no list of symbols that can be decomposed in two ways -- into the tokens from the given list checkUniqueTokenizing :: forall k c. Ord c => [Token k c] -> Either (ConflictTokens k c) () instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Uniqueness.Suff c) instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Uniqueness.Suff c) instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Uniqueness.Suff c) instance GHC.Show.Show c => GHC.Show.Show (Text.Tokenizer.Uniqueness.Div c) instance GHC.Classes.Ord c => GHC.Classes.Ord (Text.Tokenizer.Uniqueness.Div c) instance GHC.Classes.Eq c => GHC.Classes.Eq (Text.Tokenizer.Uniqueness.Div c) instance (GHC.Classes.Ord k, GHC.Classes.Ord c) => GHC.Classes.Ord (Text.Tokenizer.Uniqueness.ConflictTokens k c) instance (GHC.Classes.Eq k, GHC.Classes.Eq c) => GHC.Classes.Eq (Text.Tokenizer.Uniqueness.ConflictTokens k c) instance (GHC.Show.Show k, GHC.Show.Show c) => GHC.Show.Show (Text.Tokenizer.Uniqueness.ConflictTokens k c) -- | This module reexports everything you need from the package module Text.Tokenizer -- | Select some "white set" of available elements or "black set" of -- 
forbidden ones data BlackWhiteSet c BlackSet :: Set c -> BlackWhiteSet c WhiteSet :: Set c -> BlackWhiteSet c -- | Number of symbols accepted by Repeatable data Count One :: Count Some :: Count -- | BlackWhiteSet that can be repeated. data Repeatable c Repeatable :: Count -> BlackWhiteSet c -> Repeatable c [$sel:getCnt:Repeatable] :: Repeatable c -> Count [$sel:getBWS:Repeatable] :: Repeatable c -> BlackWhiteSet c -- | Token with name of type k (used for uniqueness error messages -- and tokenizing output) over char type c. data Token k c Token :: k -> [BlackWhiteSet c] -> [Repeatable c] -> Token k c -- | the name of token [$sel:name:Token] :: Token k c -> k -- | restrictions on symbols before/after matchable part -- -- NB! they are assumed to be satisfied if there are no symbols -- before/after matched part respectively -- | restrictions on symbols before/after matchable part -- -- NB! they are assumed to be satisfied if there are no symbols -- before/after matched part respectively [$sel:behind:Token, $sel:ahead:Token] :: Token k c -> [BlackWhiteSet c] -- | matchable sequences of char sets with possible repetitions [$sel:body:Token] :: Token k c -> [Repeatable c] -- | Two ways of tokenizing a string, demonstrating non-uniqueness data ConflictTokens k c ConflictTokens :: [(k, [BlackWhiteSet c])] -> ConflictTokens k c [$sel:tokList1:ConflictTokens, $sel:tokList2:ConflictTokens] :: ConflictTokens k c -> [(k, [BlackWhiteSet c])] -- | Check that there is no list of symbols that can be decomposed in two ways -- into the tokens from the given list checkUniqueTokenizing :: forall k c. Ord c => [Token k c] -> Either (ConflictTokens k c) () -- | Auxiliary structure for tokenizing. Should be used as an opaque type, -- initialized by makeTokenizeMap and concatenated via the -- Semigroup instance. data TokenizeMap k c -- | Create auxiliary Map for tokenizing. 
Should be called once for -- initializing makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c -- | Error during tokenizing -- -- Wherever the [(k, [c])] type is used, it stores a list of pairs of a -- token name and the part of the string matched by it data TokenizeError k c NoWayTokenize :: Int -> [(k, [c])] -> TokenizeError k c TwoWaysTokenize :: Int -> [(k, [c])] -> [(k, [c])] -> TokenizeError k c -- | Split a list of symbols into tokens. tokenize :: forall k c. Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])]