Copyright	(c) Lev Dvorkin 2022
License	MIT
Maintainer	lev_135@mail.ru
Stability	Experimental
Safe Haskell	None
Language	Haskell2010

Text.Tokenizer

Contents

Structures for tokens representation
Uniqueness checking
Splitting string on tokens

Description

This module reexports everything you need from the package

Synopsis

data BlackWhiteSet c
- = BlackSet (Set c)
- | WhiteSet (Set c)
data Count
- = One
- | Some
data Repeatable c = Repeatable {
- getCnt :: Count
- getBWS :: BlackWhiteSet c
}
data Token k c = Token {
- name :: k
- behind, ahead :: [BlackWhiteSet c]
- body :: [Repeatable c]
}
data ConflictTokens k c = ConflictTokens {
- tokList1, tokList2 :: [(k, [BlackWhiteSet c])]
}
checkUniqueTokenizing :: forall k c. Ord c => [Token k c] -> Either (ConflictTokens k c) ()
data TokenizeMap k c
makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c
data TokenizeError k c
- = NoWayTokenize Int [(k, [c])]
- | TwoWaysTokenize Int [(k, [c])] [(k, [c])]
tokenize :: forall k c. Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])]

Structures for tokens representation

data BlackWhiteSet c Source #

Select some "white set" of available elements or "black set" of forbidden ones

Constructors

BlackSet (Set c)
WhiteSet (Set c)

Instances

Instances details

Eq c => Eq (BlackWhiteSet c) Source #
Instance details Defined in Text.Tokenizer.BlackWhiteSet Methods (==) :: BlackWhiteSet c -> BlackWhiteSet c -> Bool # (/=) :: BlackWhiteSet c -> BlackWhiteSet c -> Bool #
Ord c => Ord (BlackWhiteSet c) Source #
Instance details Defined in Text.Tokenizer.BlackWhiteSet Methods compare :: BlackWhiteSet c -> BlackWhiteSet c -> Ordering # (<) :: BlackWhiteSet c -> BlackWhiteSet c -> Bool # (<=) :: BlackWhiteSet c -> BlackWhiteSet c -> Bool # (>) :: BlackWhiteSet c -> BlackWhiteSet c -> Bool # (>=) :: BlackWhiteSet c -> BlackWhiteSet c -> Bool # max :: BlackWhiteSet c -> BlackWhiteSet c -> BlackWhiteSet c # min :: BlackWhiteSet c -> BlackWhiteSet c -> BlackWhiteSet c #
Show c => Show (BlackWhiteSet c) Source #
Instance details Defined in Text.Tokenizer.BlackWhiteSet Methods showsPrec :: Int -> BlackWhiteSet c -> ShowS # show :: BlackWhiteSet c -> String # showList :: [BlackWhiteSet c] -> ShowS #

data Count Source #

Number of symbols acceptable by Repeatable

Constructors

One
Some

Instances

Instances details

Eq Count Source #
Instance details Defined in Text.Tokenizer.Types Methods (==) :: Count -> Count -> Bool # (/=) :: Count -> Count -> Bool #
Ord Count Source #
Instance details Defined in Text.Tokenizer.Types Methods compare :: Count -> Count -> Ordering # (<) :: Count -> Count -> Bool # (<=) :: Count -> Count -> Bool # (>) :: Count -> Count -> Bool # (>=) :: Count -> Count -> Bool # max :: Count -> Count -> Count # min :: Count -> Count -> Count #
Show Count Source #
Instance details Defined in Text.Tokenizer.Types Methods showsPrec :: Int -> Count -> ShowS # show :: Count -> String # showList :: [Count] -> ShowS #

data Repeatable c Source #

BlackWhiteSet that can be repeated.

Constructors

Repeatable
Fields getCnt :: Count getBWS :: BlackWhiteSet c

Instances

Instances details

Eq c => Eq (Repeatable c) Source #
Instance details Defined in Text.Tokenizer.Types Methods (==) :: Repeatable c -> Repeatable c -> Bool # (/=) :: Repeatable c -> Repeatable c -> Bool #
Ord c => Ord (Repeatable c) Source #
Instance details Defined in Text.Tokenizer.Types Methods compare :: Repeatable c -> Repeatable c -> Ordering # (<) :: Repeatable c -> Repeatable c -> Bool # (<=) :: Repeatable c -> Repeatable c -> Bool # (>) :: Repeatable c -> Repeatable c -> Bool # (>=) :: Repeatable c -> Repeatable c -> Bool # max :: Repeatable c -> Repeatable c -> Repeatable c # min :: Repeatable c -> Repeatable c -> Repeatable c #
Show c => Show (Repeatable c) Source #
Instance details Defined in Text.Tokenizer.Types Methods showsPrec :: Int -> Repeatable c -> ShowS # show :: Repeatable c -> String # showList :: [Repeatable c] -> ShowS #

data Token k c Source #

Token with name of type k (used for uniqueness error messages and tokenizing output) over char type c.

Constructors

Token
Fields name :: k the name of token behind, ahead :: [BlackWhiteSet c] restrictions on symbols before/after matchable part NB! they are assumed to be satisfied if there are no symbols before/after matched part respectively body :: [Repeatable c] matchable sequences of char sets with possible repetitions

Instances

Instances details

(Show k, Show c) => Show (Token k c) Source #
Instance details Defined in Text.Tokenizer.Types Methods showsPrec :: Int -> Token k c -> ShowS # show :: Token k c -> String # showList :: [Token k c] -> ShowS #

Uniqueness checking

data ConflictTokens k c Source #

Two ways of tokenizing a string, demonstrating non-uniqueness

Constructors

ConflictTokens
Fields tokList1, tokList2 :: [(k, [BlackWhiteSet c])]

Instances

Instances details

(Eq k, Eq c) => Eq (ConflictTokens k c) Source #
Instance details Defined in Text.Tokenizer.Uniqueness Methods (==) :: ConflictTokens k c -> ConflictTokens k c -> Bool # (/=) :: ConflictTokens k c -> ConflictTokens k c -> Bool #
(Ord k, Ord c) => Ord (ConflictTokens k c) Source #
Instance details Defined in Text.Tokenizer.Uniqueness Methods compare :: ConflictTokens k c -> ConflictTokens k c -> Ordering # (<) :: ConflictTokens k c -> ConflictTokens k c -> Bool # (<=) :: ConflictTokens k c -> ConflictTokens k c -> Bool # (>) :: ConflictTokens k c -> ConflictTokens k c -> Bool # (>=) :: ConflictTokens k c -> ConflictTokens k c -> Bool # max :: ConflictTokens k c -> ConflictTokens k c -> ConflictTokens k c # min :: ConflictTokens k c -> ConflictTokens k c -> ConflictTokens k c #
(Show k, Show c) => Show (ConflictTokens k c) Source #
Instance details Defined in Text.Tokenizer.Uniqueness Methods showsPrec :: Int -> ConflictTokens k c -> ShowS # show :: ConflictTokens k c -> String # showList :: [ConflictTokens k c] -> ShowS #

checkUniqueTokenizing :: forall k c. Ord c => [Token k c] -> Either (ConflictTokens k c) () Source #

Check that there is no list of symbols, that can be decomposed to ways on the tokens from given list

Splitting string on tokens

data TokenizeMap k c Source #

Auxillary structure for tokenizing. Should be used as opaque type, initializing by makeTokenizeMap and concatenating by Semigroup instance.

Instances

Instances details

(Show c, Show k) => Show (TokenizeMap k c) Source #
Instance details Defined in Text.Tokenizer.Split Methods showsPrec :: Int -> TokenizeMap k c -> ShowS # show :: TokenizeMap k c -> String # showList :: [TokenizeMap k c] -> ShowS #
Ord c => Semigroup (TokenizeMap k c) Source #
Instance details Defined in Text.Tokenizer.Split Methods (<>) :: TokenizeMap k c -> TokenizeMap k c -> TokenizeMap k c # sconcat :: NonEmpty (TokenizeMap k c) -> TokenizeMap k c # stimes :: Integral b => b -> TokenizeMap k c -> TokenizeMap k c #
Ord c => Monoid (TokenizeMap k c) Source #
Instance details Defined in Text.Tokenizer.Split Methods mempty :: TokenizeMap k c # mappend :: TokenizeMap k c -> TokenizeMap k c -> TokenizeMap k c # mconcat :: [TokenizeMap k c] -> TokenizeMap k c #

makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c Source #

Create auxillary Map for tokenizing. Should be called once for initializing

data TokenizeError k c Source #

Error during tokenizing

Everywhere [(k, [c])] type is used, the list of pairs with name of token and part of string, matched by it is stored

Constructors

NoWayTokenize
Fields Int Position of the first character that can not be tokenized [(k, [c])] Part of string successfully tokenized (the longest of all attempts)
TwoWaysTokenize
Fields Int Length of uniquely tokenized prefix [(k, [c])] First tokenize way [(k, [c])] Second tokenize way

Instances

Instances details

(Eq k, Eq c) => Eq (TokenizeError k c) Source #
Instance details Defined in Text.Tokenizer.Split Methods (==) :: TokenizeError k c -> TokenizeError k c -> Bool # (/=) :: TokenizeError k c -> TokenizeError k c -> Bool #
(Show k, Show c) => Show (TokenizeError k c) Source #
Instance details Defined in Text.Tokenizer.Split Methods showsPrec :: Int -> TokenizeError k c -> ShowS # show :: TokenizeError k c -> String # showList :: [TokenizeError k c] -> ShowS #

tokenize :: forall k c. Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])] Source #

Split list of symbols on tokens.