tokenizer-0.1.0.0: Check uniqueness and tokenize safely
Copyright(c) Lev Dvorkin 2022
LicenseMIT
Maintainerlev_135@mail.ru
StabilityExperimental
Safe HaskellNone
LanguageHaskell2010

Text.Tokenizer

Description

This module reexports everything you need from the package

Synopsis

Structures for tokens representation

data BlackWhiteSet c Source #

Select some "white set" of available elements or "black set" of forbidden ones

Constructors

BlackSet (Set c) 
WhiteSet (Set c) 

data Count Source #

Number of symbols acceptable by Repeatable

Constructors

One 
Some 

Instances

Instances details
Eq Count Source # 
Instance details

Defined in Text.Tokenizer.Types

Methods

(==) :: Count -> Count -> Bool #

(/=) :: Count -> Count -> Bool #

Ord Count Source # 
Instance details

Defined in Text.Tokenizer.Types

Methods

compare :: Count -> Count -> Ordering #

(<) :: Count -> Count -> Bool #

(<=) :: Count -> Count -> Bool #

(>) :: Count -> Count -> Bool #

(>=) :: Count -> Count -> Bool #

max :: Count -> Count -> Count #

min :: Count -> Count -> Count #

Show Count Source # 
Instance details

Defined in Text.Tokenizer.Types

Methods

showsPrec :: Int -> Count -> ShowS #

show :: Count -> String #

showList :: [Count] -> ShowS #

data Repeatable c Source #

BlackWhiteSet that can be repeated.

Constructors

Repeatable 

Fields

Instances

Instances details
Eq c => Eq (Repeatable c) Source # 
Instance details

Defined in Text.Tokenizer.Types

Methods

(==) :: Repeatable c -> Repeatable c -> Bool #

(/=) :: Repeatable c -> Repeatable c -> Bool #

Ord c => Ord (Repeatable c) Source # 
Instance details

Defined in Text.Tokenizer.Types

Show c => Show (Repeatable c) Source # 
Instance details

Defined in Text.Tokenizer.Types

data Token k c Source #

Token with name of type k (used for uniqueness error messages and tokenizing output) over char type c.

Constructors

Token 

Fields

  • name :: k

    the name of token

  • behind, ahead :: [BlackWhiteSet c]

    restrictions on symbols before/after matchable part

    NB! they are assumed to be satisfied if there are no symbols before/after matched part respectively

  • body :: [Repeatable c]

    matchable sequences of char sets with possible repetitions

Instances

Instances details
(Show k, Show c) => Show (Token k c) Source # 
Instance details

Defined in Text.Tokenizer.Types

Methods

showsPrec :: Int -> Token k c -> ShowS #

show :: Token k c -> String #

showList :: [Token k c] -> ShowS #

Uniqueness checking

data ConflictTokens k c Source #

Two ways of tokenizing a string, demonstrating non-uniqueness

Constructors

ConflictTokens 

Fields

Instances

Instances details
(Eq k, Eq c) => Eq (ConflictTokens k c) Source # 
Instance details

Defined in Text.Tokenizer.Uniqueness

(Ord k, Ord c) => Ord (ConflictTokens k c) Source # 
Instance details

Defined in Text.Tokenizer.Uniqueness

(Show k, Show c) => Show (ConflictTokens k c) Source # 
Instance details

Defined in Text.Tokenizer.Uniqueness

checkUniqueTokenizing :: forall k c. Ord c => [Token k c] -> Either (ConflictTokens k c) () Source #

Check that there is no list of symbols, that can be decomposed to ways on the tokens from given list

Splitting string on tokens

data TokenizeMap k c Source #

Auxillary structure for tokenizing. Should be used as opaque type, initializing by makeTokenizeMap and concatenating by Semigroup instance.

Instances

Instances details
(Show c, Show k) => Show (TokenizeMap k c) Source # 
Instance details

Defined in Text.Tokenizer.Split

Methods

showsPrec :: Int -> TokenizeMap k c -> ShowS #

show :: TokenizeMap k c -> String #

showList :: [TokenizeMap k c] -> ShowS #

Ord c => Semigroup (TokenizeMap k c) Source # 
Instance details

Defined in Text.Tokenizer.Split

Methods

(<>) :: TokenizeMap k c -> TokenizeMap k c -> TokenizeMap k c #

sconcat :: NonEmpty (TokenizeMap k c) -> TokenizeMap k c #

stimes :: Integral b => b -> TokenizeMap k c -> TokenizeMap k c #

Ord c => Monoid (TokenizeMap k c) Source # 
Instance details

Defined in Text.Tokenizer.Split

Methods

mempty :: TokenizeMap k c #

mappend :: TokenizeMap k c -> TokenizeMap k c -> TokenizeMap k c #

mconcat :: [TokenizeMap k c] -> TokenizeMap k c #

makeTokenizeMap :: Ord c => [Token k c] -> TokenizeMap k c Source #

Create auxillary Map for tokenizing. Should be called once for initializing

data TokenizeError k c Source #

Error during tokenizing

Everywhere [(k, [c])] type is used, the list of pairs with name of token and part of string, matched by it is stored

Constructors

NoWayTokenize 

Fields

  • Int

    Position of the first character that can not be tokenized

  • [(k, [c])]

    Part of string successfully tokenized (the longest of all attempts)

TwoWaysTokenize 

Fields

  • Int

    Length of uniquely tokenized prefix

  • [(k, [c])]

    First tokenize way

  • [(k, [c])]

    Second tokenize way

Instances

Instances details
(Eq k, Eq c) => Eq (TokenizeError k c) Source # 
Instance details

Defined in Text.Tokenizer.Split

Methods

(==) :: TokenizeError k c -> TokenizeError k c -> Bool #

(/=) :: TokenizeError k c -> TokenizeError k c -> Bool #

(Show k, Show c) => Show (TokenizeError k c) Source # 
Instance details

Defined in Text.Tokenizer.Split

tokenize :: forall k c. Ord c => TokenizeMap k c -> [c] -> Either (TokenizeError k c) [(k, [c])] Source #

Split list of symbols on tokens.