-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/

-- | Haskell implementation of tiktoken
--
-- This package only implements tokenization. In other words, given an
-- existing encoding (e.g. cl100k_base) you can tokenize an input.
@package tiktoken
@version 1.0.2

-- | You can use this module to convert back and forth between a
-- ByteString and its corresponding tokens using an existing
-- encoding like cl100k_base or o200k_base
--
-- Example usage:
--
--
-- {-# LANGUAGE OverloadedStrings #-}
--
-- import Tiktoken (o200k_base, toTokens, toRanks)
--
-- main :: IO ()
-- main = do
-- -- Just ["El"," perro"," come"," las"," man","z","anas"]
-- print (toTokens o200k_base "El perro come las manzanas")
--
-- -- Just [4422,96439,3063,1996,873,89,14457]
-- print (toRanks o200k_base "El perro come las manzanas")
--
module Tiktoken
-- | This is an efficient internal representation of an encoding like
-- cl100k_base, p50k_edit, or o200k_base
data Encoding
-- | Parse an encoding from the .tiktoken file format
tiktokenToEncoding :: ByteString -> Text -> Either (ParseErrorBundle Text Void) Encoding
-- | Add special tokens to a base Encoding
addSpecialTokens :: Map ByteString Int -> Encoding -> Encoding
-- | r50k_base Encoding
r50k_base :: Encoding
-- | p50k_base Encoding
p50k_base :: Encoding
-- | p50k_edit Encoding
p50k_edit :: Encoding
-- | cl100k_base Encoding
cl100k_base :: Encoding
-- | o200k_base Encoding
o200k_base :: Encoding
-- | Use an Encoding to tokenize a ByteString into smaller
-- ByteStrings
--
-- This returns Nothing only if you provide an Encoding that cannot
-- rank all possible 1-byte sequences
toTokens :: Encoding -> ByteString -> Maybe [ByteString]
-- | Use an Encoding to tokenize a ByteString into ranks
--
-- This returns Nothing only if you provide an Encoding that cannot
-- rank all possible 1-byte sequences
toRanks :: Encoding -> ByteString -> Maybe [Int]
-- | Use an Encoding to tokenize a ByteString into smaller
-- ByteStrings and their associated ranks
--
-- This returns Nothing only if you provide an Encoding that cannot
-- rank all possible 1-byte sequences
toTokensAndRanks :: Encoding -> ByteString -> Maybe [(Int, ByteString)]
-- | Combine a sequence of ByteString tokens back into a
-- ByteString
--
-- This is just a synonym for
-- Data.ByteString.concat (no Encoding
-- necessary), provided solely for consistency/convenience.
fromTokens :: [ByteString] -> ByteString
-- | Convert a sequence of ranks back into a ByteString
--
-- This returns Nothing if you supply any ranks which are not
-- recognized by the Encoding.
fromRanks :: Encoding -> [Int] -> Maybe ByteString
instance Control.DeepSeq.NFData Tiktoken.Encoding
instance GHC.Generics.Generic Tiktoken.Encoding
instance GHC.Classes.Ord Tiktoken.Ranked
instance GHC.Classes.Eq Tiktoken.Ranked