Safe Haskell	Safe-Inferred
Language	Haskell2010

Data.Text.Utf8

Contents

Decoding
Indexing
Slicing Functions
Functions on Arrays
General Functions

Description

This module provides functions that allow treating Text values as series of UTF-8 code units instead of characters. Any calls to Text in alfred-margaret go through this module. Therefore we re-export some Text functions, e.g. concat.

Synopsis

type CodePoint = Char
type CodeUnit = Word8
newtype CodeUnitIndex = CodeUnitIndex {
- codeUnitIndex :: Int
}
data Text = Text !Array !Int !Int
fromByteList :: [Word8] -> Text
isCaseInvariant :: Text -> Bool
lengthUtf8 :: Text -> CodeUnitIndex
lowerCodePoint :: Char -> Char
unlowerCodePoint :: Char -> [Char]
lowerUtf8 :: Text -> Text
toLowerAscii :: Char -> Char
unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a]
unpackUtf8 :: Text -> [CodeUnit]
decode2 :: CodeUnit -> CodeUnit -> CodePoint
decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decodeUtf8 :: [CodeUnit] -> [CodePoint]
indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
skipCodePointsBackwards :: Text -> CodeUnitIndex -> Int -> CodeUnitIndex
unsafeCutUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
arrayContents :: Array -> Ptr Word8
isArrayPinned :: Array -> Bool
unsafeIndexCodePoint' :: Array -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodeUnit' :: Array -> CodeUnitIndex -> CodeUnit
data BackwardsIter = BackwardsIter {
- backwardsIterNext :: !CodeUnitIndex
- backwardsIterChar :: !CodePoint
- backwardsIterEndOfChar :: !CodeUnitIndex
}
unsafeIndexEndOfCodePoint' :: Array -> CodeUnitIndex -> BackwardsIter
unsafeIndexAnywhereInCodePoint' :: Array -> CodeUnitIndex -> BackwardsIter
concat :: [Text] -> Text
dropWhile :: (Char -> Bool) -> Text -> Text
isInfixOf :: Text -> Text -> Bool
null :: Text -> Bool
pack :: String -> Text
replicate :: Int -> Text -> Text
unpack :: Text -> String
indices :: Text -> Text -> [Int]

Documentation

type CodePoint = Char Source #

A Unicode code point.

type CodeUnit = Word8 Source #

A UTF-8 code unit is a byte. A Unicode code point can be encoded as up to four code units.

newtype CodeUnitIndex Source #

An index into the raw UTF-8 data of a Text. This is not the code point index as conventionally accepted by Text, so we wrap it to avoid confusing the two. Incorrect index manipulation can lead to a multi-byte code point being sliced in the middle, so manipulate indices with care. This type is also used for lengths.

Constructors

CodeUnitIndex
Fields codeUnitIndex :: Int

Instances

Instances details

FromJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods parseJSON :: Value -> Parser CodeUnitIndex # parseJSONList :: Value -> Parser [CodeUnitIndex] # omittedField :: Maybe CodeUnitIndex #
ToJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods toJSON :: CodeUnitIndex -> Value # toEncoding :: CodeUnitIndex -> Encoding # toJSONList :: [CodeUnitIndex] -> Value # toEncodingList :: [CodeUnitIndex] -> Encoding # omitField :: CodeUnitIndex -> Bool #
Bounded CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods minBound :: CodeUnitIndex # maxBound :: CodeUnitIndex #
Generic CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Associated Types type Rep CodeUnitIndex :: Type -> Type # Methods from :: CodeUnitIndex -> Rep CodeUnitIndex x # to :: Rep CodeUnitIndex x -> CodeUnitIndex #
Num CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods (+) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (-) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (*) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # negate :: CodeUnitIndex -> CodeUnitIndex # abs :: CodeUnitIndex -> CodeUnitIndex # signum :: CodeUnitIndex -> CodeUnitIndex # fromInteger :: Integer -> CodeUnitIndex #
Show CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods showsPrec :: Int -> CodeUnitIndex -> ShowS # show :: CodeUnitIndex -> String # showList :: [CodeUnitIndex] -> ShowS #
NFData CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods rnf :: CodeUnitIndex -> () #
Eq CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods (==) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (/=) :: CodeUnitIndex -> CodeUnitIndex -> Bool #
Ord CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods compare :: CodeUnitIndex -> CodeUnitIndex -> Ordering # (<) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (<=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # max :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # min :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex #
Hashable CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods hashWithSalt :: Int -> CodeUnitIndex -> Int # hash :: CodeUnitIndex -> Int #
Prim CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods sizeOfType# :: Proxy CodeUnitIndex -> Int# # sizeOf# :: CodeUnitIndex -> Int# # alignmentOfType# :: Proxy CodeUnitIndex -> Int# # alignment# :: CodeUnitIndex -> Int# # indexByteArray# :: ByteArray# -> Int# -> CodeUnitIndex # readByteArray# :: MutableByteArray# s -> Int# -> State# s -> (# State# s, CodeUnitIndex #) # writeByteArray# :: MutableByteArray# s -> Int# -> CodeUnitIndex -> State# s -> State# s # setByteArray# :: MutableByteArray# s -> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s # indexOffAddr# :: Addr# -> Int# -> CodeUnitIndex # readOffAddr# :: Addr# -> Int# -> State# s -> (# State# s, CodeUnitIndex #) # writeOffAddr# :: Addr# -> Int# -> CodeUnitIndex -> State# s -> State# s # setOffAddr# :: Addr# -> Int# -> Int# -> CodeUnitIndex -> State# s -> State# s #
type Rep CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 type Rep CodeUnitIndex = D1 ('MetaData "CodeUnitIndex" "Data.Text.Utf8" "alfred-margaret-2.1.0.2-6sxlpnB4iUU9qpUGhQU9LV" 'True) (C1 ('MetaCons "CodeUnitIndex" 'PrefixI 'True) (S1 ('MetaSel ('Just "codeUnitIndex") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int)))

data Text #

A space efficient, packed, unboxed Unicode text type.

Constructors

Text
Fields !Array bytearray encoded as UTF-8 !Int offset in bytes (not in Char!), pointing to a start of UTF-8 sequence !Int length in bytes (not in Char!), pointing to an end of UTF-8 sequence

Instances

Instances details

FromJSON Text
Instance details Defined in Data.Aeson.Types.FromJSON Methods parseJSON :: Value -> Parser Text # parseJSONList :: Value -> Parser [Text] # omittedField :: Maybe Text #
FromJSONKey Text
Instance details Defined in Data.Aeson.Types.FromJSON Methods fromJSONKey :: FromJSONKeyFunction Text # fromJSONKeyList :: FromJSONKeyFunction [Text] #
ToJSON Text
Instance details Defined in Data.Aeson.Types.ToJSON Methods toJSON :: Text -> Value # toEncoding :: Text -> Encoding # toJSONList :: [Text] -> Value # toEncodingList :: [Text] -> Encoding # omitField :: Text -> Bool #
ToJSONKey Text
Instance details Defined in Data.Aeson.Types.ToJSON Methods toJSONKey :: ToJSONKeyFunction Text # toJSONKeyList :: ToJSONKeyFunction [Text] #
Hashable Text
Instance details Defined in Data.Hashable.Class Methods hashWithSalt :: Int -> Text -> Int # hash :: Text -> Int #
type Item Text
Instance details Defined in Data.Text type Item Text = Char

fromByteList :: [Word8] -> Text Source #

isCaseInvariant :: Text -> Bool Source #

Return whether the text has exactly one case variation. This function never returns true for a text that Aho–Corasick would treat differently when doing case-insensitive matching.

lengthUtf8 :: Text -> CodeUnitIndex Source #

The return value of this function is not really an index. However the signature is supposed to make it clear that the length is returned in terms of code units, not code points.

lowerCodePoint :: Char -> Char Source #

Lower-Case a UTF-8 codepoint. Uses toLowerAscii for ASCII and toLower otherwise.

unlowerCodePoint :: Char -> [Char] Source #

Inverse of Char.toLower/Utf8.lowerCodePoint

Returns all the characters that have the given character as their lower case, for example:

unlowerCodePoint 'a' == "aA"
unlowerCodePoint 'A' == ""
unlowerCodePoint '1' == "1"
unlowerCodePoint 'i' == "İiI"
unlowerCodePoint 'ß' == "ẞß"

lowerUtf8 :: Text -> Text Source #

Lowercase a Text by applying lowerCodePoint to each Char.

toLowerAscii :: Char -> Char Source #

Lower-case the ASCII code points A-Z and leave the rest of ASCII intact.

unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a] Source #

Convert a Unicode Code Point c into a list of UTF-8 code units (bytes).

unpackUtf8 :: Text -> [CodeUnit] Source #

Decoding

Functions that turn code unit sequences into code point sequences.

decode2 :: CodeUnit -> CodeUnit -> CodePoint Source #

Decode 2 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┐
│1 1 0 x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┘

decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint Source #

Decode 3 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┬───────────────┐
│1 1 1 0 x x x x│1 0 x x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┴───────────────┘

decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint Source #

Decode 4 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┬───────────────┬───────────────┐
│1 1 1 1 0 x x x│1 0 x x x x x x│1 0 x x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┴───────────────┴───────────────┘

decodeUtf8 :: [CodeUnit] -> [CodePoint] Source #

Decode a list of UTF-8 code units into a list of code points.

Indexing

Text can be indexed by code units or code points. A CodePoint is a 21-bit Unicode code point and can consist of up to four code units. A CodeUnit is a single byte.

indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit Source #

Get the code unit at the given CodeUnitIndex. Performs bounds checking.

unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) Source #

Does exactly the same thing as unsafeIndexCodePoint', but on Text values.

unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit Source #

skipCodePointsBackwards :: Text -> CodeUnitIndex -> Int -> CodeUnitIndex Source #

Scan backwards through the text until we've seen the specified number of codepoints. Assumes that the initial CodeUnitIndex is within a codepoint.

Slicing Functions

unsafeCutUtf8 and unsafeSliceUtf8 are used to retrieve slices of Text values. unsafeSliceUtf8 begin length returns a substring of length length starting at begin. unsafeCutUtf8 begin length returns a tuple of the "surrounding" substrings.

They satisfy the following property:

let (prefix, suffix) = unsafeCutUtf8 begin length t
in concat [prefix, unsafeSliceUtf8 begin length t, suffix] == t

The following diagram visualizes the relevant offsets for begin = CodeUnitIndex 2, length = CodeUnitIndex 6 and t = "BCDEFGHIJKL".

 off                 off+len
  │                     │
  ▼                     ▼
──┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬──
 A│B│C│D│E│F│G│H│I│J│K│L│M│N
──┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴──
      ▲           ▲
      │           │
 off+begin   off+begin+length

unsafeSliceUtf8 begin length t == "DEFGHI"
unsafeCutUtf8 begin length t == ("BC", "JKL")

The shown array is open at each end because in general, t may be a slice as well.

WARNING: As their names imply, these functions are not (necessarily) bounds-checked. Use at your own risk.

unsafeCutUtf8 Source #

Arguments

:: CodeUnitIndex	Starting position of substring.
-> CodeUnitIndex	Length of substring.
-> Text	Initial string.
-> (Text, Text)

unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text Source #

Functions on Arrays

Functions for working with Array values.

arrayContents :: Array -> Ptr Word8 Source #

See byteArrayContents.

isArrayPinned :: Array -> Bool Source #

See isByteArrayPinned.

unsafeIndexCodePoint' :: Array -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) Source #

Decode a code point at the given CodeUnitIndex. Returns garbage if there is no valid code point at that position. Does not perform bounds checking. See decode2, decode3 and decode4 for the expected format of multi-byte code points.

unsafeIndexCodeUnit' :: Array -> CodeUnitIndex -> CodeUnit Source #

data BackwardsIter Source #

Intermediate state when you're iterating backwards through a Utf8 text.

Constructors

BackwardsIter
Fields backwardsIterNext :: !CodeUnitIndex First byte to the left of the codepoint that we're focused on. This can be used with `unsafeIndexEndOfCodePoint'` to find the next codepoint. backwardsIterChar :: !CodePoint The codepoint that we're focused on backwardsIterEndOfChar :: !CodeUnitIndex Points to the last byte of the codepoint that we're focused on

unsafeIndexEndOfCodePoint' :: Array -> CodeUnitIndex -> BackwardsIter Source #

Similar to unsafeIndexCodePoint', but assumes that the given index is the end of a utf8 codepoint. It returns the decoded code point and the index _before_ the code point. The resulting index could be passed directly to unsafeIndexEndOfCodePoint' again to decode the _previous_ code point.

unsafeIndexAnywhereInCodePoint' :: Array -> CodeUnitIndex -> BackwardsIter Source #

General Functions

Re-exported from Text.

concat :: [Text] -> Text #

O(n) Concatenate a list of Texts.

dropWhile :: (Char -> Bool) -> Text -> Text #

O(n) dropWhile p t returns the suffix remaining after takeWhile p t.

isInfixOf :: Text -> Text -> Bool #

O(n+m) The isInfixOf function takes two Texts and returns True if and only if the first is contained, wholly and intact, anywhere within the second.

In (unlikely) bad cases, this function's time complexity degrades towards O(n*m).

null :: Text -> Bool #

O(1) Tests whether a Text is empty or not.

pack :: String -> Text #

O(n) Convert a String into a Text. Performs replacement on invalid scalar values, so unpack . pack is not id:

>>> Data.Text.unpack (pack "\55555")
"\65533"

replicate :: Int -> Text -> Text #

O(n*m) replicate n t is a Text consisting of the input t repeated n times.

unpack :: Text -> String #

O(n) Convert a Text into a String.

indices #

Arguments

:: Text	Substring to search for (`needle`)
-> Text	Text to search in (`haystack`)
-> [Int]

O(n+m) Find the offsets of all non-overlapping indices of needle within haystack.

In (unlikely) bad cases, this algorithm's complexity degrades towards O(n*m).