Safe Haskell	None
Language	Haskell2010

Data.Text.Utf8

Contents

Decoding
Indexing
Slicing Functions
Functions on Arrays
General Functions

Description

This module provides functions that allow treating Text values as series of UTF-8 code units instead of characters. Any calls to Text in alfred-margaret go through this module. Therefore we re-export some Text functions, e.g. concat.

Synopsis

type CodePoint = Char
type CodeUnit = Word8
newtype CodeUnitIndex = CodeUnitIndex {
- codeUnitIndex :: Int
}
data Text = Text !Array !Int !Int
fromByteList :: [Word8] -> Text
isCaseInvariant :: Text -> Bool
lengthUtf8 :: Text -> CodeUnitIndex
lowerCodePoint :: Char -> Char
lowerUtf8 :: Text -> Text
toLowerAscii :: Char -> Char
unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a]
unpackUtf8 :: Text -> [CodeUnit]
decode2 :: CodeUnit -> CodeUnit -> CodePoint
decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint
decodeUtf8 :: [CodeUnit] -> [CodePoint]
indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit
unsafeCutUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> (Text, Text)
unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text
arrayContents :: Array -> Ptr Word8
isArrayPinned :: Array -> Bool
unsafeIndexCodePoint' :: Array -> CodeUnitIndex -> (CodeUnitIndex, CodePoint)
unsafeIndexCodeUnit' :: Array -> CodeUnitIndex -> CodeUnit
concat :: [Text] -> Text
dropWhile :: (Char -> Bool) -> Text -> Text
isInfixOf :: Text -> Text -> Bool
null :: Text -> Bool
pack :: String -> Text
replicate :: Int -> Text -> Text
unpack :: Text -> String
indices :: Text -> Text -> [Int]

Documentation

type CodePoint = Char Source #

A Unicode code point.

type CodeUnit = Word8 Source #

A UTF-8 code unit is a byte. A Unicode code point can be encoded as up to four code units.

newtype CodeUnitIndex Source #

An index into the raw UTF-8 data of a Text. This is not the code point index as conventionally accepted by Text, so we wrap it to avoid confusing the two. Incorrect index manipulation can lead to multi-byte code points being sliced, so manipulate indices with care. This type is also used for lengths.

Constructors

CodeUnitIndex
Fields codeUnitIndex :: Int

Instances

Instances details

Bounded CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods minBound :: CodeUnitIndex # maxBound :: CodeUnitIndex #
Eq CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods (==) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (/=) :: CodeUnitIndex -> CodeUnitIndex -> Bool #
Num CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods (+) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (-) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # (*) :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # negate :: CodeUnitIndex -> CodeUnitIndex # abs :: CodeUnitIndex -> CodeUnitIndex # signum :: CodeUnitIndex -> CodeUnitIndex # fromInteger :: Integer -> CodeUnitIndex #
Ord CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods compare :: CodeUnitIndex -> CodeUnitIndex -> Ordering # (<) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (<=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>) :: CodeUnitIndex -> CodeUnitIndex -> Bool # (>=) :: CodeUnitIndex -> CodeUnitIndex -> Bool # max :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex # min :: CodeUnitIndex -> CodeUnitIndex -> CodeUnitIndex #
Show CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods showsPrec :: Int -> CodeUnitIndex -> ShowS # show :: CodeUnitIndex -> String # showList :: [CodeUnitIndex] -> ShowS #
Generic CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Associated Types type Rep CodeUnitIndex :: Type -> Type # Methods from :: CodeUnitIndex -> Rep CodeUnitIndex x # to :: Rep CodeUnitIndex x -> CodeUnitIndex #
Hashable CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods hashWithSalt :: Int -> CodeUnitIndex -> Int # hash :: CodeUnitIndex -> Int #
ToJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods toJSON :: CodeUnitIndex -> Value # toEncoding :: CodeUnitIndex -> Encoding # toJSONList :: [CodeUnitIndex] -> Value # toEncodingList :: [CodeUnitIndex] -> Encoding #
FromJSON CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods parseJSON :: Value -> Parser CodeUnitIndex # parseJSONList :: Value -> Parser [CodeUnitIndex] #
NFData CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 Methods rnf :: CodeUnitIndex -> () #
type Rep CodeUnitIndex Source #
Instance details Defined in Data.Text.Utf8 type Rep CodeUnitIndex = D1 ('MetaData "CodeUnitIndex" "Data.Text.Utf8" "alfred-margaret-2.0.0.0-AHkhEWrSlE7G50v2ifxOcH" 'True) (C1 ('MetaCons "CodeUnitIndex" 'PrefixI 'True) (S1 ('MetaSel ('Just "codeUnitIndex") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int)))

data Text #

A space efficient, packed, unboxed Unicode text type.

Constructors

Text !Array !Int !Int

Instances

Instances details

Hashable Text
Instance details Defined in Data.Hashable.Class Methods hashWithSalt :: Int -> Text -> Int # hash :: Text -> Int #
ToJSON Text
Instance details Defined in Data.Aeson.Types.ToJSON Methods toJSON :: Text -> Value # toEncoding :: Text -> Encoding # toJSONList :: [Text] -> Value # toEncodingList :: [Text] -> Encoding #
ToJSONKey Text
Instance details Defined in Data.Aeson.Types.ToJSON Methods toJSONKey :: ToJSONKeyFunction Text # toJSONKeyList :: ToJSONKeyFunction [Text] #
FromJSON Text
Instance details Defined in Data.Aeson.Types.FromJSON Methods parseJSON :: Value -> Parser Text # parseJSONList :: Value -> Parser [Text] #
FromJSONKey Text
Instance details Defined in Data.Aeson.Types.FromJSON Methods fromJSONKey :: FromJSONKeyFunction Text # fromJSONKeyList :: FromJSONKeyFunction [Text] #
Chunk Text
Instance details Defined in Data.Attoparsec.Internal.Types Associated Types type ChunkElem Text # Methods nullChunk :: Text -> Bool # pappendChunk :: State Text -> Text -> State Text # atBufferEnd :: Text -> State Text -> Pos # bufferElemAt :: Text -> Pos -> State Text -> Maybe (ChunkElem Text, Int) # chunkElemToChar :: Text -> ChunkElem Text -> Char #
type State Text
Instance details Defined in Data.Attoparsec.Internal.Types type State Text = Buffer
type ChunkElem Text
Instance details Defined in Data.Attoparsec.Internal.Types type ChunkElem Text = Char
type Item Text
Instance details Defined in Data.Text type Item Text = Char

fromByteList :: [Word8] -> Text Source #

isCaseInvariant :: Text -> Bool Source #

Return whether the text is the same in lowercase as in uppercase. This function will not return true for text on which Aho–Corasick would differentiate when doing case-insensitive matching.

lengthUtf8 :: Text -> CodeUnitIndex Source #

The return value of this function is not really an index. However the signature is supposed to make it clear that the length is returned in terms of code units, not code points.

lowerCodePoint :: Char -> Char Source #

Lowercase a Unicode code point. Uses toLowerAscii for ASCII and toLower otherwise.

lowerUtf8 :: Text -> Text Source #

Lowercase a Text by applying lowerCodePoint to each Char.

toLowerAscii :: Char -> Char Source #

Lower-case the ASCII code points A-Z and leave the rest of ASCII intact.

unicode2utf8 :: (Ord a, Num a, Bits a) => a -> [a] Source #

Convert a Unicode Code Point c into a list of UTF-8 code units (bytes).

unpackUtf8 :: Text -> [CodeUnit] Source #

Decoding

Functions that turn code unit sequences into code point sequences.

decode2 :: CodeUnit -> CodeUnit -> CodePoint Source #

Decode 2 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┐
│1 1 0 x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┘

decode3 :: CodeUnit -> CodeUnit -> CodeUnit -> CodePoint Source #

Decode 3 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┬───────────────┐
│1 1 1 0 x x x x│1 0 x x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┴───────────────┘

decode4 :: CodeUnit -> CodeUnit -> CodeUnit -> CodeUnit -> CodePoint Source #

Decode 4 UTF-8 code units into their code point. The given code units should have the following format:

┌───────────────┬───────────────┬───────────────┬───────────────┐
│1 1 1 1 0 x x x│1 0 x x x x x x│1 0 x x x x x x│1 0 x x x x x x│
└───────────────┴───────────────┴───────────────┴───────────────┘

decodeUtf8 :: [CodeUnit] -> [CodePoint] Source #

Decode a list of UTF-8 code units into a list of code points.

Indexing

Text can be indexed by code units or code points. A CodePoint is a 21-bit Unicode code point and can consist of up to four code units. A CodeUnit is a single byte.

indexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit Source #

Get the code unit at the given CodeUnitIndex. Performs bounds checking.

unsafeIndexCodePoint :: Text -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) Source #

Does exactly the same thing as unsafeIndexCodePoint', but on Text values.

unsafeIndexCodeUnit :: Text -> CodeUnitIndex -> CodeUnit Source #

Slicing Functions

unsafeCutUtf8 and unsafeSliceUtf8 are used to retrieve slices of Text values. unsafeSliceUtf8 begin length returns a substring of length length starting at begin. unsafeCutUtf8 begin length returns a tuple of the "surrounding" substrings.

They satisfy the following property:

let (prefix, suffix) = unsafeCutUtf8 begin length t
in concat [prefix, unsafeSliceUtf8 begin length t, suffix] == t

The following diagram visualizes the relevant offsets for begin = CodeUnitIndex 2, length = CodeUnitIndex 6 and t = "BCDEFGHIJKL".

 off                 off+len
  │                     │
  ▼                     ▼
──┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬──
 A│B│C│D│E│F│G│H│I│J│K│L│M│N
──┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴──
      ▲           ▲
      │           │
 off+begin   off+begin+length

unsafeSliceUtf8 begin length t == "DEFGHI"
unsafeCutUtf8 begin length t == ("BC", "JKL")

The shown array is open at each end because in general, t may be a slice as well.

WARNING: As their name implies, these functions are not (necessarily) bounds-checked. Use at your own risk.

unsafeCutUtf8 Source #

Arguments

:: CodeUnitIndex	Starting position of substring.
-> CodeUnitIndex	Length of substring.
-> Text	Initial string.
-> (Text, Text)

unsafeSliceUtf8 :: CodeUnitIndex -> CodeUnitIndex -> Text -> Text Source #

Functions on Arrays

Functions for working with Array values.

arrayContents :: Array -> Ptr Word8 Source #

See byteArrayContents.

isArrayPinned :: Array -> Bool Source #

See isByteArrayPinned.

unsafeIndexCodePoint' :: Array -> CodeUnitIndex -> (CodeUnitIndex, CodePoint) Source #

Decode a code point at the given CodeUnitIndex. Returns garbage if there is no valid code point at that position. Does not perform bounds checking. See decode2, decode3 and decode4 for the expected format of multi-byte code points.

unsafeIndexCodeUnit' :: Array -> CodeUnitIndex -> CodeUnit Source #

General Functions

Re-exported from Text.

concat :: [Text] -> Text #

O(n) Concatenate a list of Texts.

dropWhile :: (Char -> Bool) -> Text -> Text #

O(n) dropWhile p t returns the suffix remaining after takeWhile p t.

isInfixOf :: Text -> Text -> Bool #

O(n+m) The isInfixOf function takes two Texts and returns True iff the first is contained, wholly and intact, anywhere within the second.

In (unlikely) bad cases, this function's time complexity degrades towards O(n*m).

null :: Text -> Bool #

O(1) Tests whether a Text is empty or not.

pack :: String -> Text #

O(n) Convert a String into a Text. Performs replacement on invalid scalar values.

replicate :: Int -> Text -> Text #

O(n*m) replicate n t is a Text consisting of the input t repeated n times.

unpack :: Text -> String #

O(n) Convert a Text into a String.

indices #

Arguments

:: Text	Substring to search for (`needle`)
-> Text	Text to search in (`haystack`)
-> [Int]

O(n+m) Find the offsets of all non-overlapping indices of needle within haystack.

In (unlikely) bad cases, this algorithm's complexity degrades towards O(n*m).