-- | Basic types and operators on Text objects. module Data.Text.Base export { -- Runtime operators. makeTextLit; sizeOfTextLit; indexTextLit; -- Construction. textOfChar; textOfWord8; textOfVector; -- Conversion. vectorOfText; -- Projections. sizeOfText; indexText; -- Operators. eqText; } import Data.Text.Char import Data.Numeric.Nat import Data.Numeric.Word import Data.Maybe ------------------------------------------------------------------------------- -- Foreign imports. -- | A TextLit is a boxed object that contains a pointer to literal -- UTF-8 data in static memory. import foreign boxed type TextLit : Data -- | Runtime operators for working with unboxed text literals. import foreign c value -- | Make a text literal. -- The TextLit# type is a pointer to literal UTF-8 data in -- static memory, which we pack into a boxed object. makeTextLit : TextLit# -> TextLit -- | Get the size of a boxed text literal. sizeOfTextLit : TextLit -> Nat# -- | Get a single byte from a boxed text literal. -- This is a byte rather than a character, -- as UTF-8 encoded characters can consist of multiple bytes. indexTextLit : TextLit -> Nat# -> Word8# -- | Top level region containing text vectors. -- All our dynamic character data is in this region. import foreign abstract type RegionText : Region -- | Capabilities to allocate and read top-level text vectors. import foreign abstract capability capTopTextAlloc : Alloc RegionText capTopTextRead : Read RegionText where -- Text ----------------------------------------------------------------------- data Text where -- | Wrap a text literal into a text object. -- The characterdata is stored in static memory. TextLit : TextLit -> Text -- | Wrap some character data into a text object. -- The character data is stored on the heap. TextVec : Vector# RegionText Word8# -> Text -- | Append two text objects. TextApp : Text -> Text -> Text -- Construction --------------------------------------------------------------- -- | O(1). Wrap a single character into a text object. -- We're only taking the lowest 8 bit bytes, -- rather than doing proper UTF-8 encoding. textOfChar (c: Word32): Text = textOfWord8 (truncate# c) -- | O(1). Wrap a single byte into a text object. textOfWord8 (w8: Word8): Text = TextVec $ extend RegionText using r1 with { Alloc r1; Write r1 } in do -- Allocate the vector to hold the data, -- including an extra null terminator byte. vec = vectorAlloc# [r1] 2 -- Write the character. vectorWrite# vec 0 w8 -- Write the null terminator. vectorWrite# vec 1 0w8 vec -- | O(1). Wrap a vector of UTF-8 data into a text object. textOfVector (vec: Vector# RegionText Word8): Text = TextVec vec -- Conversion ----------------------------------------------------------------- -- | Copy a Text object into a flat vector of UTF-8 data. vectorOfText [r1: Region] (tt: Text) : S (Alloc r1) (Vector# r1 Word8) = extend r1 using r2 with { Alloc r2; Write r2 } in do -- Allocate a vector to hold all the data, -- including an extra null terminator byte. vec = vectorAlloc# [r2] (add (sizeOfText tt) 1) -- Copy the text data into the vector. iEnd = copyTextToVector tt vec 0 -- Write the null terminator. vectorWrite# vec iEnd 0w8 vec -- | Copy a text object to a mutable vector of UTF-8 data. copyTextToVector [r: Region] (tt: Text) (vec: Vector# r Word8) (i0: Nat) : S (Write r) Nat = case tt of TextLit lit -> copyTextLitToVector lit vec i0 0 (sizeOfTextLit lit) TextVec vec2 -> copyTextVecToVector vec2 vec i0 0 (vectorLength# vec2 - 1) TextApp t1 t2 -> do i1 = copyTextToVector t1 vec i0 i2 = copyTextToVector t2 vec i1 i2 -- | Copy a text literal to a mutable vector of UTF-8 data. copyTextLitToVector [r: Region] (tt: TextLit) (vec: Vector# r Word8) (iDst iSrc nSrc: Nat) : S (Write r) Nat | iSrc >= nSrc = iDst | otherwise = do vectorWrite# vec iDst (indexTextLit tt iSrc) copyTextLitToVector tt vec (iDst + 1) (iSrc + 1) nSrc -- | Copy a text source vector to a mutable vector of UTF-8 data. copyTextVecToVector [r1 r2: Region] (vecSrc: Vector# r1 Word8) (vecDst: Vector# r2 Word8) (iDst iSrc nSrc: Nat) : S (Read r1 + Write r2) Nat | iSrc >= nSrc = iDst | otherwise = do vectorWrite# vecDst iDst (vectorRead# vecSrc iSrc) copyTextVecToVector vecSrc vecDst (iDst + 1) (iSrc + 1) nSrc -- Projections ---------------------------------------------------------------- -- | Get the size of the utf8 data in a Text object, in bytes. -- -- * This is NOT the same as the length of the text string in characters, -- as single characters can be encoded using multiple bytes. -- sizeOfText (tt: Text): Nat = case tt of TextLit lit -> sizeOfTextLit lit -- The size of a text vector is the vector size minus -- the null terminator byte. TextVec vec -> vectorLength# vec - 1 TextApp t1 t2 -> sizeOfText t1 + sizeOfText t2 -- | Get a single word8 character from a Text object. indexText (tt: Text) (ix: Nat): Maybe Word8 = case tt of TextLit lit | ix >= sizeOfTextLit lit -> Nothing | otherwise -> Just (indexTextLit lit ix) TextVec vec | ix >= vectorLength# vec - 1 -> Nothing | otherwise -> Just (vectorRead# vec ix) TextApp t1 t2 -> case indexText t1 ix of Just x -> Just x Nothing -> indexText t2 (ix - sizeOfText t1) -- Comparison ----------------------------------------------------------------- -- | O(max len1 len2). -- Check if two text objects represent the same characters. eqText (tx1 tx2: Text): Bool = private r with {Read r; Alloc r} in do -- We copy both strings into new vectors before doing the comparison. -- It would be better to do it in-place. vec1 = vectorOfText [r] tx1 len1 = vectorLength# vec1 vec2 = vectorOfText [r] tx2 len2 = vectorLength# vec2 match | len1 /= len2 = False | len1 == 0 = True | otherwise = go vec1 vec2 len1 0 where go (vec1 vec2: Vector# r Word8) (len: Nat) (ix: Nat): S (Read r) Bool | ix >= (len - 1) = True | not (eq# (vectorRead# vec1 ix) (vectorRead# vec2 ix)) = False | otherwise = go vec1 vec2 len (ix + 1)