{-# LANGUAGE GeneralizedNewtypeDeriving, BangPatterns #-} -- | A module for efficiently constructing a 'Builder'. This module offers more -- functions than the standard ones, optimized for HTML generation. -- -- -- SM: General remark: Try to split it into Utf8 specific parts and a -- HtmlBuilder using it. Essentially, a UTF-8 builder is a Text builder that -- uses UTF-8 for its internal representation. The Text builder from Tom -- Harper would then be called Utf16Builder. They should offer exactly the -- same interface (except perhaps for the extraction functions.) -- module Text.Blaze.Internal.Utf8Builder ( -- * The Utf8Builder type. Utf8Builder -- * Creating Builders from various text representations. , fromChar , fromText , fromString -- * Creating Builders from ByteStrings. , unsafeFromByteString -- * Transformations on the builder. , optimizePiece -- * Extracting the value from the builder. , toLazyByteString , toText -- * Internal functions to extend the builder. -- ** The write type. , Write , fromUnsafeWrite , optimizeWriteBuilder , writeList -- ** Functions to create a write. , writeChar , writeByteString ) where import Foreign import Data.Char (ord) import Data.Monoid (Monoid (..)) import Prelude hiding (quot) import Debug.Trace (trace) import Data.Binary.Builder (Builder) import qualified Data.Binary.Builder as B import qualified Data.ByteString as S import qualified Data.ByteString.Internal as S import qualified Data.ByteString.Lazy as L import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Encoding as T -- | A newtype definition for the UTF-8 builder monoid. newtype Utf8Builder = Utf8Builder Builder deriving (Monoid) -- | /O(1)./ Convert a Haskell character to a 'Utf8Builder', without doing any -- escaping. -- fromChar :: Char -> Utf8Builder fromChar = fromUnsafeWrite . writeChar -- | /O(n)./ Convert a 'Text' value to a 'Utf8Builder'. This function will not -- do any HTML escaping. -- fromText :: Text -> Utf8Builder fromText text = fromUnsafeWrite $ T.foldl (\w c -> w `mappend` writeChar c) mempty text -- | /O(n)./ Convert a Haskell 'String' to a builder. Unlike 'fromHtmlString', -- this function will not do any escaping. -- fromString :: String -> Utf8Builder fromString = writeList writeChar -- fromUnsafeWrite $ -- foldl (\w c -> w `mappend` writeChar c) mempty string -- | /O(n)./ A Builder taking a 'S.ByteString`, copying it. This function is -- considered unsafe, as a `S.ByteString` can contain invalid UTF-8 bytes, so -- you chould use it with caution. This function should perform better when -- dealing with small strings than the fromByteString function from Builder. -- unsafeFromByteString :: S.ByteString -> Utf8Builder unsafeFromByteString = fromUnsafeWrite . writeByteString -- | /O(n)./ Optimize a small builder. This function has an initial speed -- penalty, but will speed up later calls of the optimized builder piece. This -- speedup will only work well for small builders (less than 1k characters). -- optimizePiece :: Utf8Builder -> Utf8Builder optimizePiece = fromUnsafeWrite . optimizeWriteBuilder {-# INLINE optimizePiece #-} -- | /O(n)./ Convert the builder to a 'L.ByteString'. -- toLazyByteString :: Utf8Builder -> L.ByteString toLazyByteString (Utf8Builder builder) = B.toLazyByteString builder -- | /O(n)./ Convert the builder to a 'Text' value. Please note that this -- function is a lot slower than the 'toLazyByteString' function. -- toText :: Utf8Builder -> Text toText = T.concat . map T.decodeUtf8 . L.toChunks . toLazyByteString -- | Abstract representation of a write action to the internal buffer. -- data Write = Write {-# UNPACK #-} !Int (Ptr Word8 -> IO ()) -- Create a monoid interface for the write actions. instance Monoid Write where mempty = Write 0 (const $ return ()) {-# INLINE mempty #-} mappend (Write l1 f1) (Write l2 f2) = Write (l1 + l2) (\ptr -> f1 ptr >> f2 (ptr `plusPtr` l1)) {-# INLINE mappend #-} -- INV: The writes must be smaller than the default buffer size. -- -- SM: Note that moving the control flow away from the Builder will give us the -- next level of speed. This way we have simple tail-recursive functions -- consuming data and filling the buffer. writeList :: (a -> Write) -> [a] -> Utf8Builder writeList f xs0 = Utf8Builder $ B.fillBuffer (go xs0 0) where go [] !w !l !p = return (w, Nothing) -- here should come the call to the next filler. go xs@(x:xs') !w !l !p = case f x of Write n g | n <= l -> do g p go xs' (w+n) (l-n) (p `plusPtr` n) | otherwise -> return (w, Just (B.forceNewBuffer `mappend` B.fillBuffer (go xs 0))) {-# INLINE go #-} {-# INLINE writeList #-} -- | Create a builder from a write. -- fromUnsafeWrite :: Write -- ^ Write to execute. -> Utf8Builder -- ^ Resulting builder. fromUnsafeWrite (Write l f) = Utf8Builder $ B.fromUnsafeWrite l f {-# INLINE fromUnsafeWrite #-} -- | Optimize a small builder to a write operation. -- optimizeWriteBuilder :: Utf8Builder -- ^ Small builder to optimize. -> Write -- ^ Resulting write. optimizeWriteBuilder = writeByteString . mconcat . L.toChunks . toLazyByteString {-# INLINE optimizeWriteBuilder #-} -- | Write a 'S.ByteString' to the builder. -- writeByteString :: S.ByteString -- ^ ByteString to write. -> Write -- ^ Resulting write. writeByteString byteString = Write l f where (fptr, o, l) = S.toForeignPtr byteString f dst = do copyBytes dst (unsafeForeignPtrToPtr fptr `plusPtr` o) l touchForeignPtr fptr {-# INLINE f #-} {-# INLINE writeByteString #-} -- | Write a Unicode character, encoding it as UTF-8. -- writeChar :: Char -- ^ Character to write. -> Write -- ^ Resulting write. writeChar = encodeCharUtf8 f1 f2 f3 f4 where f1 x = Write 1 $ \ptr -> poke ptr x f2 x1 x2 = Write 2 $ \ptr -> do poke ptr x1 poke (ptr `plusPtr` 1) x2 f3 x1 x2 x3 = Write 3 $ \ptr -> do poke ptr x1 poke (ptr `plusPtr` 1) x2 poke (ptr `plusPtr` 2) x3 f4 x1 x2 x3 x4 = Write 4 $ \ptr -> do poke ptr x1 poke (ptr `plusPtr` 1) x2 poke (ptr `plusPtr` 2) x3 poke (ptr `plusPtr` 3) x4 {-# INLINE writeChar #-} -- | Encode a Unicode character to another datatype, using UTF-8. This function -- acts as an abstract way of encoding characters, as it is unaware of what -- needs to happen with the resulting bytes: you have to specify functions to -- deal with those. -- encodeCharUtf8 :: (Word8 -> a) -- ^ 1-byte UTF-8. -> (Word8 -> Word8 -> a) -- ^ 2-byte UTF-8. -> (Word8 -> Word8 -> Word8 -> a) -- ^ 3-byte UTF-8. -> (Word8 -> Word8 -> Word8 -> Word8 -> a) -- ^ 4-byte UTF-8. -> Char -- ^ Input 'Char'. -> a -- ^ Result. encodeCharUtf8 f1 f2 f3 f4 c = case ord c of x | x <= 0x7F -> f1 $ fromIntegral x | x <= 0x07FF -> let x1 = fromIntegral $ (x `shiftR` 6) + 0xC0 x2 = fromIntegral $ (x .&. 0x3F) + 0x80 in f2 x1 x2 | x <= 0xFFFF -> let x1 = fromIntegral $ (x `shiftR` 12) + 0xE0 x2 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80 x3 = fromIntegral $ (x .&. 0x3F) + 0x80 in f3 x1 x2 x3 | otherwise -> let x1 = fromIntegral $ (x `shiftR` 18) + 0xF0 x2 = fromIntegral $ ((x `shiftR` 12) .&. 0x3F) + 0x80 x3 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80 x4 = fromIntegral $ (x .&. 0x3F) + 0x80 in f4 x1 x2 x3 x4 {-# INLINE encodeCharUtf8 #-}