-- | -- Module : Foundation.Primitive.UTF8.Helper -- License : BSD-style -- Maintainer : Foundation -- -- Some low level helpers to use UTF8 -- -- Most helpers are lowlevel and unsafe, don't use -- directly. {-# LANGUAGE BangPatterns #-} {-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE TypeFamilies #-} {-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE CPP #-} module Foundation.Primitive.UTF8.Helper where import Foundation.Internal.Base import Foundation.Internal.Primitive import Foundation.Bits import Foundation.Primitive.Types.OffsetSize import Foundation.Numerical import Foundation.Primitive.Types import GHC.Prim import GHC.Types import GHC.Word -- same as nextAscii but with a ByteArray# nextAsciiBA :: ByteArray# -> Offset8 -> (# Word8, Bool #) nextAsciiBA ba n = (# w, not (testBit w 7) #) where !w = primBaIndex ba n {-# INLINE nextAsciiBA #-} -- same as nextAscii but with a ByteArray# nextAsciiPtr :: Ptr Word8 -> Offset8 -> (# Word8, Bool #) nextAsciiPtr (Ptr addr) n = (# w, not (testBit w 7) #) where !w = primAddrIndex addr n {-# INLINE nextAsciiPtr #-} -- | nextAsciiBa specialized to get a digit between 0 and 9 (included) nextAsciiDigitBA :: ByteArray# -> Offset8 -> (# Word8, Bool #) nextAsciiDigitBA ba n = (# d, d < 0xa #) where !d = primBaIndex ba n - 0x30 {-# INLINE nextAsciiDigitBA #-} nextAsciiDigitPtr :: Ptr Word8 -> Offset8 -> (# Word8, Bool #) nextAsciiDigitPtr (Ptr addr) n = (# d, d < 0xa #) where !d = primAddrIndex addr n - 0x30 {-# INLINE nextAsciiDigitPtr #-} expectAsciiBA :: ByteArray# -> Offset8 -> Word8 -> Bool expectAsciiBA ba n v = primBaIndex ba n == v {-# INLINE expectAsciiBA #-} expectAsciiPtr :: Ptr Word8 -> Offset8 -> Word8 -> Bool expectAsciiPtr (Ptr ptr) n v = primAddrIndex ptr n == v {-# INLINE expectAsciiPtr #-} -- | Different way to encode a Character in UTF8 represented as an ADT data UTF8Char = UTF8_1 {-# UNPACK #-} !Word8 | UTF8_2 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 | UTF8_3 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 | UTF8_4 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 {-# UNPACK #-} !Word8 -- | Transform a Unicode code point 'Char' into -- -- note that we expect here a valid unicode code point in the *allowed* range. -- bits will be lost if going above 0x10ffff asUTF8Char :: Char -> UTF8Char asUTF8Char !c | bool# (ltWord# x 0x80## ) = encode1 | bool# (ltWord# x 0x800## ) = encode2 | bool# (ltWord# x 0x10000##) = encode3 | otherwise = encode4 where !(I# xi) = fromEnum c !x = int2Word# xi encode1 = UTF8_1 (W8# x) encode2 = let !x1 = W8# (or# (uncheckedShiftRL# x 6#) 0xc0##) !x2 = toContinuation x in UTF8_2 x1 x2 encode3 = let !x1 = W8# (or# (uncheckedShiftRL# x 12#) 0xe0##) !x2 = toContinuation (uncheckedShiftRL# x 6#) !x3 = toContinuation x in UTF8_3 x1 x2 x3 encode4 = let !x1 = W8# (or# (uncheckedShiftRL# x 18#) 0xf0##) !x2 = toContinuation (uncheckedShiftRL# x 12#) !x3 = toContinuation (uncheckedShiftRL# x 6#) !x4 = toContinuation x in UTF8_4 x1 x2 x3 x4 toContinuation :: Word# -> Word8 toContinuation w = W8# (or# (and# w 0x3f##) 0x80##) {-# INLINE toContinuation #-} -- given the encoding of UTF8 Char, get the number of bytes of this sequence numBytes :: UTF8Char -> Size8 numBytes UTF8_1{} = Size 1 numBytes UTF8_2{} = Size 2 numBytes UTF8_3{} = Size 3 numBytes UTF8_4{} = Size 4 -- given the leading byte of a utf8 sequence, get the number of bytes of this sequence skipNextHeaderValue :: Word8 -> Size Word8 skipNextHeaderValue !x | x < 0xC0 = Size 1 -- 0b11000000 | x < 0xE0 = Size 2 -- 0b11100000 | x < 0xF0 = Size 3 -- 0b11110000 | otherwise = Size 4 {-# INLINE skipNextHeaderValue #-} charToBytes :: Int -> Size8 charToBytes c | c < 0x80 = Size 1 | c < 0x800 = Size 2 | c < 0x10000 = Size 3 | c < 0x110000 = Size 4 | otherwise = error ("invalid code point: " `mappend` show c)