{-# LANGUAGE MagicHash #-} -- | -- Module : Data.Text.Encoding.Utf16 -- Copyright : (c) Tom Harper 2008-2009, -- (c) Bryan O'Sullivan 2009, -- (c) Duncan Coutts 2009 -- -- License : BSD-style -- Maintainer : rtharper@aftereternity.co.uk, bos@serpentine.com, -- duncan@haskell.org -- Stability : experimental -- Portability : GHC -- -- Basic UTF-8 validation and character manipulation. module Data.Text.Encoding.Utf8 ( -- Decomposition ord2 , ord3 , ord4 -- Construction , chr2 , chr3 , chr4 -- * Validation , validate1 , validate2 , validate3 , validate4 ) where import Control.Exception (assert) import Data.Char (ord) import Data.Bits ((.&.)) import Data.Text.UnsafeShift (shiftR) import GHC.Exts import GHC.Word (Word8(..)) default(Int) between :: Word8 -- ^ byte to check -> Word8 -- ^ lower bound -> Word8 -- ^ upper bound -> Bool between x y z = x >= y && x <= z {-# INLINE between #-} ord2 :: Char -> (Word8,Word8) ord2 c = assert (n >= 0x80 && n <= 0x07ff) (x1,x2) where n = ord c x1 = fromIntegral $ (n `shiftR` 6) + 0xC0 x2 = fromIntegral $ (n .&. 0x3F) + 0x80 ord3 :: Char -> (Word8,Word8,Word8) ord3 c = assert (n >= 0x0800 && n <= 0xffff) (x1,x2,x3) where n = ord c x1 = fromIntegral $ (n `shiftR` 12) + 0xE0 x2 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 x3 = fromIntegral $ (n .&. 0x3F) + 0x80 ord4 :: Char -> (Word8,Word8,Word8,Word8) ord4 c = assert (n >= 0x10000) (x1,x2,x3,x4) where n = ord c x1 = fromIntegral $ (n `shiftR` 18) + 0xF0 x2 = fromIntegral $ ((n `shiftR` 12) .&. 0x3F) + 0x80 x3 = fromIntegral $ ((n `shiftR` 6) .&. 0x3F) + 0x80 x4 = fromIntegral $ (n .&. 0x3F) + 0x80 chr2 :: Word8 -> Word8 -> Char chr2 (W8# x1#) (W8# x2#) = C# (chr# (z1# +# z2#)) where y1# = word2Int# x1# y2# = word2Int# x2# z1# = uncheckedIShiftL# (y1# -# 0xC0#) 6# z2# = y2# -# 0x80# {-# INLINE chr2 #-} chr3 :: Word8 -> Word8 -> Word8 -> Char chr3 (W8# x1#) (W8# x2#) (W8# x3#) = C# (chr# (z1# +# z2# +# z3#)) where y1# = word2Int# x1# y2# = word2Int# x2# y3# = word2Int# x3# z1# = uncheckedIShiftL# (y1# -# 0xE0#) 12# z2# = uncheckedIShiftL# (y2# -# 0x80#) 6# z3# = y3# -# 0x80# {-# INLINE chr3 #-} chr4 :: Word8 -> Word8 -> Word8 -> Word8 -> Char chr4 (W8# x1#) (W8# x2#) (W8# x3#) (W8# x4#) = C# (chr# (z1# +# z2# +# z3# +# z4#)) where y1# = word2Int# x1# y2# = word2Int# x2# y3# = word2Int# x3# y4# = word2Int# x4# z1# = uncheckedIShiftL# (y1# -# 0xF0#) 18# z2# = uncheckedIShiftL# (y2# -# 0x80#) 12# z3# = uncheckedIShiftL# (y3# -# 0x80#) 6# z4# = y4# -# 0x80# {-# INLINE chr4 #-} validate1 :: Word8 -> Bool validate1 x1 = between x1 0x00 0x7F {-# INLINE validate1 #-} validate2 :: Word8 -> Word8 -> Bool validate2 x1 x2 = between x1 0xC2 0xDF && between x2 0x80 0xBF {-# INLINE validate2 #-} validate3 :: Word8 -> Word8 -> Word8 -> Bool {-# INLINE validate3 #-} validate3 x1 x2 x3 = validate3_1 || validate3_2 || validate3_3 || validate3_4 where validate3_1 = (x1 == 0xE0) && between x2 0xA0 0xBF && between x3 0x80 0xBF validate3_2 = between x1 0xE1 0xEC && between x2 0x80 0xBF && between x3 0x80 0xBF validate3_3 = x1 == 0xED && between x2 0x80 0x9F && between x3 0x80 0xBF validate3_4 = between x1 0xEE 0xEF && between x2 0x80 0xBF && between x3 0x80 0xBF validate4 :: Word8 -> Word8 -> Word8 -> Word8 -> Bool {-# INLINE validate4 #-} validate4 x1 x2 x3 x4 = validate4_1 || validate4_2 || validate4_3 where validate4_1 = x1 == 0xF0 && between x2 0x90 0xBF && between x3 0x80 0xBF && between x4 0x80 0xBF validate4_2 = between x1 0xF1 0xF3 && between x2 0x80 0xBF && between x3 0x80 0xBF && between x4 0x80 0xBF validate4_3 = x1 == 0xF4 && between x2 0x80 0x8F && between x3 0x80 0xBF && between x4 0x80 0xBF