-- | -- Module : Foundation.String.Encoding.UTF16 -- License : BSD-style -- Maintainer : Foundation -- Stability : experimental -- Portability : portable -- {-# LANGUAGE MagicHash #-} module Foundation.String.Encoding.UTF16 ( UTF16(..) , UTF16_Invalid(..) ) where import Foundation.Internal.Base import Foundation.Primitive.Types.OffsetSize import Foundation.Primitive.Monad import GHC.Prim import GHC.Word import GHC.Types import Foundation.Numerical import Data.Bits import qualified Prelude import Foundation.Array.Unboxed import Foundation.Array.Unboxed.Mutable (MUArray) import Foundation.Boot.Builder import Foundation.String.Encoding.Encoding data UTF16_Invalid = InvalidContinuation | InvalidUnicode Char deriving (Show, Eq, Typeable) instance Exception UTF16_Invalid data UTF16 = UTF16 instance Encoding UTF16 where type Unit UTF16 = Word16 type Error UTF16 = UTF16_Invalid encodingNext _ = next encodingWrite _ = write -- -- U+0000 to U+D7FF and U+E000 to U+FFFF : 1 bytes -- U+10000 to U+10FFFF : -- * 0x010000 is subtracted from the code point, leaving a 20-bit number in the range 0..0x0FFFFF. -- * The top ten bits (a number in the range 0..0x03FF) are added to 0xD800 to give the first 16-bit code unit -- or high surrogate, which will be in the range 0xD800..0xDBFF. -- * The low ten bits (also in the range 0..0x03FF) are added to 0xDC00 to give the second 16-bit code unit -- or low surrogate, which will be in the range 0xDC00..0xDFFF. next :: (Offset Word16 -> Word16) -> Offset Word16 -> Either UTF16_Invalid (Char, Offset Word16) next getter off | h < 0xd800 = Right (toChar hh, off + Offset 1) | h >= 0xe000 = Right (toChar hh, off + Offset 1) | otherwise = nextContinuation where h :: Word16 !h@(W16# hh) = getter off toChar :: Word# -> Char toChar w = C# (chr# (word2Int# w)) to32 :: Word16 -> Word32 to32 (W16# w) = W32# w nextContinuation | cont >= 0xdc00 && cont < 0xe00 = let !(W32# w) = ((to32 h .&. 0x3ff) `shiftL` 10) .|. (to32 cont .&. 0x3ff) in Right (toChar w, off + Offset 2) | otherwise = Left InvalidContinuation where cont :: Word16 !cont = getter $ off + Offset 1 write :: (PrimMonad st, Monad st) => Char -> Builder (UArray Word16) (MUArray Word16) Word16 st () write c | c < toEnum 0xd800 = builderAppend $ w16 c | c > toEnum 0x10000 = let (w1, w2) = wHigh c in builderAppend w1 >> builderAppend w2 | c > toEnum 0x10ffff = throw $ InvalidUnicode c | c >= toEnum 0xe000 = builderAppend $ w16 c | otherwise = throw $ InvalidUnicode c where w16 :: Char -> Word16 w16 (C# ch) = W16# (int2Word# (ord# ch)) to16 :: Word32 -> Word16 to16 = Prelude.fromIntegral wHigh :: Char -> (Word16, Word16) wHigh (C# ch) = let v = W32# (minusWord# (int2Word# (ord# ch)) 0x10000##) in (0xdc00 .|. to16 (v `shiftR` 10), 0xd800 .|. to16 (v .&. 0x3ff))