-- | -- Module : Foundation.String.Encoding.UTF16 -- License : BSD-style -- Maintainer : Foundation -- Stability : experimental -- Portability : portable -- {-# LANGUAGE MagicHash #-} module Foundation.String.Encoding.UTF16 ( UTF16(..) , UTF16_Invalid(..) ) where import Foundation.Internal.Base import Foundation.Internal.Types import Foundation.Primitive.Monad import GHC.Prim import GHC.Word import GHC.Types import Foundation.Numerical import Data.Bits import qualified Prelude import Foundation.Array.Unboxed import Foundation.Collection.Buildable import Foundation.String.Encoding.Encoding data UTF16_Invalid = InvalidContinuation | InvalidUnicode Char deriving (Show, Eq, Typeable) instance Exception UTF16_Invalid data UTF16 = UTF16 instance Encoding UTF16 where type Unit UTF16 = Word16 type Error UTF16 = UTF16_Invalid encodingNext _ = next encodingWrite _ = write -- -- U+0000 to U+D7FF and U+E000 to U+FFFF : 1 bytes -- U+10000 to U+10FFFF : -- * 0x010000 is subtracted from the code point, leaving a 20-bit number in the range 0..0x0FFFFF. -- * The top ten bits (a number in the range 0..0x03FF) are added to 0xD800 to give the first 16-bit code unit -- or high surrogate, which will be in the range 0xD800..0xDBFF. -- * The low ten bits (also in the range 0..0x03FF) are added to 0xDC00 to give the second 16-bit code unit -- or low surrogate, which will be in the range 0xDC00..0xDFFF. next :: (Offset Word16 -> Word16) -> Offset Word16 -> Either UTF16_Invalid (Char, Offset Word16) next getter off | h < 0xd800 = Right (toChar hh, off + Offset 1) | h >= 0xe000 = Right (toChar hh, off + Offset 1) | otherwise = nextContinuation where h :: Word16 !h@(W16# hh) = getter off toChar :: Word# -> Char toChar w = C# (chr# (word2Int# w)) to32 :: Word16 -> Word32 to32 (W16# w) = W32# w nextContinuation | cont >= 0xdc00 && cont < 0xe00 = let !(W32# w) = ((to32 h .&. 0x3ff) `shiftL` 10) .|. (to32 cont .&. 0x3ff) in Right (toChar w, off + Offset 2) | otherwise = Left InvalidContinuation where cont :: Word16 !cont = getter $ off + Offset 1 write :: (PrimMonad st, Monad st) => Char -> Builder (UArray Word16) st () write c | c < toEnum 0xd800 = append $ w16 c | c > toEnum 0x10000 = let (w1, w2) = wHigh c in append w1 >> append w2 | c > toEnum 0x10ffff = throw $ InvalidUnicode c | c >= toEnum 0xe000 = append $ w16 c | otherwise = throw $ InvalidUnicode c where w16 :: Char -> Word16 w16 (C# ch) = W16# (int2Word# (ord# ch)) to16 :: Word32 -> Word16 to16 = Prelude.fromIntegral wHigh :: Char -> (Word16, Word16) wHigh (C# ch) = let v = W32# (minusWord# (int2Word# (ord# ch)) 0x10000##) in (0xdc00 .|. to16 (v `shiftR` 10), 0xd800 .|. to16 (v .&. 0x3ff))