module Codec.Phaser.UTF16 (
utf16_char,
utf16_word16_stream,
utf16_stream_useBOM,
utf16_stream_le,
utf16_stream_be,
utf16_stream_unknown,
utf16_encode_stream_be_nobom,
utf16_encode_stream_le_nobom,
utf16_encode_stream_be,
utf16_encode_stream_le,
utf16_encode_char
) where
import Data.Bits
import Data.Word
import Control.Applicative
import Control.Monad
import Codec.Phaser.Core
import Codec.Phaser.Common
unit_be :: Phase p Word8 o Word16
unit_be = (\a b -> shiftL a 8 .|. b) <$> byte <*> byte where
byte = fromIntegral <$> get
unit_le :: Phase p Word8 o Word16
unit_le = (\a b -> a .|. shiftL b 8) <$> byte <*> byte where
byte = fromIntegral <$> get
encode_unit_be :: Word16 -> Phase p i Word8 ()
encode_unit_be c = do
yield $ fromIntegral $ shiftR c 8
yield $ fromIntegral $ c .&. 0xFF
encode_unit_le :: Word16 -> Phase p i Word8 ()
encode_unit_le c = do
yield $ fromIntegral $ c .&. 0xFF
yield $ fromIntegral $ shiftR c 8
useBOM_unit :: Monoid p => Phase p Word8 o1 (Phase p Word8 o2 Word16)
useBOM_unit = "UTF-16: No byte order mark" <?> (go unit_be <|> go unit_le) where
go u = do
bom <- fitYield u
if bom == 0xFEFF
then return $ fitYield u
else empty
utf16_char :: Monoid p => Phase p Word16 o Char
utf16_char = do
hs <- fromIntegral <$> get :: Phase p Word16 o Int
case () of
_ | hs <= 0xD7FF || hs >= 0xE000 -> return $ toEnum hs
| hs >= 0xD800 && hs <= 0xDBFF -> do
ls <- fromIntegral <$> get :: Phase p Word16 o Int
"UTF-16: Invalid low surrogate" <?> guard (ls >= 0xDC00 && ls <= 0xDFFF)
return $ toEnum $ 0x010000 + 0x0400 * (hs - 0xD800) + (ls - 0xDC00)
| otherwise -> fail "UTF-16: Invalid high surrogate"
utf16_encode_char :: Char -> Phase p i Word16 ()
utf16_encode_char c = let
cc = fromEnum c
in case () of
_ | cc <= 0xD7FF || (cc >= 0xE000 && cc <= 0xFFFF) ->
yield $ fromIntegral cc
| cc >= 0x10000 && cc <= 0x10FFFF -> do
let s = cc .&. complement 0x010000
yield $ fromIntegral $ shiftR s 10 .|. 0xD800
yield $ fromIntegral $ (s .&. 0x03FF) .|. 0xDC00
| otherwise -> fail "Character not representable in UTF-16"
mkStream :: Monoid p => Phase p i a a -> Phase p i a ()
mkStream u = go where
go = ((u >>= yield) >> go) <|> return ()
mkEncodeStream :: Monoid p => (c -> Phase p c o a) -> Phase p c o ()
mkEncodeStream f = go where
go = (get >>= f >> go) <|> return ()
utf16_word16_stream :: Monoid p => Phase p Word16 Char ()
utf16_word16_stream = mkStream utf16_char
utf16_stream_useBOM :: Monoid p => Phase p Word8 Char ()
utf16_stream_useBOM = do
unit <- useBOM_unit :: Monoid p =>
Phase p Word8 Char (Phase p Word8 Word16 Word16)
toPhase $ mkStream unit >># utf16_word16_stream
utf16_stream_le :: Monoid p => Automaton p Word8 Char ()
utf16_stream_le = mkStream unit_le >># utf16_word16_stream
utf16_stream_be :: Monoid p => Automaton p Word8 Char ()
utf16_stream_be = mkStream unit_be >># utf16_word16_stream
utf16_stream_unknown :: Monoid p => Phase p Word8 Char ()
utf16_stream_unknown = flip (<|>) (return ()) $ do
unit <- return unit_le <|> return unit_be
h <- fitYield unit
case h of
0xFEFF -> toPhase $ mkStream (fitYield unit) >># utf16_word16_stream
0xFFFE -> fail "Reversed byte order mark"
_ -> toPhase $ mkStream (fitYield unit) >># (put1 h >> utf16_word16_stream)
utf16_encode_stream_word16 :: Monoid p => Phase p Char Word16 ()
utf16_encode_stream_word16 = mkEncodeStream utf16_encode_char
utf16_encode_stream_be_nobom :: Monoid p => Automaton p Char Word8 ()
utf16_encode_stream_be_nobom =
utf16_encode_stream_word16 >># mkEncodeStream encode_unit_be
utf16_encode_stream_le_nobom :: Monoid p => Automaton p Char Word8 ()
utf16_encode_stream_le_nobom =
utf16_encode_stream_word16 >># mkEncodeStream encode_unit_le
utf16_encode_stream_be :: Monoid p => Automaton p Char Word8 ()
utf16_encode_stream_be =
(yield 0xFEFF >> utf16_encode_stream_word16) >># mkEncodeStream encode_unit_be
utf16_encode_stream_le :: Monoid p => Automaton p Char Word8 ()
utf16_encode_stream_le =
(yield 0xFEFF >> utf16_encode_stream_word16) >># mkEncodeStream encode_unit_le