Codec/Binary/UTF8/String.hs

--
-- |
-- Module      :  Codec.Binary.UTF8.String
-- Copyright   :  (c) Eric Mertens 2007
-- License     :  BSD3-style (see LICENSE)
-- 
-- Maintainer:    emertens@galois.com
-- Stability   :  experimental
-- Portability :  portable
--
-- Support for encoding UTF8 Strings to and from @[Word8]@
--

module Codec.Binary.UTF8.String (
      encode
    , decode
    , encodeString
    , decodeString
    
    , isUTF8Encoded
    , utf8Encode
  ) where

import Data.Word        (Word8,Word32)
import Data.Bits        ((.|.),(.&.),shiftL,shiftR)
import Data.Char        (chr,ord)

default(Int)

-- | Encode a string using 'encode' and store the result in a 'String'.
encodeString :: String -> String
encodeString xs = map (toEnum . fromEnum) (encode xs)

-- | Decode a string using 'decode' using a 'String' as input.
-- | This is not safe but it is necessary if UTF-8 encoded text
-- | has been loaded into a 'String' prior to being decoded.
decodeString :: String -> String
decodeString xs = decode (map (toEnum . fromEnum) xs)

replacement_character :: Char
replacement_character = '\xfffd'

-- | Encode a Haskell String to a list of Word8 values, in UTF8 format.
encode :: String -> [Word8]
encode = concatMap (map fromIntegral . go . ord)
 where
  go oc
   | oc <= 0x7f       = [oc]

   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
                        , 0x80 + oc .&. 0x3f
                        ]

   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                        , 0x80 + oc .&. 0x3f
                        ]
   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                        , 0x80 + oc .&. 0x3f
                        ]

--
-- | Decode a UTF8 string packed into a list of Word8 values, directly to String
--
decode :: [Word8] -> String
decode [    ] = ""
decode (c:cs)
  | c < 0x80  = chr (fromEnum c) : decode cs
  | c < 0xc0  = replacement_character : decode cs
  | c < 0xe0  = multi1
  | c < 0xf0  = multi_byte 2 0xf  0x800
  | c < 0xf8  = multi_byte 3 0x7  0x10000
  | c < 0xfc  = multi_byte 4 0x3  0x200000
  | c < 0xfe  = multi_byte 5 0x1  0x4000000
  | otherwise = replacement_character : decode cs
  where
    multi1 = case cs of
      c1 : ds | c1 .&. 0xc0 == 0x80 ->
        let d = ((fromEnum c .&. 0x1f) `shiftL` 6) .|.  fromEnum (c1 .&. 0x3f)
        in if d >= 0x000080 then toEnum d : decode ds
                            else replacement_character : decode ds
      _ -> replacement_character : decode cs

    multi_byte :: Int -> Word8 -> Int -> [Char]
    multi_byte i mask overlong = aux i cs (fromEnum (c .&. mask))
      where
        aux 0 rs acc
          | overlong <= acc && acc <= 0x10ffff &&
            (acc < 0xd800 || 0xdfff < acc)     &&
            (acc < 0xfffe || 0xffff < acc)      = chr acc : decode rs
          | otherwise = replacement_character : decode rs

        aux n (r:rs) acc
          | r .&. 0xc0 == 0x80 = aux (n-1) rs
                               $ shiftL acc 6 .|. fromEnum (r .&. 0x3f)

        aux _ rs     _ = replacement_character : decode rs


-- | @utf8Encode str@ is a convenience function; checks to see if
-- @str@ isn't UTF-8 encoded before doing so. Sometimes useful, but
-- you are better off keeping track of the encoding so as to avoid
-- the cost of checking.
utf8Encode :: String -> String
utf8Encode str
 | isUTF8Encoded str = str
 | otherwise         = encodeString str


-- | @isUTF8Encoded str@ tries to recognize input string as being in UTF-8 form.
isUTF8Encoded :: String -> Bool
isUTF8Encoded [] = True
isUTF8Encoded (x:xs) = 
  case ox of
    _ | ox < 0x80  -> isUTF8Encoded xs
      | ox > 0xff  -> False
      | ox < 0xc0  -> False
      | ox < 0xe0  -> check1
      | ox < 0xf0  -> check_byte 2 0xf 0
      | ox < 0xf8  -> check_byte 3 0x7  0x10000
      | ox < 0xfc  -> check_byte 4 0x3  0x200000
      | ox < 0xfe  -> check_byte 5 0x1  0x4000000
      | otherwise  -> False
 where
   ox = toW32 x
   
   toW32 :: Char -> Word32
   toW32 ch = fromIntegral (fromEnum ch)

   check1 = 
    case xs of
     [] -> False
     c1 : ds 
      | oc .&. 0xc0 /= 0x80 || d < 0x000080 -> False
      | otherwise -> isUTF8Encoded ds
      where
       oc = toW32 c1
       d = ((ox .&. 0x1f) `shiftL` 6) .|.  (oc .&. 0x3f)

   check_byte :: Int -> Word32 -> Word32 -> Bool
   check_byte i mask overlong = aux i xs (ox .&. mask)
      where
        aux 0 rs acc
         | overlong <= acc && 
	   acc <= 0x10ffff &&
           (acc < 0xd800 || 0xdfff < acc) &&
           (acc < 0xfffe || 0xffff < acc) = isUTF8Encoded rs
         | otherwise = False

        aux n (r:rs) acc
         | toW32 r .&. 0xc0 == 0x80 = 
	    aux (n-1) rs  (acc `shiftL` 6 .|. (toW32 r .&. 0x3f))

        aux _ _  _ = False