-- --------------------------------------------------------------------------
--  $Revision: 130 $ $Date: 2006-11-09 16:22:20 +0100 (Thu, 09 Nov 2006) $
-- --------------------------------------------------------------------------

-- |
--
-- Module      :  Encode.Unicode.UTF8
-- Copyright   :  Otakar Smrz 2005-2006
-- License     :  GPL
--
-- Maintainer  :  otakar.smrz mff.cuni.cz
-- Stability   :  provisional
-- Portability :  portable
--
-- Modified version of John Meacham's UTF8 module.


module Encode.Unicode.UTF8 (

        -- * Types

        UTF8 (..)

    ) where


import Encode

import Data.Bits

import Version

version = revised "$Revision: 130 $"


-- | Tag type selecting the UTF-8 encoding scheme. Both constructors share
-- the single 'Encoding' instance below, which ignores its tag argument.
data UTF8 = UTF8 | UTF

    deriving (Enum, Show)


instance Encoding UTF8 where

    -- Both directions go through plain 'Int' lists: the elements are
    -- converted with 'fromEnum', transcoded, and converted back with 'toEnum'.
    encode _ = map toEnum . integerUTF . map fromEnum

    decode _ = map toEnum . integerUCS . map fromEnum


-- http://repetae.net/john/repos/jhc/UTF8.hs
-- rewritten by Otakar Smrz
--
-- toUTF :: String -> [Word8]
-- toUTF = map fromIntegral . integerUTF . map fromEnum
--
-- fromUTF :: [Word8] -> String
-- fromUTF = map toEnum . integerUCS . map fromIntegral


-- | Encode a list of code points as a list of UTF-8 byte values.
--
-- Code points up to 0x7F are emitted unchanged, up to 0x7FF as two bytes,
-- up to 0xFFFF as three bytes, and everything above as four bytes.
-- No range validation is performed: in the four-byte branch only the low
-- 21 bits of the value are used, the rest is masked off.
integerUTF :: [Int] -> [Int]
integerUTF [] = []
integerUTF (x:xs)
    | x <= 0x007F = x : integerUTF xs
    | x <= 0x07FF = (0xC0 .|. ((x `shift` (-6)) .&. 0x1F))
                  : (0x80 .|. (x .&. 0x3F))
                  : integerUTF xs
    | x <= 0xFFFF = (0xE0 .|. ((x `shift` (-12)) .&. 0x0F))
                  : (0x80 .|. ((x `shift` (-6)) .&. 0x3F))
                  : (0x80 .|. (x .&. 0x3F))
                  : integerUTF xs
    -- Code points beyond 0xFFFF need the four-byte form; squeezing them
    -- into three bytes would silently drop their high bits.
    | otherwise   = (0xF0 .|. ((x `shift` (-18)) .&. 0x07))
                  : (0x80 .|. ((x `shift` (-12)) .&. 0x3F))
                  : (0x80 .|. ((x `shift` (-6)) .&. 0x3F))
                  : (0x80 .|. (x .&. 0x3F))
                  : integerUTF xs


-- | Decode a list of UTF-8 byte values into a list of code points.
--
-- The lead byte selects a one-, two-, three- or four-byte sequence.
-- Calls 'error' when a lead byte is a continuation byte (0x80..0xBF) or
-- is above 0xF7, and when the input ends before a sequence is complete.
-- Continuation bytes are not validated; only their low six bits are used.
integerUCS :: [Int] -> [Int]
integerUCS [] = []
integerUCS (x:xs)
    | x <= 0x7F = x : integerUCS xs
    | x <= 0xBF = error ("integerUCS: illegal character byte " ++ show x)
    | x <= 0xDF = doubleByte x xs
    | x <= 0xEF = tripleByte x xs
    | x <= 0xF7 = quadByte x xs
    | otherwise = error ("integerUCS: illegal character byte " ++ show x)


-- | Decode a two-byte sequence given its lead byte and the remaining input,
-- then continue decoding the rest with 'integerUCS'.
doubleByte :: Int -> [Int] -> [Int]
doubleByte x1 (x2:xs) = (((x1 .&. 0x1F) `shift` 6)
                     .|. (x2 .&. 0x3F))
                      : integerUCS xs
doubleByte x _ = error ("integerUCS: illegal 2-byte sequence " ++ show x)


-- | Decode a three-byte sequence given its lead byte and the remaining
-- input, then continue decoding the rest with 'integerUCS'.
tripleByte :: Int -> [Int] -> [Int]
tripleByte x1 (x2:x3:xs) = (((x1 .&. 0x0F) `shift` 12)
                        .|. ((x2 .&. 0x3F) `shift` 6)
                        .|. (x3 .&. 0x3F))
                         : integerUCS xs
tripleByte x _ = error ("integerUCS: illegal 3-byte sequence " ++ show x)


-- | Decode a four-byte sequence given its lead byte and the remaining
-- input, then continue decoding the rest with 'integerUCS'.
quadByte :: Int -> [Int] -> [Int]
quadByte x1 (x2:x3:x4:xs) = (((x1 .&. 0x07) `shift` 18)
                         .|. ((x2 .&. 0x3F) `shift` 12)
                         .|. ((x3 .&. 0x3F) `shift` 6)
                         .|. (x4 .&. 0x3F))
                          : integerUCS xs
quadByte x _ = error ("integerUCS: illegal 4-byte sequence " ++ show x)