-- -------------------------------------------------------------------------- -- $Revision: 549 $ $Date: 2008-05-06 12:32:16 +0200 (Tue, 06 May 2008) $ -- -------------------------------------------------------------------------- -- | -- -- Module : Encode.Unicode.UTF8 -- Copyright : Otakar Smrz 2005-2008 -- License : GPL -- -- Maintainer : otakar.smrz mff.cuni.cz -- Stability : provisional -- Portability : portable -- -- Modified version of John Meacham's module Encode.Unicode.UTF8 ( -- * Types UTF8 (..) ) where import Encode import Data.Bits import Version version = revised "$Revision: 549 $" data UTF8 = UTF8 | UTF deriving (Enum, Show) instance Encoding UTF8 where encode _ = map (toEnum . fromIntegral) . codespaceUTF . map (fromIntegral . fromEnum) decode _ = map (toEnum . fromIntegral) . codespaceUCS . map (fromIntegral . fromEnum) -- http://repetae.net/john/repos/jhc/UTF8.hs -- rewritten by Otakar Smrz -- -- toUTF :: String -> [Word8] -- toUTF = map fromIntegral . codespaceUTF . map fromEnum -- -- fromUTF :: [Word8] -> String -- fromUTF = map toEnum . codespaceUCS . map fromIntegral codespaceUTF :: [CSpace] -> [CSpace] codespaceUTF [] = [] codespaceUTF (x:xs) | x <= 0x007F = x : codespaceUTF xs | x <= 0x07FF = (0xC0 .|. ((x `shift` (-6)) .&. 0x1F)) : (0x80 .|. (x .&. 0x3F)) : codespaceUTF xs | otherwise = (0xE0 .|. ((x `shift` (-12)) .&. 0x0F)) : (0x80 .|. ((x `shift` (-6)) .&. 0x3F)) : (0x80 .|. (x .&. 0x3F)) : codespaceUTF xs codespaceUCS :: [CSpace] -> [CSpace] codespaceUCS [] = [] codespaceUCS (x:xs) | x <= 0x7F = x : codespaceUCS xs | x <= 0xBF = error ("codespaceUCS: illegal character byte " ++ show x) | x <= 0xDF = doubleByte x xs | x <= 0xEF = tripleByte x xs | otherwise = error ("codespaceUCS: illegal character byte " ++ show x) doubleByte x1 (x2:xs) = (((x1 .&. 0x1F) `shift` 6) .|. (x2 .&. 0x3F)) : codespaceUCS xs doubleByte x _ = error ("codespaceUCS: illegal 2-byte sequence " ++ show x) tripleByte x1 (x2:x3:xs) = (((x1 .&. 0x0F) `shift` 12) .|. ((x2 .&. 0x3F) `shift` 6) .|. (x3 .&. 0x3F)) : codespaceUCS xs tripleByte x _ = error ("codespaceUCS: illegal 3-byte sequence " ++ show x)