{-# OPTIONS -fglasgow-exts #-} module Data.Encoding.Base (Encoding(..) ,EncodeState(..) ,encodeMultibyte ,encodeMultibyteLazy ,decodeMultibyte ,decodeMultibyteLazy ,encodeSinglebyte ,encodeSinglebyteLazy ,decodeSinglebyte ,EncodingException(..) ,DecodingException(..) ,decodingArray ,encodingMap) where import Data.Array(array) import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack) import qualified Data.ByteString.Lazy as LBS import Data.Encoding.Helper.Template #if __GLASGOW_HASKELL__>=608 import Data.ByteString.Unsafe(unsafeIndex) #else import Data.ByteString.Base(unsafeIndex) #endif import Data.Map (Map,fromList,lookup) import Data.Char(chr) import Data.Maybe(mapMaybe) import Data.Typeable import Data.Word import Prelude hiding (lookup,length) import qualified Prelude import Control.Exception import Data.Dynamic(toDyn) import Language.Haskell.TH {- | Represents an encoding, supporting various methods of de- and encoding. Minimal complete definition: encode, decode -} class Encoding enc where -- | Encode a 'String' into a strict 'ByteString'. Throws the -- 'HasNoRepresentation'-Exception if it encounters an unrepresentable -- character. encode :: enc -> String -> ByteString -- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'. encodeLazy :: enc -> String -> LBS.ByteString encodeLazy e str = LBS.fromChunks [encode e str] -- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'. encodable :: enc -> Char -> Bool encodable _ _ = True -- | Decode a strict 'ByteString' into a 'String'. If the string is not -- decodable, a 'DecodingException' is thrown. decode :: enc -> ByteString -> String decodeLazy :: enc -> LBS.ByteString -> String decodeLazy e str = concatMap (decode e) (LBS.toChunks str) -- | Whether or no a given 'ByteString' is decodable. Default: 'True'. decodable :: enc -> ByteString -> Bool decodable _ _ = True encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString encodeMultibyte f str = unfoldr (\st -> case st of (Done,[]) -> Nothing (Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs)) (Put1 w1,xs) -> Just (w1,(Done,xs)) (Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs)) (Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str) encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of Done -> case rest of [] -> Nothing x:xs -> let ~(w,st) = f x in Just (w,(st,xs)) Put1 w1 -> Just (w1,(Done,rest)) Put2 w1 w2 -> Just (w1,(Put1 w2,rest)) Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str) decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String decodeMultibyte f str = decode (unpack str) where decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst) decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String decodeMultibyteLazy f str = decode (LBS.unpack str) where decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst) encodeSinglebyte :: (Char -> Word8) -> String -> ByteString encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of [] -> Nothing (x:xs) -> Just (f x,xs)) str encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of [] -> Nothing (x:xs) -> Just (f x,xs)) str decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String decodeSinglebyte f str = map f (unpack str) data EncodeState = Done | Put1 !Word8 | Put2 !Word8 !Word8 | Put3 !Word8 !Word8 !Word8 deriving Show -- | This exception type is thrown whenever something went wrong during the -- encoding-process. data EncodingException = HasNoRepresentation Char -- ^ Thrown if a specific character -- is not representable in an encoding. deriving (Show,Typeable) -- | This exception type is thrown whenever something went wrong during the -- decoding-process. data DecodingException = IllegalCharacter Word8 -- ^ The sequence contained an illegal -- byte that couldn't be decoded. | UnexpectedEnd -- ^ more bytes were needed to allow a -- successfull decoding. | OutOfRange -- ^ the decoded value was out of the unicode range deriving (Show,Typeable) decodingArray :: FilePath -> Q Exp decodingArray file = do trans <- runIO (readTranslation file) createCharArray trans 0 255 encodingMap :: FilePath -> Q Exp #ifndef __HADDOCK__ encodingMap file = do trans <- runIO (readTranslation file) return $ AppE (VarE 'fromList) (ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from] | (from,to) <- trans]) #endif readTranslation :: FilePath -> IO [(Integer,Char)] readTranslation file = do cont <- readFile file return $ mapMaybe (\ln -> case ln of [] -> Nothing ('#':xs) -> Nothing _ -> case words ln of (src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this (src:trg:_) -> Just (read src,chr $ read trg) _ -> Nothing ) (lines cont)