{-# OPTIONS -fglasgow-exts #-} module Data.Encoding.Base (Encoding(..) ,EncodeState(..) ,encodeMultibyte ,encodeMultibyteLazy ,decodeMultibyte ,decodeMultibyteLazy ,encodeSinglebyte ,encodeSinglebyteLazy ,decodeSinglebyte ,EncodingException(..) ,DecodingException(..) ,decodingArray ,encodingMap) where import Data.Array(array) import Data.ByteString (ByteString,unfoldrN,unfoldr,length,index,unpack) import qualified Data.ByteString.Lazy as LBS import Data.ByteString.Base(unsafeIndex) import Data.Map (Map,fromList,lookup) import Data.Char(chr) import Data.Maybe(mapMaybe) import Data.Typeable import Data.Word import Prelude hiding (lookup,length) import qualified Prelude import Control.Exception import Data.Dynamic(toDyn) import Language.Haskell.TH {- | Represents an encoding, supporting various methods of de- and encoding. Minimal complete definition: encode, decode -} class Encoding enc where -- | Encode a 'String' into a strict 'ByteString'. Throws the -- 'HasNoRepresentation'-Exception if it encounters an unrepresentable -- character. encode :: enc -> String -> ByteString -- | Encode a 'String' into a lazy 'Data.ByteString.Lazy.ByteString'. encodeLazy :: enc -> String -> LBS.ByteString encodeLazy e str = LBS.fromChunks [encode e str] -- | Whether or not the given 'Char' is representable in this encoding. Default: 'True'. encodable :: enc -> Char -> Bool encodable _ _ = True -- | Decode a strict 'ByteString' into a 'String'. If the string is not -- decodable, a 'DecodingException' is thrown. decode :: enc -> ByteString -> String decodeLazy :: enc -> LBS.ByteString -> String decodeLazy e str = concatMap (decode e) (LBS.toChunks str) -- | Whether or no a given 'ByteString' is decodable. Default: 'True'. decodable :: enc -> ByteString -> Bool decodable _ _ = True encodeMultibyte :: (Char -> (Word8,EncodeState)) -> String -> ByteString encodeMultibyte f str = unfoldr (\st -> case st of (Done,[]) -> Nothing (Done,x:xs) -> let (w,st) = f x in Just (w,(st,xs)) (Put1 w1,xs) -> Just (w1,(Done,xs)) (Put2 w1 w2,xs) -> Just (w1,(Put1 w2,xs)) (Put3 w1 w2 w3,xs) -> Just (w1,(Put2 w2 w3,xs))) (Done,str) encodeMultibyteLazy :: (Char -> (Word8,EncodeState)) -> String -> LBS.ByteString encodeMultibyteLazy f str = LBS.unfoldr (\ ~(st,rest) -> case st of Done -> case rest of [] -> Nothing x:xs -> let ~(w,st) = f x in Just (w,(st,xs)) Put1 w1 -> Just (w1,(Done,rest)) Put2 w1 w2 -> Just (w1,(Put1 w2,rest)) Put3 w1 w2 w3 -> Just (w1,(Put2 w2 w3,rest))) (Done,str) decodeMultibyte :: ([Word8] -> (Char,[Word8])) -> ByteString -> String decodeMultibyte f str = decode (unpack str) where decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst) decodeMultibyteLazy :: ([Word8] -> (Char,[Word8])) -> LBS.ByteString -> String decodeMultibyteLazy f str = decode (LBS.unpack str) where decode lst = let (c,nlst) = f lst in if null lst then [] else c:(decode nlst) encodeSinglebyte :: (Char -> Word8) -> String -> ByteString encodeSinglebyte f str = fst $ unfoldrN (Prelude.length str) (\st -> case st of [] -> Nothing (x:xs) -> Just (f x,xs)) str encodeSinglebyteLazy :: (Char -> Word8) -> String -> LBS.ByteString encodeSinglebyteLazy f str = LBS.unfoldr (\st -> case st of [] -> Nothing (x:xs) -> Just (f x,xs)) str decodeSinglebyte :: (Word8 -> Char) -> ByteString -> String decodeSinglebyte f str = map f (unpack str) data EncodeState = Done | Put1 !Word8 | Put2 !Word8 !Word8 | Put3 !Word8 !Word8 !Word8 -- | This exception type is thrown whenever something went wrong during the -- encoding-process. data EncodingException = HasNoRepresentation Char -- ^ Thrown if a specific character -- is not representable in an encoding. deriving (Show,Typeable) -- | This exception type is thrown whenever something went wrong during the -- decoding-process. data DecodingException = IllegalCharacter Word8 -- ^ The sequence contained an illegal -- byte that couldn't be decoded. | UnexpectedEnd -- ^ more bytes were needed to allow a -- successfull decoding. deriving (Show,Typeable) decodingArray :: FilePath -> Q Exp -- Haddock hates template haskell... #ifndef __HADDOCK__ decodingArray file = do trans <- runIO (readTranslation file) return $ AppE (AppE (VarE 'array) (TupE [LitE $ IntegerL 0,LitE $ IntegerL 255])) (ListE [ TupE [LitE $ IntegerL from,LitE $ CharL to] | (from,to) <- trans ]) #endif encodingMap :: FilePath -> Q Exp #ifndef __HADDOCK__ encodingMap file = do trans <- runIO (readTranslation file) return $ AppE (VarE 'fromList) (ListE [ TupE [LitE $ CharL to,LitE $ IntegerL from] | (from,to) <- trans]) #endif readTranslation :: FilePath -> IO [(Integer,Char)] readTranslation file = do cont <- readFile file return $ mapMaybe (\ln -> case ln of [] -> Nothing ('#':xs) -> Nothing _ -> case words ln of (src:"#UNDEFINED":_) -> Just (read src,'\xFFFD') -- XXX: Find a better way to handle this (src:trg:_) -> Just (read src,chr $ read trg) _ -> Nothing ) (lines cont)