{-# LANGUAGE ForeignFunctionInterface #-} -- | A `Conduit` based on the iconv library for -- converting a `Data.ByteString` from one character set encoding to another module Data.Conduit.IConv ( CharacterEncoding , convert ) where import Data.Conduit import qualified Data.ByteString as B import qualified Data.ByteString.Unsafe as BU import Control.Applicative ((<$>)) import Foreign hiding (unsafePerformIO) import Foreign.C import System.IO.Unsafe (unsafePerformIO) import Control.Exception (mask_) -- | Type synonym for character encoding names -- -- See `convert` for details. type CharacterEncoding = String -- | Convert text from one named character encoding to another. -- -- Encoding names can be e.g. @\"UTF-8\"@ or @\"ISO-8559-1\"@. Appending -- @\"\/\/IGNORE\"@ to the output encoding will cause encoding errors to be -- ignored and the characters to be dropped, @\"\/\/IGNORE,TRANSLIT\"@ will -- cause them to be replaced by a replacement character. -- -- Without this encoding errors will cause an exception. -- -- On errors this will call `fail` convert :: Monad m => CharacterEncoding -- ^ Name of input character encoding -> CharacterEncoding -- ^ Name of output character encoding -> Conduit B.ByteString m B.ByteString convert inputEncoding outputEncoding = run initialConvert where initialConvert = iconvConvert inputEncoding outputEncoding run f = do maybeInput <- await case maybeInput of Nothing -> return () Just input -> do let res = f input case res of ConvertSuccess c f' -> yield c >> run f' ConvertUnsupportedConversionError -> fail "Unsupported conversion" ConvertUnexpectedOpenError s -> fail ("Unexpected open error: " ++ s) ConvertInvalidInputError -> fail "Invalid input" ConvertUnexpectedConversionError s -> fail ("Unexpected conversion error: " ++ s) -- Stream based API around iconv() data ConvertResult = ConvertSuccess B.ByteString (B.ByteString -> ConvertResult) | ConvertUnsupportedConversionError | ConvertUnexpectedOpenError String | ConvertInvalidInputError | ConvertUnexpectedConversionError String iconvConvert :: CharacterEncoding -> CharacterEncoding -> B.ByteString -> ConvertResult iconvConvert inputEncoding outputEncoding input = let eCtx = iconvOpen inputEncoding outputEncoding in case eCtx of Left UnsupportedConversion -> ConvertUnsupportedConversionError Left (UnexpectedOpenError (Errno errno)) -> ConvertUnexpectedOpenError (show errno) Right ctx -> convertInput ctx B.empty input where -- Converts newInput with the given context -- -- If converted is not empty, this will be prepended to -- the converted output. This will happen if we're called -- after the remainder of a previous conversion was consumed. -- converted is the result of the remainder conversion. -- -- If this conversion results in a remainder we return any -- results we got so far but will use convertInputWithRemaining -- with the remainder for the next call. convertInput ctx converted newInput | B.null newInput = ConvertSuccess converted (convertInput ctx B.empty) | otherwise = let res = iconv ctx newInput converted in case res of Converted c r -> ConvertSuccess c (convertInputWithRemaining ctx r) InvalidInput c _ -> if B.null c then ConvertInvalidInputError else -- Converted a bit but detected invalid input afterwards. -- Let's return what was converted so far and fail with -- the next call ConvertSuccess c (const ConvertInvalidInputError) UnexpectedError (Errno errno) -> ConvertUnexpectedConversionError (show errno) -- Convert any remainder from a previous conversion. We do -- this by first appending the first 32 bytes of the newInput -- to the remainder. With 32 bytes additionally we should be -- able to convert the remainder from any possible charset -- encoding that has 32 *bytes* per character at maximum. -- 4 would've been enough too probably. -- -- If the conversion was successful, we take the remaining -- newInput and the converted remainder, and hand it over -- to the previous conversion function. Which then will -- convert the remaining newInput and prepend the converted -- remainder to it. convertInputWithRemaining ctx remaining newInput | B.null remaining = convertInput ctx B.empty newInput | B.null newInput = ConvertSuccess B.empty (convertInputWithRemaining ctx remaining) | otherwise = let tmpInput = remaining `B.append` B.take 32 newInput res = iconv ctx tmpInput B.empty in case res of Converted c r -> if processed < B.length remaining then -- Didn't convert the complete remainder. Let's try again next -- time with the additional newInput ConvertSuccess B.empty (convertInputWithRemaining ctx (remaining `B.append` newInput)) else -- Converted the complete remainder. Now let's try to -- convert the remaining new input convertInput ctx c (B.drop consumedInput newInput) where processed = B.length tmpInput - B.length r consumedInput = processed - B.length remaining InvalidInput c _ -> if B.null c then ConvertInvalidInputError else -- Converted a bit but detected invalid input afterwards. -- Let's return what was converted so far and fail with -- the next call ConvertSuccess c (const ConvertInvalidInputError) UnexpectedError (Errno errno) -> ConvertUnexpectedConversionError (show errno) -- Thin wrapper around iconv_open() data IConvT = IConvT (ForeignPtr IConvT) data IConvOpenError = UnsupportedConversion | UnexpectedOpenError Errno iconvOpen :: CharacterEncoding -> CharacterEncoding -> Either IConvOpenError IConvT iconvOpen inputEncoding outputEncoding = unsafePerformIO $ mask_ $ do -- mask async exceptions, we might otherwise leak ptr <- withCString inputEncoding $ \inputEncodingPtr -> withCString outputEncoding $ \outputEncodingPtr -> c_iconv_open outputEncodingPtr inputEncodingPtr case ptrToIntPtr ptr of (-1) -> do errno <- getErrno return $ Left $ if errno == eINVAL then UnsupportedConversion else UnexpectedOpenError errno _ -> do fPtr <- newForeignPtr c_iconv_close ptr return $ Right (IConvT fPtr) -- Thin wrapper around iconv() data IConvResult = Converted B.ByteString B.ByteString | InvalidInput B.ByteString B.ByteString | UnexpectedError Errno iconv :: IConvT -> B.ByteString -> B.ByteString -> IConvResult iconv (IConvT fPtr) input outputPrefix = unsafePerformIO $ mask_ $ do -- mask async exceptions, we might otherwise leak let outputPrefixLen = B.length outputPrefix inputLen = B.length input -- We allocate enough memory for the worst case: 1 -- byte per character encodings to 4 bytes per -- character encodings (e.g. ASCII to UTF32). -- Additionally 16 bytes of padding just in case and -- the length of the converted prefix we have to -- prepend to the result -- -- We overallocate here but will resize later. The -- alternative would be to produce potentially many -- smaller chunks of output. outputLen = inputLen * 4 + 16 + outputPrefixLen convertLen = outputLen - outputPrefixLen -- We allocate via malloc(), which does not give us memory inside -- the GC heap. But this allows us to realloc() the memory later -- to the actual size. outputPtr <- mallocBytes outputLen BU.unsafeUseAsCString outputPrefix $ \outputPrefixPtr -> copyBytes outputPtr outputPrefixPtr outputPrefixLen -- Newly converted output should start at this pointer let outputConvPtr = plusPtr outputPtr outputPrefixLen -- Do the actual conversion and calculate how many bytes we -- read and wrote to update our state (res, readCount, writeCount) <- withForeignPtr fPtr $ \ptr -> BU.unsafeUseAsCString input $ \inputPtr -> with inputPtr $ \inputPtrPtr -> with (fromIntegral inputLen) $ \inputLenPtr -> with outputConvPtr $ \outputConvPtrPtr -> with (fromIntegral convertLen) $ \convertLenPtr -> do res <- c_iconv ptr inputPtrPtr inputLenPtr outputConvPtrPtr convertLenPtr -- The length pointers will be updated by iconv to the still -- remaining length after conversion. That means we can calculate -- the read/written number of bytes with: old - new readCount <- (`subtract` inputLen) . fromIntegral <$> peek inputLenPtr writeCount <- (`subtract` convertLen) . fromIntegral <$> peek convertLenPtr return (res, readCount, writeCount) let remaining = B.drop readCount input convertedLen = outputPrefixLen + writeCount -- Reallocate the memory to a memory area of the actual size. This -- potentially allows the OS to reuse any memory we allocated too much and -- in general does not cause the memory to be copied. -- -- This will return NULL if the output size is 0, but the resulting -- ByteString will then be the empty ByteString as expected. -- -- Shadowing outputPtr to prevent accidential usage of old outputPtr outputPtr <- reallocBytes outputPtr convertedLen output <- BU.unsafePackMallocCStringLen (outputPtr, convertedLen) if res /= (-1) then return $ Converted output remaining else do errno <- getErrno case () of -- The output buffer was too small! This should not happen -- because we overallocate for any possible output charset -- encoding _ | errno == e2BIG -> return $ UnexpectedError errno -- Incomplete byte sequence was detected, which we might be -- able to convert with further input later | errno == eINVAL -> return $ Converted output remaining -- Invalid byte sequence was detected. We might've converted -- something already but from here on we can't do anything | errno == eILSEQ -> return $ InvalidInput output remaining | otherwise -> return $ UnexpectedError errno -- Taken from Codec.Text.IConv foreign import ccall unsafe "hsiconv.h hs_wrap_iconv_open" c_iconv_open :: CString -- to code -> CString -- from code -> IO (Ptr IConvT) foreign import ccall unsafe "hsiconv.h hs_wrap_iconv" c_iconv :: Ptr IConvT -> Ptr (Ptr CChar) -- in buf -> Ptr CSize -- in buf bytes left -> Ptr (Ptr CChar) -- out buf -> Ptr CSize -- out buf bytes left -> IO CSize foreign import ccall unsafe "hsiconv.h &hs_wrap_iconv_close" c_iconv_close :: FinalizerPtr IConvT