{-# LANGUAGE ForeignFunctionInterface, CPP #-}
-- | Detect the likely character encoding for a stream of bytes using Mozilla's Universal Character Set Detector.
module Codec.Text.Detect (detectEncodingName, detectEncoding) where

import Control.Exception

import qualified Data.ByteString.Internal as SI
import qualified Data.ByteString.Lazy as L
import Data.Traversable (traverse)
import Control.Applicative

import Foreign.C.Types
import Foreign.C.String
import Foreign.Ptr
import Foreign.ForeignPtr

import System.IO
import System.IO.Unsafe


#if !MIN_VERSION_base(4,3,0)
mask :: ((IO a -> IO a) -> IO b) -> IO b
mask io = blocked >>= \b -> if b then io id else block $ io unblock
#endif


-- typedef void* csd_t;
type Csd_t = Ptr ()

-- csd_t csd_open(void);
foreign import ccall unsafe "csd_open" c_csd_open :: IO Csd_t

-- int csd_consider(csd_t csd, const char *data, int length);
foreign import ccall unsafe "csd_consider" c_csd_consider :: Csd_t -> CString -> CInt -> IO CInt

-- const char *csd_close(csd_t csd);
foreign import ccall unsafe "csd_close" c_csd_close :: Csd_t -> IO CString


-- | Detect the likely encoding used by a 'L.ByteString'. At the time of writing, the encoding
-- returned will be drawn from this list:
--
-- > Big5
-- > EUC-JP
-- > EUC-KR
-- > gb18030
-- > HZ-GB-2312
-- > IBM855
-- > IBM866
-- > ISO-2022-CN
-- > ISO-2022-JP
-- > ISO-2022-KR
-- > ISO-8859-2
-- > ISO-8859-5
-- > ISO-8859-7
-- > ISO-8859-8
-- > KOI8-R
-- > Shift_JIS
-- > TIS-620
-- > UTF-8
-- > UTF-16BE
-- > UTF-16LE
-- > UTF-32BE
-- > UTF-32LE
-- > windows-1250
-- > windows-1251
-- > windows-1252
-- > windows-1253
-- > windows-1255
-- > x-euc-tw
-- > X-ISO-10646-UCS-4-2143
-- > X-ISO-10646-UCS-4-3412
-- > x-mac-cyrillic
{-# NOINLINE detectEncodingName #-}
detectEncodingName :: L.ByteString -> Maybe String
detectEncodingName :: ByteString -> Maybe String
detectEncodingName ByteString
b = IO (Maybe String) -> Maybe String
forall a. IO a -> a
unsafePerformIO (IO (Maybe String) -> Maybe String)
-> IO (Maybe String) -> Maybe String
forall a b. (a -> b) -> a -> b
$ do
    ((forall a. IO a -> IO a) -> IO (Maybe String))
-> IO (Maybe String)
forall b. ((forall a. IO a -> IO a) -> IO b) -> IO b
mask (((forall a. IO a -> IO a) -> IO (Maybe String))
 -> IO (Maybe String))
-> ((forall a. IO a -> IO a) -> IO (Maybe String))
-> IO (Maybe String)
forall a b. (a -> b) -> a -> b
$ \forall a. IO a -> IO a
restore -> do
        Csd_t
csd <- IO Csd_t
c_csd_open
        IO () -> IO ()
forall a. IO a -> IO a
restore ((\ByteString -> IO () -> IO ()
f -> (ByteString -> IO () -> IO ()) -> IO () -> [ByteString] -> IO ()
forall a b. (a -> b -> b) -> b -> [a] -> b
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr ByteString -> IO () -> IO ()
f (() -> IO ()
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return ()) (ByteString -> [ByteString]
L.toChunks ByteString
b)) ((ByteString -> IO () -> IO ()) -> IO ())
-> (ByteString -> IO () -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ \ByteString
chunk IO ()
feed_more -> do
            let (ForeignPtr Word8
fptr, Int
ptr_offset, Int
chunk_length) = ByteString -> (ForeignPtr Word8, Int, Int)
SI.toForeignPtr ByteString
chunk
            CInt
res <- ForeignPtr Word8 -> (Ptr Word8 -> IO CInt) -> IO CInt
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
fptr ((Ptr Word8 -> IO CInt) -> IO CInt)
-> (Ptr Word8 -> IO CInt) -> IO CInt
forall a b. (a -> b) -> a -> b
$ \Ptr Word8
ptr -> Csd_t -> CString -> CInt -> IO CInt
c_csd_consider Csd_t
csd (Ptr Word8
ptr Ptr Word8 -> Int -> CString
forall a b. Ptr a -> Int -> Ptr b
`plusPtr` Int
ptr_offset) (Int -> CInt
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
chunk_length)
            case CInt
res CInt -> CInt -> Ordering
forall a. Ord a => a -> a -> Ordering
`compare` CInt
0 of
              Ordering
LT -> () -> IO ()
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return () -- Some sort of error: could report it?
              Ordering
EQ -> IO ()
feed_more -- Feed more data to come to a conclusion
              Ordering
GT -> () -> IO ()
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return () -- We have enough data!
          ) IO () -> IO CString -> IO ()
forall a b. IO a -> IO b -> IO a
`onException` Csd_t -> IO CString
c_csd_close Csd_t
csd
        CString
c_encoding_ptr <- Csd_t -> IO CString
c_csd_close Csd_t
csd
        if CString
c_encoding_ptr CString -> CString -> Bool
forall a. Eq a => a -> a -> Bool
== CString
forall a. Ptr a
nullPtr
         then Maybe String -> IO (Maybe String)
forall a. a -> IO a
forall (m :: * -> *) a. Monad m => a -> m a
return Maybe String
forall a. Maybe a
Nothing
         else String -> Maybe String
forall a. a -> Maybe a
Just (String -> Maybe String)
-> (String -> String) -> String -> Maybe String
forall b c a. (b -> c) -> (a -> b) -> a -> c
. String -> String
normalise (String -> Maybe String) -> IO String -> IO (Maybe String)
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
<$> CString -> IO String
peekCString CString
c_encoding_ptr
  where
    normalise :: String -> String
normalise String
"GB18030" = String
"gb18030"
    normalise String
x         = String
x

-- | Detect the encoding for a 'L.ByteString' and attempt to create a 'TextEncoding' suitable for decoding it.
detectEncoding :: L.ByteString -> IO (Maybe TextEncoding)
detectEncoding :: ByteString -> IO (Maybe TextEncoding)
detectEncoding = (String -> IO TextEncoding)
-> Maybe String -> IO (Maybe TextEncoding)
forall (t :: * -> *) (f :: * -> *) a b.
(Traversable t, Applicative f) =>
(a -> f b) -> t a -> f (t b)
forall (f :: * -> *) a b.
Applicative f =>
(a -> f b) -> Maybe a -> f (Maybe b)
traverse String -> IO TextEncoding
mkTextEncoding (Maybe String -> IO (Maybe TextEncoding))
-> (ByteString -> Maybe String)
-> ByteString
-> IO (Maybe TextEncoding)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Maybe String
detectEncodingName