{-# LINE 1 "Data/Text/ICU/CharsetDetection.hsc" #-}
{-# LANGUAGE ForeignFunctionInterface #-}
-- |
-- Module      : Data.Text.ICU.CharsetDetection
-- Copyright   : (c) 2017 Zac Slade
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- Access to the Unicode Character Set Detection facilities, implemented in
-- the International Components for Unicode (ICU) libraries.
--
-- For more information see the \"Character Set Detection\" chapter
-- in the ICU User Guide
-- <http://userguide.icu-project.org/conversion/detection>.
module Data.Text.ICU.CharsetDetection
    (
      setText
    , detect
    , mkCharsetDetector
    , withCharsetDetector
    , wrapUCharsetMatch
    , CharsetMatch
    , CharsetDetector
    , getConfidence
    , getName
    , getLanguage
    ) where

import Foreign.Ptr (Ptr)
import Foreign.C.String (CString)
import Foreign.C.Types (CChar)
import qualified Data.ByteString as BS
import Data.ByteString (ByteString)
import qualified Data.Text.Encoding as TE
import Data.Text (Text)

import Data.Text.ICU.Error.Internal (UErrorCode, handleError)
import Data.Text.ICU.CharsetDetection.Internal (UCharsetMatch, UCharsetDetector,
                                                CharsetDetector, CharsetMatch,
                                                mkCharsetDetector,
                                                withCharsetDetector,
                                                withCharsetMatch,
                                                wrapUCharsetMatch)



-- | From the ICU C API documentation:
-- "Character set detection is at best an imprecise operation. The
-- detection process will attempt to identify the charset that best matches
-- the characteristics of the byte data, but the process is partly statistical
-- in nature, and the results can not be guaranteed to always be correct.
--
-- For best accuracy in charset detection, the input data should be primarily
-- in a single language, and a minimum of a few hundred bytes worth of plain
-- text in the language are needed. The detection process will attempt to
-- ignore html or xml style markup that could otherwise obscure the content."

-- | Use the first 512 bytes, if available, as the text in the
-- 'CharsetDetector' object. This function is low-level and used by the more
-- high-level 'detect' function.
setText :: ByteString -> CharsetDetector -> IO ()
setText :: ByteString -> CharsetDetector -> IO ()
setText ByteString
bs CharsetDetector
ucsd = CharsetDetector -> (Ptr UCharsetDetector -> IO ()) -> IO ()
forall a. CharsetDetector -> (Ptr UCharsetDetector -> IO a) -> IO a
withCharsetDetector CharsetDetector
ucsd Ptr UCharsetDetector -> IO ()
go
  where
    go :: Ptr UCharsetDetector -> IO ()
go Ptr UCharsetDetector
u = if ByteString -> Int
BS.length ByteString
bs Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
512
              then ByteString -> (CStringLen -> IO ()) -> IO ()
forall a. ByteString -> (CStringLen -> IO a) -> IO a
BS.useAsCStringLen ByteString
bs (\(Ptr CChar
text,Int
size) -> (Ptr UErrorCode -> IO ()) -> IO ()
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO ()) -> IO ())
-> (Ptr UErrorCode -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO ()
ucsdet_setText Ptr UCharsetDetector
u Ptr CChar
text Int
size)
              else ByteString -> (CStringLen -> IO ()) -> IO ()
forall a. ByteString -> (CStringLen -> IO a) -> IO a
BS.useAsCStringLen (Int -> ByteString -> ByteString
BS.take Int
512 ByteString
bs) (\(Ptr CChar
text,Int
size) -> (Ptr UErrorCode -> IO ()) -> IO ()
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO ()) -> IO ())
-> (Ptr UErrorCode -> IO ()) -> IO ()
forall a b. (a -> b) -> a -> b
$ Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO ()
ucsdet_setText Ptr UCharsetDetector
u Ptr CChar
text Int
size)

-- | Attempt to perform a detection without an input filter. The best match
-- will be returned.
detect :: ByteString -> IO CharsetMatch
detect :: ByteString -> IO CharsetMatch
detect ByteString
bs = do
    CharsetDetector
ucsd <- IO CharsetDetector
mkCharsetDetector
    ByteString -> CharsetDetector -> IO ()
setText ByteString
bs CharsetDetector
ucsd
    CharsetDetector -> IO (Ptr UCharsetMatch) -> IO CharsetMatch
wrapUCharsetMatch CharsetDetector
ucsd (IO (Ptr UCharsetMatch) -> IO CharsetMatch)
-> IO (Ptr UCharsetMatch) -> IO CharsetMatch
forall a b. (a -> b) -> a -> b
$ CharsetDetector
-> (Ptr UCharsetDetector -> IO (Ptr UCharsetMatch))
-> IO (Ptr UCharsetMatch)
forall a. CharsetDetector -> (Ptr UCharsetDetector -> IO a) -> IO a
withCharsetDetector CharsetDetector
ucsd ((Ptr UErrorCode -> IO (Ptr UCharsetMatch))
-> IO (Ptr UCharsetMatch)
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO (Ptr UCharsetMatch))
 -> IO (Ptr UCharsetMatch))
-> (Ptr UCharsetDetector
    -> Ptr UErrorCode -> IO (Ptr UCharsetMatch))
-> Ptr UCharsetDetector
-> IO (Ptr UCharsetMatch)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UCharsetDetector -> Ptr UErrorCode -> IO (Ptr UCharsetMatch)
ucsdet_detect)

-- | See the confidence score from 0-100 of the 'CharsetMatch' object.
getConfidence :: CharsetMatch -> IO Int
getConfidence :: CharsetMatch -> IO Int
getConfidence CharsetMatch
ucm = CharsetMatch -> (Ptr UCharsetMatch -> IO Int) -> IO Int
forall a. CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a
withCharsetMatch CharsetMatch
ucm ((Ptr UCharsetMatch -> IO Int) -> IO Int)
-> (Ptr UCharsetMatch -> IO Int) -> IO Int
forall a b. (a -> b) -> a -> b
$ (Ptr UErrorCode -> IO Int) -> IO Int
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO Int) -> IO Int)
-> (Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int)
-> Ptr UCharsetMatch
-> IO Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int
ucsdet_getConfidence

-- | Extract the character set encoding name from the 'CharsetMatch'
-- object.
getName :: CharsetMatch -> IO Text
getName :: CharsetMatch -> IO Text
getName CharsetMatch
ucsm = do
    ByteString
bs <- CharsetMatch
-> (Ptr UCharsetMatch -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a
withCharsetMatch CharsetMatch
ucsm ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar))
-> (Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar))
-> Ptr UCharsetMatch
-> IO (Ptr CChar)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar)
ucsdet_getName) IO (Ptr CChar) -> (Ptr CChar -> IO ByteString) -> IO ByteString
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Ptr CChar -> IO ByteString
BS.packCString
    Text -> IO Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ ByteString -> Text
TE.decodeUtf8 ByteString
bs

-- | Extracts the three letter ISO code for the language encoded in the
-- 'CharsetMatch'.
getLanguage :: CharsetMatch -> IO Text
getLanguage :: CharsetMatch -> IO Text
getLanguage CharsetMatch
ucsm = do
    ByteString
bs <- CharsetMatch
-> (Ptr UCharsetMatch -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. CharsetMatch -> (Ptr UCharsetMatch -> IO a) -> IO a
withCharsetMatch CharsetMatch
ucsm ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar)
forall a. (Ptr UErrorCode -> IO a) -> IO a
handleError ((Ptr UErrorCode -> IO (Ptr CChar)) -> IO (Ptr CChar))
-> (Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar))
-> Ptr UCharsetMatch
-> IO (Ptr CChar)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Ptr UCharsetMatch -> Ptr UErrorCode -> IO (Ptr CChar)
ucsdet_getLanguage) IO (Ptr CChar) -> (Ptr CChar -> IO ByteString) -> IO ByteString
forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= Ptr CChar -> IO ByteString
BS.packCString
    Text -> IO Text
forall (m :: * -> *) a. Monad m => a -> m a
return (Text -> IO Text) -> Text -> IO Text
forall a b. (a -> b) -> a -> b
$ ByteString -> Text
TE.decodeUtf8 ByteString
bs

foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_setText" ucsdet_setText
    :: Ptr UCharsetDetector -> Ptr CChar -> Int -> Ptr UErrorCode -> IO ()

foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_detect" ucsdet_detect
    :: Ptr UCharsetDetector -> Ptr UErrorCode -> IO (Ptr UCharsetMatch)

foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getConfidence" ucsdet_getConfidence
    :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO Int

foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getName" ucsdet_getName
    :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString

foreign import ccall unsafe "hs_text_icu.h __hs_ucsdet_getLanguage" ucsdet_getLanguage
    :: Ptr UCharsetMatch -> Ptr UErrorCode -> IO CString