Haskell Code by HsColour

{-# LANGUAGE CPP, ForeignFunctionInterface #-}
{-| 
 This module contains functions to do Unicode normalization of CompactStrings.
-}
module Text.Unicode.Normalization(NormalizationMode(..)
                                 ,normalizationToCInt -- voor QuickCheck
                                 ,NormalizationOption(..)
                                 ,normalize
                                 ,NormalizationCheckResult(..)
                                 ,quickCheck
                                 ,isNormalized
                                 ,concatenate
                                 ,ComparisonOption(..)
                                 ,compare
                                 ) where

import Prelude hiding (compare)

-- for the foreign hackery
import System.IO.Unsafe(unsafePerformIO)
import Foreign.Ptr
import Foreign.C.Types
import Data.Int

-- Strings in UTF-16
import Data.CompactString(CompactString)
import Data.CompactString.Encodings(UTF16Native)
import qualified Data.CompactString as CS
import qualified Data.CompactString.Unsafe as CS(unsafeFromByteString)
#if __GLASGOW_HASKELL__ >= 608
import Data.ByteString.Internal(createAndTrim)
#else
import Data.ByteString.Base(createAndTrim)
#endif

import Text.Unicode.Base

-- debugging
import Debug.Trace

{-|
 A data type for representing an ICU Normalization type. You use this to specify
 how you'd like ICU to normalize your string.
-}
data NormalizationMode = NFD | NFKD | NFC | NFKC | FCD deriving (Eq, Show)

{-|
 Options to pass to normalize.

 There is only one option ATM.
-}
data NormalizationOption = Unicode3_2 -- ^ Normalize according to Unicode 3.2 
                           deriving (Eq, Show)

instance BitPackable NormalizationOption where
    intValue Unicode3_2 = 0x20

{-|
 Internal function to convert a NormalizationMode to its C enum value
-}
normalizationToCInt :: NormalizationMode -> CInt
normalizationToCInt NFD = 2
normalizationToCInt NFKD = 3
normalizationToCInt NFC = 4
normalizationToCInt NFKC = 5
normalizationToCInt FCD = 6

foreign import ccall "unicode/unorm.h unorm_normalize_3_8" raw_normalize :: Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32

{-|
 Normalizes the given string, according to the given normalization type and options.

 This function is a higher-level wrapper around raw_normalize.

 Move this to something like Data.CompactString.Normalization, eventually.

 Generalize out the UErrorCode handling.
-}
normalize :: CompactString UTF16Native -> NormalizationMode -> [NormalizationOption] -> CompactString UTF16Native
-- bouw weer CompactString van bytestring waaruit buf en bufSize voortgekomen zijn -- 
normalize s n opts = CS.unsafeFromByteString $ unsafePerformIO $ createAndTrim maxBufSizeWord8s $ \tptr ->
                         withCompactString s $ \sptr slen ->
                             withPackedOptions opts $ \rawOptions ->
                                 let buf = castPtr tptr :: Ptr UChar
                                     rawNormalization = normalizationToCInt n
                                     rawBufSize = fromIntegral maxBufSizeUChars
                                 in do resultLength <- handleError $ \errptr ->
                                                           raw_normalize sptr slen rawNormalization rawOptions buf rawBufSize errptr
                                       -- resultLength is in UChars,return value in Word8s.
                                       -- So multiply by two.
                                       return (fromIntegral resultLength * 2) 
    where -- normalization to NFD never takes more than three times the space of
          -- the original string, according to Unicode
          maxBufSizeWord8s = 3 * 2 * CS.length s
          maxBufSizeUChars = 3 * CS.length s

{-|
 A type for the result of a quick normalization check.
-}
data NormalizationCheckResult = Normalized | NotNormalized | MaybeNormalized deriving (Eq, Show)

{-|
 Translates a normalization check result from C land to a NormalizationCheckResult.
-}
int32ToNormalizationCheckResult :: Int32 -> NormalizationCheckResult
int32ToNormalizationCheckResult 0 = NotNormalized
int32ToNormalizationCheckResult 1 = Normalized
int32ToNormalizationCheckResult 2 = MaybeNormalized

foreign import ccall "unicode/unorm.h unorm_quickCheckWithOptions_3_8" raw_quickCheck :: Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UErrorCode -> IO Int32

{-| 
  Attempts to check quickly whether a string is already normalized according to a certain 
  normalization mode.

  When you get MaybeNormalized as a result, you should normalize the 
  string and compare it to the original to know if it is normalized. You can
  make ICU do that by calling isNormalized.
-}
quickCheck :: CompactString UTF16Native -> NormalizationMode -> [NormalizationOption] -> NormalizationCheckResult
quickCheck s nm opts = int32ToNormalizationCheckResult $ unsafePerformIO $ withCompactString s $ \buf len ->
                           withPackedOptions opts $ \rawOpts ->
                           handleError $ \errptr -> raw_quickCheck buf len rawNormalizationMode rawOpts errptr
  where rawNormalizationMode = normalizationToCInt nm

foreign import ccall "unicode/unorm.h unorm_isNormalizedWithOptions_3_8" raw_isNormalized :: Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UErrorCode -> IO UBool

{-| 
  Tells of a string whether it is already normalized according to a certain 
  mode and options
-}
isNormalized :: CompactString UTF16Native -> NormalizationMode -> [NormalizationOption] -> Bool
isNormalized s nm opts = uBoolToBool $ unsafePerformIO $ withCompactString s $ \buf len ->
                             withPackedOptions opts $ \rawOpts -> 
                                 handleError $ \errptr -> raw_isNormalized buf len rawNormalizationMode rawOpts errptr
    where rawNormalizationMode = normalizationToCInt nm

foreign import ccall "unicode/unorm.h unorm_concatenate_3_8" raw_concatenate :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UErrorCode -> IO Int32

{-|
 Concatenates two normalized strings, such that the result is also normalized.

 More formally:
   Given that string1 is normalized according to mode and options, and string2
   is normalized according to mode and options, the result of concatenate
   string1 string2 mode options will be a concatenation of string1 and string2
   and be normalized according to mode and options.
-}
concatenate :: CompactString UTF16Native
            -> CompactString UTF16Native
            -> NormalizationMode
            -> [NormalizationOption]
            -> CompactString UTF16Native
concatenate s1 s2 m opts = CS.unsafeFromByteString $ unsafePerformIO $ createAndTrim maxBufSizeWord8s $ \targetPtrWord8 -> 
                               withCompactString s1 $ \buf1 len1 ->
                                   withCompactString s2 $ \buf2 len2 ->
                                       withPackedOptions opts $ \rawOpts ->
                                           handleError $ \errptr ->
                                               let targetPtr = castPtr targetPtrWord8 :: Ptr UChar
                                               in do bufUsed <- raw_concatenate buf1 len1 
                                                                                buf2 len2 
                                                                                targetPtr maxBufSizeUChars 
                                                                                rawMode rawOpts errptr
                                                     -- again, we get an Int32
                                                     -- from C but we must
                                                     -- supply an Int to
                                                     -- createAndTrim
                                                     return (fromIntegral bufUsed * 2)
  where rawMode = normalizationToCInt m
        -- the buffer size estimate is overly big, but I can't come up with
        -- anything less ATM
        maxBufSizeWord8s :: Int
        maxBufSizeWord8s = fromIntegral (2 * maxBufSizeUChars)
        maxBufSizeUChars :: Int32
        maxBufSizeUChars = fromIntegral (3 * (CS.length s1 + CS.length s2))

{-|
 A data type to encode options to the compare function.
-}
data ComparisonOption = InputIsFCD -- ^ Assume that both strings are FCD normalized
                      | IgnoreCase -- ^ Do case-insensitive comparison
                      | CompareCodePointOrder -- ^ Compare by code point order (default is code unit order)
                      deriving (Eq, Show)
instance BitPackable ComparisonOption where
     intValue InputIsFCD = 0x20000
     intValue IgnoreCase = 0x10000
     intValue CompareCodePointOrder = 0x8000

foreign import ccall "unicode/unorm.h unorm_compare_3_8" raw_compare :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Int32 -> Ptr UErrorCode -> IO Int32 

{-|
 Compares two Unicode strings for canonical equivalence.

 Two Unicode strings are canonically equivalent when their NFD and NFC 
 normalizations are equal.
-}
compare :: CompactString UTF16Native -> CompactString UTF16Native -> [ComparisonOption] -> Ordering
compare s1 s2 opts = cOrderingToOrdering $ unsafePerformIO $ withCompactString s1 $ \buf1 len1 ->
                         withCompactString s2 $ \buf2 len2 ->
                             withPackedOptions opts $ \rawOpts ->
                                 handleError $ \errptr ->
                                     raw_compare buf1 len1 buf2 len2 rawOpts errptr