{-# LANGUAGE CPP, ForeignFunctionInterface #-} {-| This module contains functions to do Unicode normalization of CompactStrings. -} module Text.Unicode.Normalization(NormalizationMode(..) ,normalizationToCInt -- voor QuickCheck ,NormalizationOption(..) ,normalize ,NormalizationCheckResult(..) ,quickCheck ,isNormalized ,concatenate ,ComparisonOption(..) ,compare ) where import Prelude hiding (compare) -- for the foreign hackery import System.IO.Unsafe(unsafePerformIO) import Foreign.Ptr import Foreign.C.Types import Data.Int -- Strings in UTF-16 import Data.CompactString(CompactString) import Data.CompactString.Encodings(UTF16Native) import qualified Data.CompactString as CS import qualified Data.CompactString.Unsafe as CS(unsafeFromByteString) #if __GLASGOW_HASKELL__ >= 608 import Data.ByteString.Internal(createAndTrim) #else import Data.ByteString.Base(createAndTrim) #endif import Text.Unicode.Base -- debugging import Debug.Trace {-| A data type for representing an ICU Normalization type. You use this to specify how you'd like ICU to normalize your string. -} data NormalizationMode = NFD | NFKD | NFC | NFKC | FCD deriving (Eq, Show) {-| Options to pass to normalize. There is only one option ATM. -} data NormalizationOption = Unicode3_2 -- ^ Normalize according to Unicode 3.2 deriving (Eq, Show) instance BitPackable NormalizationOption where intValue Unicode3_2 = 0x20 {-| Internal function to convert a NormalizationMode to its C enum value -} normalizationToCInt :: NormalizationMode -> CInt normalizationToCInt NFD = 2 normalizationToCInt NFKD = 3 normalizationToCInt NFC = 4 normalizationToCInt NFKC = 5 normalizationToCInt FCD = 6 foreign import ccall "unicode/unorm.h unorm_normalize_3_8" raw_normalize :: Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UChar -> Int32 -> Ptr UErrorCode -> IO Int32 {-| Normalizes the given string, according to the given normalization type and options. This function is a higher-level wrapper around raw_normalize. Move this to something like Data.CompactString.Normalization, eventually. Generalize out the UErrorCode handling. -} normalize :: CompactString UTF16Native -> NormalizationMode -> [NormalizationOption] -> CompactString UTF16Native -- bouw weer CompactString van bytestring waaruit buf en bufSize voortgekomen zijn -- normalize s n opts = CS.unsafeFromByteString $ unsafePerformIO $ createAndTrim maxBufSizeWord8s $ \tptr -> withCompactString s $ \sptr slen -> withPackedOptions opts $ \rawOptions -> let buf = castPtr tptr :: Ptr UChar rawNormalization = normalizationToCInt n rawBufSize = fromIntegral maxBufSizeUChars in do resultLength <- handleError $ \errptr -> raw_normalize sptr slen rawNormalization rawOptions buf rawBufSize errptr -- resultLength is in UChars,return value in Word8s. -- So multiply by two. return (fromIntegral resultLength * 2) where -- normalization to NFD never takes more than three times the space of -- the original string, according to Unicode maxBufSizeWord8s = 3 * 2 * CS.length s maxBufSizeUChars = 3 * CS.length s {-| A type for the result of a quick normalization check. -} data NormalizationCheckResult = Normalized | NotNormalized | MaybeNormalized deriving (Eq, Show) {-| Translates a normalization check result from C land to a NormalizationCheckResult. -} int32ToNormalizationCheckResult :: Int32 -> NormalizationCheckResult int32ToNormalizationCheckResult 0 = NotNormalized int32ToNormalizationCheckResult 1 = Normalized int32ToNormalizationCheckResult 2 = MaybeNormalized foreign import ccall "unicode/unorm.h unorm_quickCheckWithOptions_3_8" raw_quickCheck :: Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UErrorCode -> IO Int32 {-| Attempts to check quickly whether a string is already normalized according to a certain normalization mode. When you get MaybeNormalized as a result, you should normalize the string and compare it to the original to know if it is normalized. You can make ICU do that by calling isNormalized. -} quickCheck :: CompactString UTF16Native -> NormalizationMode -> [NormalizationOption] -> NormalizationCheckResult quickCheck s nm opts = int32ToNormalizationCheckResult $ unsafePerformIO $ withCompactString s $ \buf len -> withPackedOptions opts $ \rawOpts -> handleError $ \errptr -> raw_quickCheck buf len rawNormalizationMode rawOpts errptr where rawNormalizationMode = normalizationToCInt nm foreign import ccall "unicode/unorm.h unorm_isNormalizedWithOptions_3_8" raw_isNormalized :: Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UErrorCode -> IO UBool {-| Tells of a string whether it is already normalized according to a certain mode and options -} isNormalized :: CompactString UTF16Native -> NormalizationMode -> [NormalizationOption] -> Bool isNormalized s nm opts = uBoolToBool $ unsafePerformIO $ withCompactString s $ \buf len -> withPackedOptions opts $ \rawOpts -> handleError $ \errptr -> raw_isNormalized buf len rawNormalizationMode rawOpts errptr where rawNormalizationMode = normalizationToCInt nm foreign import ccall "unicode/unorm.h unorm_concatenate_3_8" raw_concatenate :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> CInt -> Int32 -> Ptr UErrorCode -> IO Int32 {-| Concatenates two normalized strings, such that the result is also normalized. More formally: Given that string1 is normalized according to mode and options, and string2 is normalized according to mode and options, the result of concatenate string1 string2 mode options will be a concatenation of string1 and string2 and be normalized according to mode and options. -} concatenate :: CompactString UTF16Native -> CompactString UTF16Native -> NormalizationMode -> [NormalizationOption] -> CompactString UTF16Native concatenate s1 s2 m opts = CS.unsafeFromByteString $ unsafePerformIO $ createAndTrim maxBufSizeWord8s $ \targetPtrWord8 -> withCompactString s1 $ \buf1 len1 -> withCompactString s2 $ \buf2 len2 -> withPackedOptions opts $ \rawOpts -> handleError $ \errptr -> let targetPtr = castPtr targetPtrWord8 :: Ptr UChar in do bufUsed <- raw_concatenate buf1 len1 buf2 len2 targetPtr maxBufSizeUChars rawMode rawOpts errptr -- again, we get an Int32 -- from C but we must -- supply an Int to -- createAndTrim return (fromIntegral bufUsed * 2) where rawMode = normalizationToCInt m -- the buffer size estimate is overly big, but I can't come up with -- anything less ATM maxBufSizeWord8s :: Int maxBufSizeWord8s = fromIntegral (2 * maxBufSizeUChars) maxBufSizeUChars :: Int32 maxBufSizeUChars = fromIntegral (3 * (CS.length s1 + CS.length s2)) {-| A data type to encode options to the compare function. -} data ComparisonOption = InputIsFCD -- ^ Assume that both strings are FCD normalized | IgnoreCase -- ^ Do case-insensitive comparison | CompareCodePointOrder -- ^ Compare by code point order (default is code unit order) deriving (Eq, Show) instance BitPackable ComparisonOption where intValue InputIsFCD = 0x20000 intValue IgnoreCase = 0x10000 intValue CompareCodePointOrder = 0x8000 foreign import ccall "unicode/unorm.h unorm_compare_3_8" raw_compare :: Ptr UChar -> Int32 -> Ptr UChar -> Int32 -> Int32 -> Ptr UErrorCode -> IO Int32 {-| Compares two Unicode strings for canonical equivalence. Two Unicode strings are canonically equivalent when their NFD and NFC normalizations are equal. -} compare :: CompactString UTF16Native -> CompactString UTF16Native -> [ComparisonOption] -> Ordering compare s1 s2 opts = cOrderingToOrdering $ unsafePerformIO $ withCompactString s1 $ \buf1 len1 -> withCompactString s2 $ \buf2 len2 -> withPackedOptions opts $ \rawOpts -> handleError $ \errptr -> raw_compare buf1 len1 buf2 len2 rawOpts errptr