-------------------------------------------------------------------------------- {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE NoImplicitPrelude #-} {-# OPTIONS_GHC -O2 #-} -------------------------------------------------------------------------------- module Data.ByteString.Internal.Utf8 ( isUtf8 , isUtf8' ) where -------------------------------------------------------------------------------- import Data.Bits ((.&.)) import Data.ByteString.Internal (ByteString(..), accursedUnutterablePerformIO) import Data.ByteString.Internal.Ascii (isAscii) import Data.Word (Word8) import GHC.Base import GHC.Ptr import Foreign.ForeignPtr (withForeignPtr) import Foreign.Storable (peek) -------------------------------------------------------------------------------- data Utf8 = U8 | U16 | U24 | U32 | UNot -- deriving Eq which :: Word8 -> Utf8 {-# inline which #-} which !c | isUtf8b8 c = U8 | isUtf8b16 c = U16 | isUtf8b24 c = U24 | isUtf8b32 c = U32 | otherwise = UNot -------------------------------------------------------------------------------- isUtf8Ptr :: Ptr Word8 -> Ptr Word8 -> IO Bool isUtf8Ptr !p !q | p == q = pure True | otherwise = do c <- peek p case which c of U8 -> isUtf8Ptr (p `plusPtr` 1) q U16 -> if q `minusPtr` p >= 2 then do d <- peek (p `plusPtr` 1) if isUtf8OtherBytes d then isUtf8Ptr (p `plusPtr` 2) q else pure False else pure False U24 -> if q `minusPtr` p >= 3 then do d <- peek (p `plusPtr` 1); e <- peek (p `plusPtr` 2) if (isUtf8OtherBytes d && isUtf8OtherBytes e) then isUtf8Ptr (p `plusPtr` 3) q else pure False else pure False U32 -> if q `minusPtr` p >= 4 then do d <- peek (p `plusPtr` 1); e <- peek (p `plusPtr` 2); f <- peek (p `plusPtr` 3); if (isUtf8OtherBytes d && isUtf8OtherBytes e && isUtf8OtherBytes f) then isUtf8Ptr (p `plusPtr` 4) q else pure False else pure False UNot -> pure False -------------------------------------------------------------------------------- -- Hex: 0x80 -- Binary: 10000000 -- Decimal: 128 isUtf8b8 :: Word8 -> Bool {-# inline isUtf8b8 #-} isUtf8b8 !w = w .&. 0x80 == 0 -- Hex: 0xE0 -- Binary: 11100000 -- Decimal: 224 isUtf8b16 :: Word8 -> Bool {-# inline isUtf8b16 #-} isUtf8b16 !w = w .&. 0xE0 == 0xC0 -- Hex: 0xF0 -- Binary: 11110000 -- Decimal: 240 isUtf8b24 :: Word8 -> Bool {-# inline isUtf8b24 #-} isUtf8b24 !w = w .&. 0xF0 == 0xE0 -- Hex: 0xF8 -- Binary: 11111000 -- Decimal: 248 isUtf8b32 :: Word8 -> Bool {-# inline isUtf8b32 #-} isUtf8b32 !w = w .&. 0xF8 == 0xF0 -- Hex: 0xC0 -- Binary: 11000000 -- Decimal: 192 isUtf8OtherBytes :: Word8 -> Bool {-# inline isUtf8OtherBytes #-} isUtf8OtherBytes !w = w .&. 0xC0 == 0x80 -------------------------------------------------------------------------------- -- | 'isUtf8' firsts calls the very fast 'isAscii' to see if -- the data is ASCII (and thus UTF-8). Use this if you know most of your -- data is ASCII-encoded. -- If you know that most of your data is probably not UTF8-encoded, it is probably -- best to use 'isUtf8'' to avoid this check. isUtf8 :: ByteString -> Bool isUtf8 (PS _ _ 0) = True isUtf8 b = if isAscii b then True else isUtf8' b {-# inline isUtf8 #-} -- | 'isUtf8'' does not call 'isAscii'. Use this if -- you know most of your data is not ASCII-encoded. -- If you know that most of your data is probably ASCII-encoded, it is -- probably best to use 'isUtf8'. isUtf8' :: ByteString -> Bool isUtf8' (PS _ _ 0) = True isUtf8' (PS fp (I# o#) (I# l#)) = accursedUnutterablePerformIO $ withForeignPtr fp $ \(Ptr addr) -> do let start, end :: Ptr Word8 start = Ptr (plusAddr# addr o#) end = Ptr (plusAddr# addr (o# +# l#)) isUtf8Ptr start end {-# inline isUtf8' #-} --------------------------------------------------------------------------------