{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}

-- | Tools to check and prepare data to be parsed as valid unicode.
--
-- The following is an example of converting dubious data to a text.
--
-- > textDecode :: ByteString -> Text
-- > textDecode b =
-- >   case bomStrip b of
-- >     (Nothing, s)           -> T.decodeUtf8 $ unicodeCleanUTF8 s -- Assume UTF8
-- >     (Just BOM_UTF8, s)     -> T.decodeUtf8 $ unicodeCleanUTF8 s
-- >     (Just BOM_UTF16_BE, s) -> T.decodeUtf16BE s
-- >     (Just BOM_UTF16_LE, s) -> T.decodeUtf16LE s
-- >     (Just BOM_UTF32_BE, s) -> T.decodeUtf32BE s
-- >     (Just BOM_UTF32_LE, s) -> T.decodeUtf32LE s
module Zenacy.Unicode
  ( BOM(..)
  , bomStrings
  , bomStrip
  , unicodeCleanUTF8
  ) where

import Foreign
  ( castPtr
  , withForeignPtr
  )
import Control.Monad.ST
  ( ST
  , runST
  )
import Data.STRef
  ( STRef
  , newSTRef
  , readSTRef
  , writeSTRef
  )
import Data.ByteString
  ( ByteString
  )
import qualified Data.ByteString as S
  ( index
  , length
  , null
  , packCStringLen
  , pack
  , stripPrefix
  )
import Data.Vector.Storable.Mutable
  ( MVector(..)
  )
import qualified Data.Vector.Storable.Mutable as U
  ( new
  , length
  , write
  , grow
  , unsafeToForeignPtr0
  )
import Data.Word
  ( Word8
  )
import System.IO.Unsafe
  ( unsafePerformIO
  )

-- | Defines the unicode byte order mark.
data BOM
  = BOM_UTF8
  | BOM_UTF16_BE
  | BOM_UTF16_LE
  | BOM_UTF32_BE
  | BOM_UTF32_LE
    deriving (BOM -> BOM -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: BOM -> BOM -> Bool
$c/= :: BOM -> BOM -> Bool
== :: BOM -> BOM -> Bool
$c== :: BOM -> BOM -> Bool
Eq, Eq BOM
BOM -> BOM -> Bool
BOM -> BOM -> Ordering
BOM -> BOM -> BOM
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: BOM -> BOM -> BOM
$cmin :: BOM -> BOM -> BOM
max :: BOM -> BOM -> BOM
$cmax :: BOM -> BOM -> BOM
>= :: BOM -> BOM -> Bool
$c>= :: BOM -> BOM -> Bool
> :: BOM -> BOM -> Bool
$c> :: BOM -> BOM -> Bool
<= :: BOM -> BOM -> Bool
$c<= :: BOM -> BOM -> Bool
< :: BOM -> BOM -> Bool
$c< :: BOM -> BOM -> Bool
compare :: BOM -> BOM -> Ordering
$ccompare :: BOM -> BOM -> Ordering
Ord, Int -> BOM -> ShowS
[BOM] -> ShowS
BOM -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [BOM] -> ShowS
$cshowList :: [BOM] -> ShowS
show :: BOM -> String
$cshow :: BOM -> String
showsPrec :: Int -> BOM -> ShowS
$cshowsPrec :: Int -> BOM -> ShowS
Show)

-- | Defines the byte order mark signatures.
bomStrings :: [(BOM,ByteString)]
bomStrings :: [(BOM, ByteString)]
bomStrings =
  [ ( BOM
BOM_UTF8,     [Word8] -> ByteString
S.pack [ Word8
0xEF, Word8
0xBB, Word8
0xBF ] )
  , ( BOM
BOM_UTF32_BE, [Word8] -> ByteString
S.pack [ Word8
0x00, Word8
0x00, Word8
0xFE, Word8
0xFF ] )
  , ( BOM
BOM_UTF32_LE, [Word8] -> ByteString
S.pack [ Word8
0xFF, Word8
0xFE, Word8
0x00, Word8
0x00 ] )
  -- The 16 bit codes need to be checked after the 32 bit codes,
  -- because the prefixes are similar.
  , ( BOM
BOM_UTF16_BE, [Word8] -> ByteString
S.pack [ Word8
0xFE, Word8
0xFF ] )
  , ( BOM
BOM_UTF16_LE, [Word8] -> ByteString
S.pack [ Word8
0xFF, Word8
0xFE ] )
  ]

-- | Remove the BOM from the start of a string.
bomStrip :: ByteString -> (Maybe BOM, ByteString)
bomStrip :: ByteString -> (Maybe BOM, ByteString)
bomStrip ByteString
x =
  forall {a}. [(a, ByteString)] -> (Maybe a, ByteString)
go [(BOM, ByteString)]
bomStrings
  where
    go :: [(a, ByteString)] -> (Maybe a, ByteString)
go [] =
      (forall a. Maybe a
Nothing, ByteString
x)
    go ((a
b,ByteString
s):[(a, ByteString)]
bs) =
      case ByteString -> ByteString -> Maybe ByteString
S.stripPrefix ByteString
s ByteString
x of
        Just ByteString
x' -> (forall a. a -> Maybe a
Just a
b, ByteString
x')
        Maybe ByteString
Nothing -> [(a, ByteString)] -> (Maybe a, ByteString)
go [(a, ByteString)]
bs

-- | Removes bad characters and nulls from a UTF8 byte string.
unicodeCleanUTF8 :: ByteString -> ByteString
unicodeCleanUTF8 :: ByteString -> ByteString
unicodeCleanUTF8 ByteString
x =
  forall a. (forall s. ST s a) -> a
runST forall a b. (a -> b) -> a -> b
$ do
    MVector s Word8
v <- forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
Int -> m (MVector (PrimState m) a)
U.new Int
100
    forall {s}. Int -> Int -> MVector s Word8 -> ST s ByteString
go Int
0 Int
0 MVector s Word8
v
  where
    go :: Int -> Int -> MVector s Word8 -> ST s ByteString
go Int
i Int
j MVector s Word8
u
      | Int
i forall a. Eq a => a -> a -> Bool
== ByteString -> Int
S.length ByteString
x = do
          forall s. MVector s Word8 -> Int -> ST s ByteString
dataString MVector s Word8
u Int
j
      | Bool
otherwise = do
          MVector s Word8
v <- if Int
j forall a. Num a => a -> a -> a
+ Int
3 forall a. Ord a => a -> a -> Bool
< forall a s. Storable a => MVector s a -> Int
U.length MVector s Word8
u
                  then forall (f :: * -> *) a. Applicative f => a -> f a
pure MVector s Word8
u
                  else forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> m (MVector (PrimState m) a)
U.grow MVector s Word8
u forall a b. (a -> b) -> a -> b
$ forall a s. Storable a => MVector s a -> Int
U.length MVector s Word8
u

          let c0 :: Word8
c0 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
0)
              c1 :: Word8
c1 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
1)
              c2 :: Word8
c2 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
2)
              c3 :: Word8
c3 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
3)

          if | (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0x01 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0x7F) -> do
                 forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
                 Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
1) (Int
j forall a. Num a => a -> a -> a
+ Int
1) MVector s Word8
v

             | (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0xC0 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0xDF) -> do
                 if | Int
i forall a. Num a => a -> a -> a
+ Int
1 forall a. Ord a => a -> a -> Bool
< ByteString -> Int
S.length ByteString
x Bool -> Bool -> Bool
&&
                      (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
c1
                        Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
2) (Int
j forall a. Num a => a -> a -> a
+ Int
2) MVector s Word8
v
                    | Bool
otherwise -> do
                        Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v

             | (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0xE0 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0xEF) -> do
                 if | Int
i forall a. Num a => a -> a -> a
+ Int
2 forall a. Ord a => a -> a -> Bool
< ByteString -> Int
S.length ByteString
x Bool -> Bool -> Bool
&&
                      (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
                      (Word8
c2 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c2 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
c1
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
2) Word8
c2
                        Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
3) (Int
j forall a. Num a => a -> a -> a
+ Int
3) MVector s Word8
v
                    | (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
                        Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
2) Int
j MVector s Word8
v
                    | Bool
otherwise ->
                        Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v

             | (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0xF0 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0xF7) -> do
                 if | Int
i forall a. Num a => a -> a -> a
+ Int
3 forall a. Ord a => a -> a -> Bool
< ByteString -> Int
S.length ByteString
x Bool -> Bool -> Bool
&&
                      (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
                      (Word8
c2 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c2 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
                      (Word8
c3 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c3 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
c1
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
2) Word8
c2
                        forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
3) Word8
c3
                        Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
4) (Int
j forall a. Num a => a -> a -> a
+ Int
4) MVector s Word8
v
                    | (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
                      (Word8
c2 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c2 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
                        Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
3) Int
j MVector s Word8
v
                    | (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
                        Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
2) Int
j MVector s Word8
v
                    | Bool
otherwise ->
                        Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v

             | Bool
otherwise -> do
                 Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v

    rep :: Int -> Int -> MVector s Word8 -> ST s ByteString
rep Int
i Int
j MVector s Word8
v = do
      forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
0xEF
      forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
0xBF
      forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
2) Word8
0xBD
      Int -> Int -> MVector s Word8 -> ST s ByteString
go Int
i (Int
j forall a. Num a => a -> a -> a
+ Int
3) MVector s Word8
v

-- | Converts a storable vector to a byte string.
dataString :: MVector s Word8 -> Int -> ST s ByteString
dataString :: forall s. MVector s Word8 -> Int -> ST s ByteString
dataString MVector s Word8
v Int
n =
  forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ do
    let (ForeignPtr Word8
f, Int
_) = forall s a. MVector s a -> (ForeignPtr a, Int)
U.unsafeToForeignPtr0 MVector s Word8
v
    forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
f forall a b. (a -> b) -> a -> b
$ \Ptr Word8
p ->
      CStringLen -> IO ByteString
S.packCStringLen (forall a b. Ptr a -> Ptr b
castPtr Ptr Word8
p, Int
n)