{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE MultiWayIf #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Zenacy.Unicode
( BOM(..)
, bomStrings
, bomStrip
, unicodeCleanUTF8
) where
import Foreign
( castPtr
, withForeignPtr
)
import Control.Monad.ST
( ST
, runST
)
import Data.STRef
( STRef
, newSTRef
, readSTRef
, writeSTRef
)
import Data.ByteString
( ByteString
)
import qualified Data.ByteString as S
( index
, length
, null
, packCStringLen
, pack
, stripPrefix
)
import Data.Vector.Storable.Mutable
( MVector(..)
)
import qualified Data.Vector.Storable.Mutable as U
( new
, length
, write
, grow
, unsafeToForeignPtr0
)
import Data.Word
( Word8
)
import System.IO.Unsafe
( unsafePerformIO
)
data BOM
= BOM_UTF8
| BOM_UTF16_BE
| BOM_UTF16_LE
| BOM_UTF32_BE
| BOM_UTF32_LE
deriving (BOM -> BOM -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: BOM -> BOM -> Bool
$c/= :: BOM -> BOM -> Bool
== :: BOM -> BOM -> Bool
$c== :: BOM -> BOM -> Bool
Eq, Eq BOM
BOM -> BOM -> Bool
BOM -> BOM -> Ordering
BOM -> BOM -> BOM
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
min :: BOM -> BOM -> BOM
$cmin :: BOM -> BOM -> BOM
max :: BOM -> BOM -> BOM
$cmax :: BOM -> BOM -> BOM
>= :: BOM -> BOM -> Bool
$c>= :: BOM -> BOM -> Bool
> :: BOM -> BOM -> Bool
$c> :: BOM -> BOM -> Bool
<= :: BOM -> BOM -> Bool
$c<= :: BOM -> BOM -> Bool
< :: BOM -> BOM -> Bool
$c< :: BOM -> BOM -> Bool
compare :: BOM -> BOM -> Ordering
$ccompare :: BOM -> BOM -> Ordering
Ord, Int -> BOM -> ShowS
[BOM] -> ShowS
BOM -> String
forall a.
(Int -> a -> ShowS) -> (a -> String) -> ([a] -> ShowS) -> Show a
showList :: [BOM] -> ShowS
$cshowList :: [BOM] -> ShowS
show :: BOM -> String
$cshow :: BOM -> String
showsPrec :: Int -> BOM -> ShowS
$cshowsPrec :: Int -> BOM -> ShowS
Show)
bomStrings :: [(BOM,ByteString)]
bomStrings :: [(BOM, ByteString)]
bomStrings =
[ ( BOM
BOM_UTF8, [Word8] -> ByteString
S.pack [ Word8
0xEF, Word8
0xBB, Word8
0xBF ] )
, ( BOM
BOM_UTF32_BE, [Word8] -> ByteString
S.pack [ Word8
0x00, Word8
0x00, Word8
0xFE, Word8
0xFF ] )
, ( BOM
BOM_UTF32_LE, [Word8] -> ByteString
S.pack [ Word8
0xFF, Word8
0xFE, Word8
0x00, Word8
0x00 ] )
, ( BOM
BOM_UTF16_BE, [Word8] -> ByteString
S.pack [ Word8
0xFE, Word8
0xFF ] )
, ( BOM
BOM_UTF16_LE, [Word8] -> ByteString
S.pack [ Word8
0xFF, Word8
0xFE ] )
]
bomStrip :: ByteString -> (Maybe BOM, ByteString)
bomStrip :: ByteString -> (Maybe BOM, ByteString)
bomStrip ByteString
x =
forall {a}. [(a, ByteString)] -> (Maybe a, ByteString)
go [(BOM, ByteString)]
bomStrings
where
go :: [(a, ByteString)] -> (Maybe a, ByteString)
go [] =
(forall a. Maybe a
Nothing, ByteString
x)
go ((a
b,ByteString
s):[(a, ByteString)]
bs) =
case ByteString -> ByteString -> Maybe ByteString
S.stripPrefix ByteString
s ByteString
x of
Just ByteString
x' -> (forall a. a -> Maybe a
Just a
b, ByteString
x')
Maybe ByteString
Nothing -> [(a, ByteString)] -> (Maybe a, ByteString)
go [(a, ByteString)]
bs
unicodeCleanUTF8 :: ByteString -> ByteString
unicodeCleanUTF8 :: ByteString -> ByteString
unicodeCleanUTF8 ByteString
x =
forall a. (forall s. ST s a) -> a
runST forall a b. (a -> b) -> a -> b
$ do
MVector s Word8
v <- forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
Int -> m (MVector (PrimState m) a)
U.new Int
100
forall {s}. Int -> Int -> MVector s Word8 -> ST s ByteString
go Int
0 Int
0 MVector s Word8
v
where
go :: Int -> Int -> MVector s Word8 -> ST s ByteString
go Int
i Int
j MVector s Word8
u
| Int
i forall a. Eq a => a -> a -> Bool
== ByteString -> Int
S.length ByteString
x = do
forall s. MVector s Word8 -> Int -> ST s ByteString
dataString MVector s Word8
u Int
j
| Bool
otherwise = do
MVector s Word8
v <- if Int
j forall a. Num a => a -> a -> a
+ Int
3 forall a. Ord a => a -> a -> Bool
< forall a s. Storable a => MVector s a -> Int
U.length MVector s Word8
u
then forall (f :: * -> *) a. Applicative f => a -> f a
pure MVector s Word8
u
else forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> m (MVector (PrimState m) a)
U.grow MVector s Word8
u forall a b. (a -> b) -> a -> b
$ forall a s. Storable a => MVector s a -> Int
U.length MVector s Word8
u
let c0 :: Word8
c0 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
0)
c1 :: Word8
c1 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
1)
c2 :: Word8
c2 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
2)
c3 :: Word8
c3 = HasCallStack => ByteString -> Int -> Word8
S.index ByteString
x (Int
i forall a. Num a => a -> a -> a
+ Int
3)
if | (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0x01 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0x7F) -> do
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
1) (Int
j forall a. Num a => a -> a -> a
+ Int
1) MVector s Word8
v
| (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0xC0 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0xDF) -> do
if | Int
i forall a. Num a => a -> a -> a
+ Int
1 forall a. Ord a => a -> a -> Bool
< ByteString -> Int
S.length ByteString
x Bool -> Bool -> Bool
&&
(Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
c1
Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
2) (Int
j forall a. Num a => a -> a -> a
+ Int
2) MVector s Word8
v
| Bool
otherwise -> do
Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v
| (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0xE0 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0xEF) -> do
if | Int
i forall a. Num a => a -> a -> a
+ Int
2 forall a. Ord a => a -> a -> Bool
< ByteString -> Int
S.length ByteString
x Bool -> Bool -> Bool
&&
(Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
(Word8
c2 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c2 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
c1
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
2) Word8
c2
Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
3) (Int
j forall a. Num a => a -> a -> a
+ Int
3) MVector s Word8
v
| (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
2) Int
j MVector s Word8
v
| Bool
otherwise ->
Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v
| (Word8
c0 forall a. Ord a => a -> a -> Bool
>= Word8
0xF0 Bool -> Bool -> Bool
&& Word8
c0 forall a. Ord a => a -> a -> Bool
<= Word8
0xF7) -> do
if | Int
i forall a. Num a => a -> a -> a
+ Int
3 forall a. Ord a => a -> a -> Bool
< ByteString -> Int
S.length ByteString
x Bool -> Bool -> Bool
&&
(Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
(Word8
c2 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c2 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
(Word8
c3 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c3 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
c0
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
c1
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
2) Word8
c2
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
3) Word8
c3
Int -> Int -> MVector s Word8 -> ST s ByteString
go (Int
i forall a. Num a => a -> a -> a
+ Int
4) (Int
j forall a. Num a => a -> a -> a
+ Int
4) MVector s Word8
v
| (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) Bool -> Bool -> Bool
&&
(Word8
c2 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c2 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
3) Int
j MVector s Word8
v
| (Word8
c1 forall a. Ord a => a -> a -> Bool
>= Word8
0x80 Bool -> Bool -> Bool
&& Word8
c1 forall a. Ord a => a -> a -> Bool
<= Word8
0xBF) -> do
Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
2) Int
j MVector s Word8
v
| Bool
otherwise ->
Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v
| Bool
otherwise -> do
Int -> Int -> MVector s Word8 -> ST s ByteString
rep (Int
i forall a. Num a => a -> a -> a
+ Int
1) Int
j MVector s Word8
v
rep :: Int -> Int -> MVector s Word8 -> ST s ByteString
rep Int
i Int
j MVector s Word8
v = do
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
0) Word8
0xEF
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
1) Word8
0xBF
forall (m :: * -> *) a.
(PrimMonad m, Storable a) =>
MVector (PrimState m) a -> Int -> a -> m ()
U.write MVector s Word8
v (Int
j forall a. Num a => a -> a -> a
+ Int
2) Word8
0xBD
Int -> Int -> MVector s Word8 -> ST s ByteString
go Int
i (Int
j forall a. Num a => a -> a -> a
+ Int
3) MVector s Word8
v
dataString :: MVector s Word8 -> Int -> ST s ByteString
dataString :: forall s. MVector s Word8 -> Int -> ST s ByteString
dataString MVector s Word8
v Int
n =
forall (f :: * -> *) a. Applicative f => a -> f a
pure forall a b. (a -> b) -> a -> b
$ forall a. IO a -> a
unsafePerformIO forall a b. (a -> b) -> a -> b
$ do
let (ForeignPtr Word8
f, Int
_) = forall s a. MVector s a -> (ForeignPtr a, Int)
U.unsafeToForeignPtr0 MVector s Word8
v
forall a b. ForeignPtr a -> (Ptr a -> IO b) -> IO b
withForeignPtr ForeignPtr Word8
f forall a b. (a -> b) -> a -> b
$ \Ptr Word8
p ->
CStringLen -> IO ByteString
S.packCStringLen (forall a b. Ptr a -> Ptr b
castPtr Ptr Word8
p, Int
n)