module Encoding.UTF8.Light (
UTF8(..)
, lenUTF8
, lenUTF16
, countUTF8
, decodeUTF8
, encodeUTF8
, encodeUTF8'
, withUTF8
, putUTF8
, putUTF8Ln
, hPutUTF8
, hPutUTF8Ln
, readUTF8File
, writeUTF8File
, appendUTF8File
, hGetUTF8Line
, hGetUTF8Contents
, hGetUTF8
, hGetUTF8NonBlocking
, flipUTF8
, unflipUTF8
, flipTab
, unflipTab
, showHex
, toBits
, fromBits
) where
import Data.Bits
import Data.Word(Word8,Word16,Word32)
import Data.List(foldl')
import Data.Monoid(Monoid(..))
import Data.ByteString(ByteString)
import qualified Data.ByteString as B
import Data.ByteString.Internal(c2w,w2c)
import Data.ByteString.Unsafe
import Data.Char(chr,ord)
import System.IO(Handle)
fi :: (Num b, Integral a) => a -> b
fi = fromIntegral
class UTF8 a where
encode :: a -> ByteString
decode :: ByteString -> a
instance UTF8 ByteString where
encode = id
decode = id
instance UTF8 [Int] where
encode = encodeUTF8
decode = decodeUTF8
instance UTF8 String where
encode = encode . fmap ord
decode = fmap chr . decode
withUTF8 :: (UTF8 a) => a -> (ByteString -> b) -> b
withUTF8 a k = k (encode a)
putUTF8 :: (UTF8 a) => a -> IO ()
putUTF8 = flip withUTF8 B.putStr
putUTF8Ln :: (UTF8 a) => a -> IO ()
putUTF8Ln = flip withUTF8 B.putStrLn
hPutUTF8 :: (UTF8 a) => Handle -> a -> IO ()
hPutUTF8 h = flip withUTF8 (B.hPut h)
hPutUTF8Ln :: (UTF8 a) => Handle -> a -> IO ()
hPutUTF8Ln h = flip withUTF8 (B.hPutStrLn h)
readUTF8File :: (UTF8 a) => FilePath -> IO a
readUTF8File = (return . decode =<<) . B.readFile
writeUTF8File :: (UTF8 a) => FilePath -> a -> IO ()
writeUTF8File p = B.writeFile p . encode
appendUTF8File :: (UTF8 a) => FilePath -> a -> IO ()
appendUTF8File p = B.appendFile p . encode
hGetUTF8Line :: (UTF8 a) => Handle -> IO a
hGetUTF8Line = (return . decode =<<) . B.hGetLine
hGetUTF8Contents :: (UTF8 a) => Handle -> IO a
hGetUTF8Contents = (return . decode =<<) . B.hGetContents
hGetUTF8 :: (UTF8 a) => Handle -> Int -> IO a
hGetUTF8 h = (return . decode =<<) . B.hGet h
hGetUTF8NonBlocking :: (UTF8 a) => Handle -> Int -> IO a
hGetUTF8NonBlocking h = (return . decode =<<) . B.hGetNonBlocking h
lenUTF8 :: Word8 -> Int
lenUTF8 w8
| w8`shiftR`7==0x00 = 1
| w8`shiftR`5==0x06 = 2
| w8`shiftR`4==0x0e = 3
| w8`shiftR`3==0x1e = 4
| otherwise = 0
lenUTF16 :: Word16 -> Int
lenUTF16 w16
| w16`shiftR`10==0x36 = 2
| w16`shiftR`10==0x37 = 0
| otherwise = 1
countUTF8 :: ByteString -> [Int]
countUTF8 s = go 0 (B.length s) s
where go :: Int -> Int -> ByteString -> [Int]
go i len s
| len <= i = []
| otherwise = let n = lenUTF8 (unsafeIndex s i)
i' = i+n
in if n==0
then []
else i' `seq` n : go i' len s
encodeUTF8 :: [Int] -> ByteString
encodeUTF8 = B.pack . concat . encodeUTF8'
encodeUTF8' :: [Int] -> [[Word8]]
encodeUTF8' [] = []
encodeUTF8' (x:xs)
| x.&.0xffffff80==0 =
[fi x] : encodeUTF8' xs
| x.&.0xfffff800==0 =
[ fi(x`shiftR`6.|.0xc0)
, fi(x.&.0x3f.|.0x80)
] : encodeUTF8' xs
| x.&.0xffff0000==0 =
[ fi(x`shiftR`12.|.0xe0)
, fi(x`shiftR`6.&.0x3f.|.0x80)
, fi(x.&.0x3f.|.0x80)
] : encodeUTF8' xs
| x.&.0xffe00000==0 =
[ fi(x`shiftR`18.|.0xf0)
, fi(x`shiftR`12.&.0x3f.|.0x80)
, fi(x`shiftR`6.&.0x3f.|.0x80)
, fi(x.&.0x3f.|.0x80)
] : encodeUTF8' xs
| otherwise = [] : encodeUTF8' xs
decodeUTF8 :: ByteString -> [Int]
decodeUTF8 s = go 0 (B.length s) s
where go :: Int -> Int -> ByteString -> [Int]
go i len s
| len <= i = []
| otherwise =
let c1 = unsafeIndex s i
in case lenUTF8 c1 of
0 -> []
1 -> let i' = i+1
in i'`seq`
fi c1 : go i' len s
2 -> if len <= i+1
then []
else
let c2 = unsafeIndex s (i+1)
i' = i+2
in i'`seq`
fi(c1.&.0x1f)`shiftL`6
`xor`fi(c2.&.0x3f)
: go i' len s
3 -> if len <= i+2
then []
else
let c2 = unsafeIndex s (i+1)
c3 = unsafeIndex s (i+2)
i' = i+3
in i'`seq`
fi(c1.&.0x1f)`shiftL`12
`xor`fi(c2.&.0x3f)`shiftL`6
`xor`fi(c3.&.0x3f)
: go i' len s
4 -> if len <= i+3
then []
else
let c2 = unsafeIndex s (i+1)
c3 = unsafeIndex s (i+2)
c4 = unsafeIndex s (i+3)
i' = i+4
in i'`seq`
fi(c1.&.0x1f)`shiftL`18
`xor`fi(c2.&.0x3f)`shiftL`12
`xor`fi(c3.&.0x3f)`shiftL`6
`xor`fi(c4.&.0x3f)
: go i' len s
toBits :: Word8 -> [Word8]
toBits w8 = fmap ((.&.0x01) . (w8`shiftR`)) [7,6,5,4,3,2,1,0]
fromBits :: [Word8] -> Word8
fromBits = foldl' (\a (n,b) -> a.|.b`shiftL`n) 0
. reverse . zip [0..7] . reverse
hexTab :: ByteString
hexTab = B.pack . fmap c2w $
"0123456789abcdef"
showHex :: Int -> String
showHex i = ("0x"++)
. flip fmap [28,24,20,16,12,8,4,0] $ \n ->
w2c (unsafeIndex hexTab (i`shiftR`n.&.0xf))
flipUTF8 :: (UTF8 a) => a -> a
flipUTF8 = decode . flipString flipTab . encode
unflipUTF8 :: (UTF8 a) => a -> a
unflipUTF8 = decode . flipString unflipTab . encode
flipString :: [(Int,Int)] -> ByteString -> ByteString
flipString tab = encode
. reverse
. fmap (maybe ' ' chr
. flip lookup tab)
. decode
unflipTab :: [(Int,Int)]
unflipTab = fmap (uncurry(flip(,))) flipTab
flipTab :: [(Int,Int)]
flipTab = fmap (\(a,b)->(ord a,b))
[('a', 0x250)
,('b', ord 'q')
,('c', 0x254)
,('d', ord 'p')
,('e', 0x1dd)
,('f', 0x25f)
,('g', 0x183)
,('h', 0x265)
,('i', 0x131)
,('j', 0x27e)
,('k', 0x29e)
,('l', ord 'l')
,('m', 0x26f)
,('n', ord 'u')
,('o', ord 'o')
,('p', ord 'b')
,('q', ord 'd')
,('r', 0x279)
,('s', ord 's')
,('t', 0x287)
,('u', ord 'n')
,('v', 0x28c)
,('w', 0x28d)
,('x', ord 'x')
,('y', 0x28e)
,('z', ord 'z')
,('.', 0x2d9)
,('[', ord ']')
,(']', ord '[')
,('{', ord '}')
,('}', ord '{')
,('<', ord '>')
,('>', ord '<')
,('?', 0xbf)
,('!', 0xa1)
,('\'', ord ',')
,('_', 0x203e)
,(';', 0x061b)
]