module Data.Encoding.GB18030
(GB18030(..))
where
import Control.Exception
import Data.Char (chr,ord)
import Data.Word
import Data.Bits
import Data.Encoding.Base
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Typeable
#if __GLASGOW_HASKELL__>=608
import Data.ByteString.Unsafe (unsafeIndex)
#else
import Data.ByteString.Base (unsafeIndex)
#endif
import Data.Encoding.GB18030Data
data GB18030 = GB18030 deriving (Eq,Show,Typeable)
instance Encoding GB18030 where
encode _ = encodeMultibyte encodeGB
encodeLazy _ = encodeMultibyteLazy encodeGB
decode _ = decodeMultibyte decodeGB
decodeLazy _ = decodeMultibyteLazy decodeGB
encodable _ ch = ch <= '\x10FFFF'
decodable _ = checkValidity
data DecodingState
= Valid
| Invalid
| Second
| Third
| Fourth
deriving Eq
checkValidity :: ByteString -> Bool
checkValidity bs = BS.foldl' (\st w -> case st of
Invalid -> Invalid
Valid | w<=0x80 -> Valid
| w<=0xFE -> Second
| otherwise -> Invalid
Second | w< 0x30 -> Invalid
| w<=0x39 -> Third
| w<=0x7E -> Valid
| w==0x7F -> Invalid
| w<=0xFE -> Valid
| otherwise -> Invalid
Third | w< 0x81 -> Invalid
| w<=0xFE -> Fourth
| otherwise -> Invalid
Fourth | w< 0x30 -> Invalid
| w<=0x39 -> Valid
| otherwise -> Invalid
) Valid bs == Valid
encodeGB :: Char -> (Word8,EncodeState)
encodeGB ch = if ch<='\x4946'
then (if ch<='\x4055'
then (if ch<='\x2E80'
then (if ch<='\x200F'
then (if ch<'\x0452'
then arr 0x0000 arr1
else range range1)
else (if ch<'\x2643'
then arr 0x2010 arr2
else range range2))
else (if ch<='\x3917'
then (if ch<'\x361B'
then arr 0x2E81 arr3
else range range3)
else (if ch<'\x3CE1'
then arr 0x3918 arr4
else range range4)))
else (if ch<='\x464B'
then (if ch<='\x4336'
then (if ch<'\x4160'
then arr 0x4056 arr5
else range range5)
else (if ch<'\x44D7'
then arr 0x4337 arr6
else range range6))
else (if ch<'\x478E'
then arr 0x464C arr7
else range range7)))
else (if ch<='\xF92B'
then (if ch<='\xD7FF'
then (if ch<='\x4C76'
then (if ch<'\x49B8'
then arr 0x4947 arr8
else range range8)
else (if ch<'\x9FA6'
then arr 0x4C77 arr9
else range range9))
else (if ch<'\xE865'
then arr 0xD800 arr10
else range range10))
else (if ch<='\xFFFF'
then (if ch<='\xFE2F'
then (if ch<'\xFA2A'
then arr 0xF92C arr11
else range range11)
else (if ch<'\xFFE6'
then arr 0xFE30 arr12
else range range12))
else (if ch<='\x10FFFF'
then range range13
else throwDyn (HasNoRepresentation ch))))
where
range r = let
(w1,w2,w3,w4) = delinear (ord ch + r)
in (w1,Put3 w2 w3 w4)
arr off a = let
ind = (ord ch off)*5
w1 = unsafeIndex a (ind+1)
w2 = unsafeIndex a (ind+2)
w3 = unsafeIndex a (ind+3)
w4 = unsafeIndex a (ind+4)
in (w1,case unsafeIndex a ind of
1 -> Done
2 -> Put1 w2
3 -> Put2 w2 w3
4 -> Put3 w2 w3 w4)
linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4
= (fromIntegral (w40x30))
+ (fromIntegral (w30x81))*10
+ (fromIntegral (w20x30))*1260
+ (fromIntegral (w10x81))*12600
linear2 :: Word8 -> Word8 -> Int
linear2 w1 w2 = (fromIntegral (w2 (if w2<=0x7E
then 0x40
else 0x41)))
+ (fromIntegral (w10x81))*190
delinear :: Int -> (Word8,Word8,Word8,Word8)
delinear n = let
(w1,n1) = n `divMod` 12600
(w2,n2) = n1 `divMod` 1260
(w3,n3) = n2 `divMod` 10
w4 = n3
in (fromIntegral w1+0x81
,fromIntegral w2+0x30
,fromIntegral w3+0x81
,fromIntegral w4+0x30)
decodeGB :: [Word8] -> (Char,[Word8])
decodeGB (w1:rst)
| w1 <=0x80 = (chr $ fromIntegral w1,rst)
| w1 <=0xFE = case rst of
w2:rst2
| w2 < 0x30 -> throwDyn (IllegalCharacter w2)
| w2 <=0x39 -> case rst2 of
w3:rst3
| w3 < 0x81 -> throwDyn (IllegalCharacter w3)
| w3 <=0xFE -> case rst3 of
w4:rst4
| w4 < 0x30 -> throwDyn (IllegalCharacter w4)
| w4 <=0x39 -> let
v = linear w1 w2 w3 w4
in (decodeGBFour v,rst4)
| otherwise -> throwDyn (IllegalCharacter w4)
[] -> throwDyn UnexpectedEnd
| otherwise -> throwDyn (IllegalCharacter w3)
[] -> throwDyn UnexpectedEnd
| w2 <=0x7E -> (decodeGBTwo (linear2 w1 w2),rst2)
| w2 ==0x7F -> throwDyn (IllegalCharacter w2)
| w2 <=0xFE -> (decodeGBTwo (linear2 w1 w2),rst2)
| otherwise -> throwDyn (IllegalCharacter w2)
[] -> throwDyn UnexpectedEnd
| otherwise = throwDyn (IllegalCharacter w1)
decodeGBTwo :: Int -> Char
decodeGBTwo n = let
rn = n*2
w1 = unsafeIndex rrarr rn
w2 = unsafeIndex rrarr (rn+1)
in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)
decodeGBFour :: Int -> Char
decodeGBFour v = if v<=17858
then (if v<=15582
then (if v<=11328
then (if v<=7921
then (if v<820
then arr 0 rarr1
else range range1)
else (if v<9219
then arr 7922 rarr2
else range range2))
else (if v<=13737
then (if v<12973
then arr 11329 rarr3
else range range3)
else (if v<14698
then arr 13738 rarr4
else range range4)))
else (if v<=17101
then (if v<=16317
then (if v<15847
then arr 15583 rarr5
else range range5)
else (if v<16729
then arr 16318 rarr6
else range range6))
else (if v<17418
then arr 17102 rarr7
else range range7)))
else (if v<=37844
then (if v<=33468
then (if v<=18663
then (if v<17961
then arr 17859 rarr8
else range range8)
else (if v<19043
then arr 18664 rarr9
else range range9))
else (if v<33550
then arr 33469 rarr10
else range range10))
else (if v<=39419
then (if v<=39107
then (if v<38078
then arr 37845 rarr11
else range range11)
else (if v<39394
then arr 39108 rarr12
else range range12))
else (if v<=1237575 && v>=189000
then range range13
else throwDyn OutOfRange)))
where
arr off a = let
v' = (voff)*2
w1 = unsafeIndex a v'
w2 = unsafeIndex a (v'+1)
in chr $ ((fromIntegral w1) `shiftL` 8)
.|. (fromIntegral w2)
range r = chr (vr)
range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
range1 = 286
range2 = 576
range3 = 878
range4 = 887
range5 = 889
range6 = 894
range7 = 900
range8 = 911
range9 = 21827
range10 = 25943
range11 = 25964
range12 = 26116
range13 = 123464