{-# LANGUAGE CPP,DeriveDataTypeable #-}
{- | GB18030 is a chinese character encoding that is mandatory in china (if you
 -   don\'t implement it, you\'re not allowed to sell your software there).
 -}

module Data.Encoding.GB18030
	(GB18030(..))
	where

import Control.Exception
import Data.Char (chr,ord)
import Data.Word
import Data.Bits
import Data.Encoding.Base
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Typeable

#if __GLASGOW_HASKELL__>=608
import Data.ByteString.Unsafe (unsafeIndex)
#else
import Data.ByteString.Base (unsafeIndex)
#endif

import Data.Encoding.GB18030Data

data GB18030 = GB18030 deriving (Eq,Show,Typeable)

instance Encoding GB18030 where
	encode _ = encodeMultibyte encodeGB
	encodeLazy _ = encodeMultibyteLazy encodeGB
	decode _ = decodeMultibyte decodeGB
	decodeLazy _ = decodeMultibyteLazy decodeGB
	encodable _ ch = ch <= '\x10FFFF'
	decodable _ = checkValidity

data DecodingState
	= Valid
	| Invalid
	| Second
	| Third
	| Fourth
	deriving Eq

checkValidity :: ByteString -> Bool
checkValidity bs = BS.foldl' (\st w -> case st of
	Invalid			-> Invalid
	Valid	| w<=0x80	-> Valid
		| w<=0xFE	-> Second
		| otherwise	-> Invalid
	Second	| w< 0x30	-> Invalid
		| w<=0x39	-> Third
		| w<=0x7E	-> Valid
		| w==0x7F	-> Invalid
		| w<=0xFE	-> Valid
		| otherwise	-> Invalid
	Third	| w< 0x81	-> Invalid
		| w<=0xFE	-> Fourth
		| otherwise	-> Invalid
	Fourth	| w< 0x30	-> Invalid
		| w<=0x39	-> Valid
		| otherwise	-> Invalid
	) Valid bs == Valid

{- How this works: The nested if-structures form an binary tree over the
 - encoding range.
 -}
encodeGB :: Char -> (Word8,EncodeState)
encodeGB ch = if ch<='\x4946'					-- 1
	then (if ch<='\x4055'					-- 2
		then (if ch<='\x2E80'				-- 3
			then (if ch<='\x200F'			-- 4
				then (if ch<'\x0452'
					then arr 0x0000 arr1
					else range range1)
				else (if ch<'\x2643'
					then arr 0x2010 arr2
					else range range2))
			else (if ch<='\x3917'			-- 4
				then (if ch<'\x361B'
					then arr 0x2E81 arr3
					else range range3)
				else (if ch<'\x3CE1'
					then arr 0x3918 arr4
					else range range4)))
		else (if ch<='\x464B'				-- 3
			then (if ch<='\x4336'			-- 4
				then (if ch<'\x4160'
					then arr 0x4056 arr5
					else range range5)
				else (if ch<'\x44D7'
					then arr 0x4337 arr6
					else range range6))
			else (if ch<'\x478E'
				then arr 0x464C arr7
				else range range7)))
	else (if ch<='\xF92B'					-- 2
		then (if ch<='\xD7FF'				-- 3
			then (if ch<='\x4C76'			-- 4
				then (if ch<'\x49B8'
					then arr 0x4947 arr8
					else range range8)
				else (if ch<'\x9FA6'
					then arr 0x4C77 arr9
					else range range9))
			else (if ch<'\xE865'
				then arr 0xD800 arr10
				else range range10))
		else (if ch<='\xFFFF'				-- 3
			then (if ch<='\xFE2F'			-- 4
				then (if ch<'\xFA2A'
					then arr 0xF92C arr11
					else range range11)
				else (if ch<'\xFFE6'
					then arr 0xFE30 arr12
					else range range12))
			else (if ch<='\x10FFFF'			-- 4
				then range range13
				else throwDyn (HasNoRepresentation ch))))
	where
	range r = let
		(w1,w2,w3,w4) = delinear (ord ch + r)
		in (w1,Put3 w2 w3 w4)
	arr off a = let
		ind = (ord ch - off)*5
		w1 = unsafeIndex a (ind+1)
		w2 = unsafeIndex a (ind+2)
		w3 = unsafeIndex a (ind+3)
		w4 = unsafeIndex a (ind+4)
		in (w1,case unsafeIndex a ind of
			1 -> Done
			2 -> Put1 w2
			3 -> Put2 w2 w3
			4 -> Put3 w2 w3 w4)

linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4
	= (fromIntegral (w4-0x30))
	+ (fromIntegral (w3-0x81))*10
	+ (fromIntegral (w2-0x30))*1260
	+ (fromIntegral (w1-0x81))*12600

linear2 :: Word8 -> Word8 -> Int
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
	then 0x40
	else 0x41)))
	+ (fromIntegral (w1-0x81))*190

delinear :: Int -> (Word8,Word8,Word8,Word8)
delinear n = let
	(w1,n1) = n  `divMod` 12600
	(w2,n2) = n1 `divMod`  1260
	(w3,n3) = n2 `divMod`    10
	w4      = n3
	in (fromIntegral w1+0x81
	   ,fromIntegral w2+0x30
	   ,fromIntegral w3+0x81
	   ,fromIntegral w4+0x30)

decodeGB :: [Word8] -> (Char,[Word8])
decodeGB (w1:rst)
	| w1 <=0x80	= (chr $ fromIntegral w1,rst)	-- it's ascii
	| w1 <=0xFE	= case rst of
		w2:rst2
			| w2 < 0x30	-> throwDyn (IllegalCharacter w2)
			| w2 <=0x39	-> case rst2 of
				w3:rst3
					| w3 < 0x81	-> throwDyn (IllegalCharacter w3)
					| w3 <=0xFE	-> case rst3 of
						w4:rst4
							| w4 < 0x30	-> throwDyn (IllegalCharacter w4)
							| w4 <=0x39	-> let
								v = linear w1 w2 w3 w4
								in (decodeGBFour v,rst4)
							| otherwise	-> throwDyn (IllegalCharacter w4)
						[] -> throwDyn UnexpectedEnd
					| otherwise	-> throwDyn (IllegalCharacter w3)
				[] -> throwDyn UnexpectedEnd
			| w2 <=0x7E	-> (decodeGBTwo (linear2 w1 w2),rst2)
			| w2 ==0x7F	-> throwDyn (IllegalCharacter w2)
			| w2 <=0xFE	-> (decodeGBTwo (linear2 w1 w2),rst2)
			| otherwise	-> throwDyn (IllegalCharacter w2)
		[] -> throwDyn UnexpectedEnd
	| otherwise	= throwDyn (IllegalCharacter w1)

decodeGBTwo :: Int -> Char
decodeGBTwo n = let
	rn = n*2
	w1 = unsafeIndex rrarr rn
	w2 = unsafeIndex rrarr (rn+1)
	in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)

decodeGBFour :: Int -> Char
decodeGBFour v = if v<=17858				-- 1
	then (if v<=15582					-- 2
		then (if v<=11328				-- 3
			then (if v<=7921			-- 4
				then (if v<820
					then arr 0 rarr1
					else range range1)
				else (if v<9219
					then arr 7922 rarr2
					else range range2))
			else (if v<=13737			-- 4
				then (if v<12973
					then arr 11329 rarr3
					else range range3)
				else (if v<14698
					then arr 13738 rarr4
					else range range4)))
		else (if v<=17101				-- 3
			then (if v<=16317			-- 4
				then (if v<15847
					then arr 15583 rarr5
					else range range5)
				else (if v<16729
					then arr 16318 rarr6
					else range range6))
			else (if v<17418
				then arr 17102 rarr7
				else range range7)))
	else (if v<=37844					-- 2
		then (if v<=33468				-- 3
			then (if v<=18663			-- 4
				then (if v<17961
					then arr 17859 rarr8
					else range range8)
				else (if v<19043
					then arr 18664 rarr9
					else range range9))
			else (if v<33550
				then arr 33469 rarr10
				else range range10))
		else (if v<=39419				-- 3
			then (if v<=39107			-- 4
				then (if v<38078
					then arr 37845 rarr11
					else range range11)
				else (if v<39394
					then arr 39108 rarr12
					else range range12))
			else (if v<=1237575 && v>=189000
				then range range13
				else throwDyn OutOfRange)))
	where
	arr off a = let
		v' = (v-off)*2
		w1 = unsafeIndex a v'
		w2 = unsafeIndex a (v'+1)
		in chr $ ((fromIntegral w1) `shiftL` 8)
		      .|. (fromIntegral w2)
	range r = chr (v-r)

range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
range1 = -286
range2 = -576
range3 = -878
range4 = -887
range5 = -889
range6 = -894
range7 = -900
range8 = -911
range9 = -21827
range10 = -25943
range11 = -25964
range12 = -26116
range13 = 123464