{-# LANGUAGE CPP,DeriveDataTypeable #-}
{- | GB18030 is a chinese character encoding that is mandatory in china (if you
 -   don\'t implement it, you\'re not allowed to sell your software there).
 -}

module Data.Encoding.GB18030
	(GB18030(..))
	where

import Control.Throws
import Data.Char (chr,ord)
import Data.Word
import Data.Bits
import Data.Encoding.Base
import Data.Encoding.ByteSource
import Data.Encoding.ByteSink
import Data.Encoding.Exception
import Data.ByteString (ByteString)
import qualified Data.ByteString as BS
import Data.Typeable

#if __GLASGOW_HASKELL__>=608
import Data.ByteString.Unsafe (unsafeIndex)
#else
import Data.ByteString.Base (unsafeIndex)
#endif

import Data.Encoding.GB18030Data

data GB18030 = GB18030 deriving (Eq,Show,Typeable)

instance Encoding GB18030 where
    decodeChar _ = do
      w1 <- fetchWord8
      case () of
        _
          | w1 <= 0x80 -> return (chr $ fromIntegral w1) -- it's ascii
          | w1 <= 0xFE -> do
                     w2 <- fetchWord8
                     case () of
                       _
                         | w2 < 0x30 -> throwException (IllegalCharacter w2)
                         | w2 <= 0x39 -> do
                               w3 <- fetchWord8
                               case () of
                                 _
                                   | w3 < 0x81 -> throwException (IllegalCharacter w3)
                                   | w3 <= 0xFE -> do
                                          w4 <- fetchWord8
                                          case () of
                                            _
                                              | w4 < 0x30 -> throwException (IllegalCharacter w4)
                                              | w4 <= 0x39 -> decodeGBFour $ linear w1 w2 w3 w4
                                              | otherwise -> throwException (IllegalCharacter w4)
                                   | otherwise -> throwException (IllegalCharacter w3)
                         | w2 <= 0x7E -> return $ decodeGBTwo $ linear2 w1 w2
                         | w2 == 0x7F -> throwException (IllegalCharacter w2)
                         | w2 <= 0xFE -> return $ decodeGBTwo $ linear2 w1 w2
                         | otherwise -> throwException (IllegalCharacter w2)
          | otherwise -> throwException (IllegalCharacter w1)
    {- How this works: The nested if-structures form an binary tree over the
     - encoding range.
     -}
    encodeChar _ ch = if ch<='\x4946'					-- 1
	              then (if ch<='\x4055'					-- 2
		            then (if ch<='\x2E80'				-- 3
			          then (if ch<='\x200F'			-- 4
				        then (if ch<'\x0452'
					      then arr 0x0000 arr1
					      else range range1)
				        else (if ch<'\x2643'
					      then arr 0x2010 arr2
					else range range2))
			else (if ch<='\x3917'			-- 4
			      then (if ch<'\x361B'
				    then arr 0x2E81 arr3
				    else range range3)
			      else (if ch<'\x3CE1'
				    then arr 0x3918 arr4
				    else range range4)))
		else (if ch<='\x464B'				-- 3
		      then (if ch<='\x4336'			-- 4
			    then (if ch<'\x4160'
				  then arr 0x4056 arr5
				  else range range5)
			    else (if ch<'\x44D7'
				  then arr 0x4337 arr6
				  else range range6))
			else (if ch<'\x478E'
			      then arr 0x464C arr7
			      else range range7)))
	else (if ch<='\xF92B'					-- 2
	      then (if ch<='\xD7FF'				-- 3
		    then (if ch<='\x4C76'			-- 4
			  then (if ch<'\x49B8'
				then arr 0x4947 arr8
				else range range8)
			  else (if ch<'\x9FA6'
				then arr 0x4C77 arr9
				else range range9))
			else (if ch<'\xE865'
			      then arr 0xD800 arr10
			      else range range10))
		else (if ch<='\xFFFF'				-- 3
		      then (if ch<='\xFE2F'			-- 4
			    then (if ch<'\xFA2A'
				  then arr 0xF92C arr11
				  else range range11)
			    else (if ch<'\xFFE6'
				  then arr 0xFE30 arr12
				  else range range12))
			else (if ch<='\x10FFFF'			-- 4
			      then range range13
			      else throwException (HasNoRepresentation ch))))
        where
	  range r = let (w1,w2,w3,w4) = delinear (ord ch + r)
	            in pushWord8 w1 >> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
	  arr off a = let ind = (ord ch - off)*5
	                  w1 = unsafeIndex a (ind+1)
	                  w2 = unsafeIndex a (ind+2)
	                  w3 = unsafeIndex a (ind+3)
	                  w4 = unsafeIndex a (ind+4)
		      in do
                        pushWord8 w1
                        case unsafeIndex a ind of
		          1 -> return ()
	                  2 -> pushWord8 w2
	                  3 -> pushWord8 w2 >> pushWord8 w3
	                  4 -> pushWord8 w2 >> pushWord8 w3 >> pushWord8 w4
    encodeable _ c = c <= '\x10FFFF'

linear :: Word8 -> Word8 -> Word8 -> Word8 -> Int
linear w1 w2 w3 w4
	= (fromIntegral (w4-0x30))
	+ (fromIntegral (w3-0x81))*10
	+ (fromIntegral (w2-0x30))*1260
	+ (fromIntegral (w1-0x81))*12600

linear2 :: Word8 -> Word8 -> Int
linear2 w1 w2 = (fromIntegral (w2 - (if w2<=0x7E
	then 0x40
	else 0x41)))
	+ (fromIntegral (w1-0x81))*190

delinear :: Int -> (Word8,Word8,Word8,Word8)
delinear n = let
	(w1,n1) = n  `divMod` 12600
	(w2,n2) = n1 `divMod`  1260
	(w3,n3) = n2 `divMod`    10
	w4      = n3
	in (fromIntegral w1+0x81
	   ,fromIntegral w2+0x30
	   ,fromIntegral w3+0x81
	   ,fromIntegral w4+0x30)

decodeGBTwo :: Int -> Char
decodeGBTwo n = let
	rn = n*2
	w1 = unsafeIndex rrarr rn
	w2 = unsafeIndex rrarr (rn+1)
	in chr $ ((fromIntegral w1) `shiftL` 8) .|. (fromIntegral w2)

decodeGBFour :: ByteSource m => Int -> m Char
decodeGBFour v = if v<=17858				-- 1
	then (if v<=15582					-- 2
		then (if v<=11328				-- 3
			then (if v<=7921			-- 4
				then (if v<820
					then arr 0 rarr1
					else range range1)
				else (if v<9219
					then arr 7922 rarr2
					else range range2))
			else (if v<=13737			-- 4
				then (if v<12973
					then arr 11329 rarr3
					else range range3)
				else (if v<14698
					then arr 13738 rarr4
					else range range4)))
		else (if v<=17101				-- 3
			then (if v<=16317			-- 4
				then (if v<15847
					then arr 15583 rarr5
					else range range5)
				else (if v<16729
					then arr 16318 rarr6
					else range range6))
			else (if v<17418
				then arr 17102 rarr7
				else range range7)))
	else (if v<=37844					-- 2
		then (if v<=33468				-- 3
			then (if v<=18663			-- 4
				then (if v<17961
					then arr 17859 rarr8
					else range range8)
				else (if v<19043
					then arr 18664 rarr9
					else range range9))
			else (if v<33550
				then arr 33469 rarr10
				else range range10))
		else (if v<=39419				-- 3
			then (if v<=39107			-- 4
				then (if v<38078
					then arr 37845 rarr11
					else range range11)
				else (if v<39394
					then arr 39108 rarr12
					else range range12))
			else (if v<=1237575 && v>=189000
				then range range13
				else throwException OutOfRange)))
	where
	arr off a = let
		v' = (v-off)*2
		w1 = unsafeIndex a v'
		w2 = unsafeIndex a (v'+1)
		in return $ chr $ ((fromIntegral w1) `shiftL` 8)
		      .|. (fromIntegral w2)
	range r = return $ chr (v-r)

range1,range2,range3,range4,range5,range6,range7,range8,range9,range10,range11,range12,range13 :: Int
range1 = -286
range2 = -576
range3 = -878
range4 = -887
range5 = -889
range6 = -894
range7 = -900
range8 = -911
range9 = -21827
range10 = -25943
range11 = -25964
range12 = -26116
range13 = 123464