-- © 2003 Peter Thiemann
{-|
  Implements UTF-8 encoding

  UCS-4 range (hex.)           UTF-8 octet sequence (binary)
  0000 0000-0000 007F   0xxxxxxx
  0000 0080-0000 07FF   110xxxxx 10xxxxxx
  0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
  0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
-}
module WASH.Utility.RFC2279 (encode, decode) where

import Char


-- |maps Unicode string to list of octets
encode :: String -> String

-- |maps list of octets in UTF-8 encoding to Unicode string
decode :: String -> String

factors = iterate (* 0x40) 1
f1 = factors !! 1
f2 = factors !! 2
f3 = factors !! 3
f4 = factors !! 4
f5 = factors !! 5

encode [] = []
encode (x:xs) = 
  let r0 = ord x in
  if r0 < 0x80 then
    x : encode xs
  else if r0 < 0x800 then
    let c1 = 0xC0 + r0 `div` f1
	c2 = 0x80 + r0 `mod` f1
    in  chr c1 : chr c2 : encode xs
  else if r0 < 0x10000 then
    let c1 = 0xE0 + r0 `div` f2
	r1 = r0 `mod` f2
	c2 = 0x80 + r1 `div` f1
	c3 = 0x80 + r1 `mod` f1
    in  chr c1 : chr c2 : chr c3 : encode xs
  else if r0 < 0x200000 then
    let c1 = 0xF0 + r0 `div` f3
	r1 = r0 `mod` f3
	c2 = 0x80 + r1 `div` f2
	r2 = r1 `mod` f2
	c3 = 0x80 + r2 `div` f1
	c4 = 0x80 + r2 `mod` f1
    in  chr c1 : chr c2 : chr c3 : chr c4 : encode xs
  else if r0 < 0x4000000 then
    let c1 = 0xF8 + r0 `div` f4
	r1 = r0 `mod` f4
	c2 = 0x80 + r1 `div` f3
	r2 = r1 `mod` f3
	c3 = 0x80 + r2 `div` f2
	r3 = r2 `mod` f2
	c4 = 0x80 + r3 `div` f1
	c5 = 0x80 + r3 `mod` f1
    in  chr c1 : chr c2 : chr c3 : chr c4 : chr c5 : encode xs
  else 
    let c1 = 0xFC + r0 `div` f5
	r1 = r0 `mod` f5
	c2 = 0x80 + r1 `div` f4
	r2 = r1 `mod` f4
	c3 = 0x80 + r2 `div` f3
	r3 = r2 `mod` f3
	c4 = 0x80 + r3 `div` f2
	r4 = r3 `mod` f2
	c5 = 0x80 + r4 `div` f1
	c6 = 0x80 + r4 `mod` f1
    in  chr c1 : chr c2 : chr c3 : chr c4 : chr c5 : chr c6 : encode xs


decode [] = []
decode (x : xs) =
  let ordx = ord x in 
  if ordx < 0x80 then
    x : decode xs
  else if ordx < 0xC0 then
    error "UTF-8 decoding out of sync"
  else if ordx < 0xE0 then
    decoden 1 (ordx - 0xC0) xs
  else if ordx < 0xF0 then
    decoden 2 (ordx - 0xE0) xs
  else if ordx < 0xF8 then
    decoden 3 (ordx - 0xF0) xs
  else if ordx < 0xFC then
    decoden 4 (ordx - 0xF8) xs
  else if ordx < 0xFE then
    decoden 5 (ordx - 0xFC) xs
  else
    error "UTF-8 decoding found illegal start octet"
      
decoden :: Int -> Int -> String -> String
decoden 0 v xs =
  chr v : decode xs
decoden n v (x : xs) =
  let ordx = ord x
      v' = f1 * v + ordx - 0x80
  in 
  if ordx >= 0x80 && ordx < 0xC0 then
    decoden (n-1) v' xs
  else 
    error "UTF-8 decoding found illegal continuation octet"