-----------------------------------------------------------------------------
-- Copyright 2014, Open Universiteit Nederland. This file is distributed
-- under the terms of the GNU General Public License. For more information,
-- see the file "LICENSE.txt", which is included in the distribution.
-----------------------------------------------------------------------------
-- |
-- Maintainer  :  bastiaan.heeren@ou.nl
-- Stability   :  provisional
-- Portability :  portable (depends on ghc)
--
-- Support for Unicode
--
-----------------------------------------------------------------------------
--  $Id: Unicode.hs 6535 2014-05-14 11:05:06Z bastiaan $

module Ideas.Text.XML.Unicode
   ( isExtender, isLetter, isDigit, isCombiningChar
   , decoding
   ) where

import Control.Arrow
import Data.Char (chr, ord)
import Data.List
import qualified Ideas.Text.UTF8 as UTF8

-- optimize for ascii characters
isLetter, isExtender, isDigit, isCombiningChar :: Char -> Bool
isLetter        c = inMap (ord c) letterMap
isExtender      c = inMap (ord c) extenderMap
isDigit         c = inMap (ord c) digitMap
isCombiningChar c = inMap (ord c) combiningCharMap

inMap :: Int -> [(Int, Int)] -> Bool
inMap n = rec
 where
   rec [] = False
   rec ((x, y):zs)
      | n < x     = False
      | n <= y    = True
      | otherwise = rec zs

f :: Char -> (Char, Char)
f c = (c, c)

intpairs :: [(Char, Char)] -> [(Int, Int)]
intpairs = map (ord *** ord)

letterMap :: [(Int, Int)]
letterMap = baseCharMap `merge` ideographicMap -- `merge` controlMap `merge` extraMap

merge :: [(Int, Int)] -> [(Int, Int)] -> [(Int, Int)]
merge (x:xs) (y:ys)
   | fst x <= fst y = x:merge xs (y:ys)
   | otherwise      = y:merge (x:xs) ys
merge xs ys = xs++ys

{-
extraMap :: [(Char, Char)]
extraMap = map f "\161\170\184\185"

controlMap :: [(Char, Char)]
controlMap = [ ('\x7F', '\x84'), ('\x86', '\x9F'), ('\xFDD0', '\xFDDF'),
   ('\x1FFFE', '\x1FFFF'), ('\x2FFFE', '\x2FFFF'), ('\x3FFFE', '\x3FFFF'),
   ('\x4FFFE', '\x4FFFF'), ('\x5FFFE', '\x5FFFF'), ('\x6FFFE', '\x6FFFF'),
   ('\x7FFFE', '\x7FFFF'), ('\x8FFFE', '\x8FFFF'), ('\x9FFFE', '\x9FFFF'),
   ('\xAFFFE', '\xAFFFF'), ('\xBFFFE', '\xBFFFF'), ('\xCFFFE', '\xCFFFF'),
   ('\xDFFFE', '\xDFFFF'), ('\xEFFFE', '\xEFFFF'), ('\xFFFFE', '\xFFFFF'),
   ('\x10FFFE', '\x10FFFF')] -}

baseCharMap :: [(Int, Int)]
baseCharMap = intpairs
   [ ('\x0041','\x005A'), ('\x0061','\x007A'), ('\x00C0','\x00D6'),
   ('\x00D8','\x00F6'), ('\x00F8','\x00FF'), ('\x0100','\x0131'),
   ('\x0134','\x013E'), ('\x0141','\x0148'), ('\x014A','\x017E'),
   ('\x0180','\x01C3'), ('\x01CD','\x01F0'), ('\x01F4','\x01F5'),
   ('\x01FA','\x0217'), ('\x0250','\x02A8'), ('\x02BB','\x02C1'), f '\x0386' ,
   ('\x0388','\x038A'), f '\x038C' , ('\x038E','\x03A1'), ('\x03A3','\x03CE'),
   ('\x03D0','\x03D6'), f '\x03DA' , f '\x03DC' , f '\x03DE' , f '\x03E0' ,
   ('\x03E2','\x03F3'), ('\x0401','\x040C'), ('\x040E','\x044F'),
   ('\x0451','\x045C'), ('\x045E','\x0481'), ('\x0490','\x04C4'),
   ('\x04C7','\x04C8'), ('\x04CB','\x04CC'), ('\x04D0','\x04EB'),
   ('\x04EE','\x04F5'), ('\x04F8','\x04F9'), ('\x0531','\x0556'), f '\x0559' ,
   ('\x0561','\x0586'), ('\x05D0','\x05EA'), ('\x05F0','\x05F2'),
   ('\x0621','\x063A'), ('\x0641','\x064A'), ('\x0671','\x06B7'),
   ('\x06BA','\x06BE'), ('\x06C0','\x06CE'), ('\x06D0','\x06D3'), f '\x06D5' ,
   ('\x06E5','\x06E6'), ('\x0905','\x0939'), f '\x093D' , ('\x0958','\x0961'),
   ('\x0985','\x098C'), ('\x098F','\x0990'), ('\x0993','\x09A8'),
   ('\x09AA','\x09B0'), f '\x09B2' , ('\x09B6','\x09B9'), ('\x09DC','\x09DD'),
   ('\x09DF','\x09E1'), ('\x09F0','\x09F1'), ('\x0A05','\x0A0A'),
   ('\x0A0F','\x0A10'), ('\x0A13','\x0A28'), ('\x0A2A','\x0A30'),
   ('\x0A32','\x0A33'), ('\x0A35','\x0A36'), ('\x0A38','\x0A39'),
   ('\x0A59','\x0A5C'), f '\x0A5E' , ('\x0A72','\x0A74'), ('\x0A85','\x0A8B'),
   f '\x0A8D' , ('\x0A8F','\x0A91'), ('\x0A93','\x0AA8'), ('\x0AAA','\x0AB0'),
   ('\x0AB2','\x0AB3'), ('\x0AB5','\x0AB9'), f '\x0ABD' , f '\x0AE0' ,
   ('\x0B05','\x0B0C'), ('\x0B0F','\x0B10'), ('\x0B13','\x0B28'),
   ('\x0B2A','\x0B30'), ('\x0B32','\x0B33'), ('\x0B36','\x0B39'), f '\x0B3D' ,
   ('\x0B5C','\x0B5D'), ('\x0B5F','\x0B61'), ('\x0B85','\x0B8A'),
   ('\x0B8E','\x0B90'), ('\x0B92','\x0B95'), ('\x0B99','\x0B9A'), f '\x0B9C' ,
   ('\x0B9E','\x0B9F'), ('\x0BA3','\x0BA4'), ('\x0BA8','\x0BAA'),
   ('\x0BAE','\x0BB5'), ('\x0BB7','\x0BB9'), ('\x0C05','\x0C0C'),
   ('\x0C0E','\x0C10'), ('\x0C12','\x0C28'), ('\x0C2A','\x0C33'),
   ('\x0C35','\x0C39'), ('\x0C60','\x0C61'), ('\x0C85','\x0C8C'),
   ('\x0C8E','\x0C90'), ('\x0C92','\x0CA8'), ('\x0CAA','\x0CB3'),
   ('\x0CB5','\x0CB9'), f '\x0CDE' , ('\x0CE0','\x0CE1'), ('\x0D05','\x0D0C'),
   ('\x0D0E','\x0D10'), ('\x0D12','\x0D28'), ('\x0D2A','\x0D39'),
   ('\x0D60','\x0D61'), ('\x0E01','\x0E2E'), f '\x0E30' , ('\x0E32','\x0E33'),
   ('\x0E40','\x0E45'), ('\x0E81','\x0E82'), f '\x0E84' , ('\x0E87','\x0E88'),
   f '\x0E8A' , f '\x0E8D' , ('\x0E94','\x0E97'), ('\x0E99','\x0E9F'),
   ('\x0EA1','\x0EA3'), f '\x0EA5' , f '\x0EA7' , ('\x0EAA','\x0EAB'),
   ('\x0EAD','\x0EAE'), f '\x0EB0' , ('\x0EB2','\x0EB3'), f '\x0EBD' ,
   ('\x0EC0','\x0EC4'), ('\x0F40','\x0F47'), ('\x0F49','\x0F69'),
   ('\x10A0','\x10C5'), ('\x10D0','\x10F6'), f '\x1100' , ('\x1102','\x1103'),
   ('\x1105','\x1107'), f '\x1109' , ('\x110B','\x110C'), ('\x110E','\x1112'),
   f '\x113C' , f '\x113E' , f '\x1140' , f '\x114C' , f '\x114E' , f '\x1150' , ('\x1154','\x1155') ,
   f '\x1159' , ('\x115F','\x1161'), f '\x1163' , f '\x1165' , f '\x1167' , f '\x1169' ,
   ('\x116D','\x116E'), ('\x1172','\x1173'), f '\x1175' , f '\x119E' , f '\x11A8' ,
   f '\x11AB' , ('\x11AE','\x11AF'), ('\x11B7','\x11B8'), f '\x11BA' ,
   ('\x11BC','\x11C2'), f '\x11EB' , f '\x11F0' , f '\x11F9' , ('\x1E00','\x1E9B'),
   ('\x1EA0','\x1EF9'), ('\x1F00','\x1F15'), ('\x1F18','\x1F1D'),
   ('\x1F20','\x1F45'), ('\x1F48','\x1F4D'), ('\x1F50','\x1F57'), f '\x1F59' ,
   f '\x1F5B' , f '\x1F5D' , ('\x1F5F','\x1F7D'), ('\x1F80','\x1FB4'),
   ('\x1FB6','\x1FBC'), f '\x1FBE' , ('\x1FC2','\x1FC4'), ('\x1FC6','\x1FCC'),
   ('\x1FD0','\x1FD3'), ('\x1FD6','\x1FDB'), ('\x1FE0','\x1FEC'),
   ('\x1FF2','\x1FF4'), ('\x1FF6','\x1FFC'), f '\x2126' , ('\x212A','\x212B'),
   f '\x212E' , ('\x2180','\x2182'), ('\x3041','\x3094'), ('\x30A1','\x30FA'),
   ('\x3105','\x312C'), ('\xAC00','\xD7A3') ]

ideographicMap :: [(Int, Int)]
ideographicMap = intpairs
   [ ('\x4E00','\x9FA5'),
   f '\x3007' , ('\x3021','\x3029') ]

combiningCharMap :: [(Int, Int)]
combiningCharMap = intpairs
   [('\x0300','\x0345'), ('\x0360','\x0361'), ('\x0483','\x0486'), ('\x0591','\x05A1'),
   ('\x05A3','\x05B9'), ('\x05BB','\x05BD'),  f '\x05BF' , ('\x05C1','\x05C2'),
   f '\x05C4' , ('\x064B','\x0652'), f '\x0670' , ('\x06D6','\x06DC'),
   ('\x06DD','\x06DF'), ('\x06E0','\x06E4'), ('\x06E7','\x06E8'),
   ('\x06EA','\x06ED'), ('\x0901','\x0903'), f '\x093C' , ('\x093E','\x094C'),
   f '\x094D' , ('\x0951','\x0954'), ('\x0962','\x0963'), ('\x0981','\x0983'),
   f '\x09BC' , f '\x09BE' , f '\x09BF' , ('\x09C0','\x09C4'), ('\x09C7','\x09C8'),
   ('\x09CB','\x09CD'), f '\x09D7' , ('\x09E2','\x09E3'), f '\x0A02' , f '\x0A3C' ,
   f '\x0A3E' , f '\x0A3F' , ('\x0A40','\x0A42'), ('\x0A47','\x0A48'),
   ('\x0A4B','\x0A4D'), ('\x0A70','\x0A71'), ('\x0A81','\x0A83'), f '\x0ABC' ,
   ('\x0ABE','\x0AC5'), ('\x0AC7','\x0AC9'), ('\x0ACB','\x0ACD'),
   ('\x0B01','\x0B03'), f '\x0B3C' , ('\x0B3E','\x0B43'), ('\x0B47','\x0B48'),
   ('\x0B4B','\x0B4D'), ('\x0B56','\x0B57'), ('\x0B82','\x0B83'),
   ('\x0BBE','\x0BC2'), ('\x0BC6','\x0BC8'), ('\x0BCA','\x0BCD'), f '\x0BD7' ,
   ('\x0C01','\x0C03'), ('\x0C3E','\x0C44'), ('\x0C46','\x0C48'),
   ('\x0C4A','\x0C4D'), ('\x0C55','\x0C56'), ('\x0C82','\x0C83'),
   ('\x0CBE','\x0CC4'), ('\x0CC6','\x0CC8'), ('\x0CCA','\x0CCD'),
   ('\x0CD5','\x0CD6'), ('\x0D02','\x0D03'), ('\x0D3E','\x0D43'),
   ('\x0D46','\x0D48'), ('\x0D4A','\x0D4D'), f '\x0D57' , f '\x0E31' ,
   ('\x0E34','\x0E3A'), ('\x0E47','\x0E4E'), f '\x0EB1' , ('\x0EB4','\x0EB9'),
   ('\x0EBB','\x0EBC'), ('\x0EC8','\x0ECD'), ('\x0F18','\x0F19'), f '\x0F35' ,
   f '\x0F37' , f '\x0F39' , f '\x0F3E' , f '\x0F3F' , ('\x0F71','\x0F84'),
   ('\x0F86','\x0F8B'), ('\x0F90','\x0F95'), f '\x0F97' , ('\x0F99','\x0FAD'),
   ('\x0FB1','\x0FB7'), f '\x0FB9' , ('\x20D0','\x20DC'), f '\x20E1' ,
   ('\x302A','\x302F'), f '\x3099' , f '\x309A' ]

digitMap :: [(Int, Int)]
digitMap = intpairs [ ('\x0030','\x0039'),
   ('\x0660','\x0669'), ('\x06F0','\x06F9'), ('\x0966','\x096F'),
   ('\x09E6','\x09EF'), ('\x0A66','\x0A6F'), ('\x0AE6','\x0AEF'),
   ('\x0B66','\x0B6F'), ('\x0BE7','\x0BEF'), ('\x0C66','\x0C6F'),
   ('\x0CE6','\x0CEF'), ('\x0D66','\x0D6F'), ('\x0E50','\x0E59'),
   ('\x0ED0','\x0ED9'), ('\x0F20','\x0F29')]

extenderMap ::  [(Int, Int)]
extenderMap = intpairs [f '\x00B7' , f '\x02D0' ,
   f '\x02D1' , f '\x0387' , f '\x0640' , f '\x0E46' , f '\x0EC6' , f '\x3005' , ('\x3031','\x3035')
   , ('\x309D','\x309E'), ('\x30FC','\x30FE') ]

decoding :: Monad m => String -> m String
decoding xs
   | "\255\254" `isPrefixOf` xs =
        return (decode16 $ drop 2 xs)
   | "\254\255" `isPrefixOf` xs =
        return (decode16X $ drop 2 xs)
   | "\239\187\191" `isPrefixOf` xs =
        UTF8.decodeM (drop 3 xs)
   | otherwise =
        UTF8.decodeM xs

decode16 :: String -> String
decode16 []  = []
decode16 [x] = [x]
decode16 (a:b:rest) = chr (ord b * 256 + ord a) : decode16 rest

decode16X :: String -> String
decode16X []  = []
decode16X [x] = [x]
decode16X (a:b:rest) = chr (ord b + ord a * 256) : decode16X rest