-- ------------------------------------------------------------

{- |
   Module     : Data.String.Unicode
   Copyright  : Copyright (C) 2010- Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : stable
   Portability: portable

   Unicode and UTF-8 Conversion Functions

-}

-- ------------------------------------------------------------

module Data.String.Unicode
    (
     -- * Unicode Type declarations
     Unicode,
     UString,
     UTF8Char,
     UTF8String,
     UStringWithErrors,
     DecodingFct,
     DecodingFctEmbedErrors,

      utf8ToUnicode
    , utf8ToUnicodeEmbedErrors
    , latin1ToUnicode
    , ucs2ToUnicode
    , ucs2BigEndianToUnicode
    , ucs2LittleEndianToUnicode
    , utf16beToUnicode
    , utf16leToUnicode

    , unicodeCharToUtf8

    , unicodeToUtf8
    , unicodeToXmlEntity
    , unicodeToLatin1
    , unicodeRemoveNoneAscii
    , unicodeRemoveNoneLatin1

    , intToCharRef
    , intToCharRefHex
    , intToHexString

    , getDecodingFct
    , getDecodingFctEmbedErrors
    , getOutputEncodingFct

    , normalizeNL
    , guessEncoding

    , getOutputEncodingFct'
    , unicodeCharToUtf8'
    , unicodeCharToXmlEntity'
    , unicodeCharToLatin1'
    )
where

import Data.Char                        ( toUpper )

import Data.Char.Properties.XMLCharProps( isXml1ByteChar
                                        , isXmlLatin1Char
                                        )
import Data.Char.IsoLatinTables

import Data.String.UTF8Decoding        ( decodeUtf8, decodeUtf8EmbedErrors )
import Data.String.EncodingNames

-- ------------------------------------------------------------

-- | Unicode is represented as the Char type
--   Precondition for this is the support of Unicode character range
--   in the compiler (e.g. ghc but not hugs)

type Unicode    = Char

-- | the type for Unicode strings

type UString    = [Unicode]

-- | UTF-8 charachters are represented by the Char type

type UTF8Char   = Char

-- | UTF-8 strings are implemented as Haskell strings

type UTF8String = String

-- | Decoding function with a pair containing the result string and a list of decoding errors as result

type DecodingFct = String -> (UString, [String])

type UStringWithErrors = [Either String Char]

-- | Decoding function where decoding errors are interleaved with decoded characters

type DecodingFctEmbedErrors = String -> UStringWithErrors

-- ------------------------------------------------------------

-- |
-- conversion from Unicode strings (UString) to UTF8 encoded strings.

unicodeToUtf8           :: UString -> UTF8String
unicodeToUtf8           = concatMap unicodeCharToUtf8

-- |
-- conversion from Unicode (Char) to a UTF8 encoded string.

unicodeCharToUtf8       :: Unicode -> UTF8String
unicodeCharToUtf8 c
    | i >= 0          && i <= 0x0000007F        -- 1 byte UTF8 (7 bits)
        = [ toEnum i ]
    | i >= 0x00000080 && i <= 0x000007FF        -- 2 byte UTF8 (5 + 6 bits)
        = [ toEnum (0xC0 + i `div` 0x40)
          , toEnum (0x80 + i                  `mod` 0x40)
          ]
    | i >= 0x00000800 && i <= 0x0000FFFF        -- 3 byte UTF8 (4 + 6 + 6 bits)
        = [ toEnum (0xE0 +  i `div`   0x1000)
          , toEnum (0x80 + (i `div`     0x40) `mod` 0x40)
          , toEnum (0x80 +  i                 `mod` 0x40)
          ]
    | i >= 0x00010000 && i <= 0x001FFFFF        -- 4 byte UTF8 (3 + 6 + 6 + 6 bits)
        = [ toEnum (0xF0 +  i `div`    0x40000)
          , toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)
          , toEnum (0x80 + (i `div`       0x40) `mod` 0x40)
          , toEnum (0x80 +  i                   `mod` 0x40)
          ]
    | i >= 0x00200000 && i <= 0x03FFFFFF        -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits)
        = [ toEnum (0xF8 +  i `div`  0x1000000)
          , toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)
          , toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)
          , toEnum (0x80 + (i `div`       0x40) `mod` 0x40)
          , toEnum (0x80 +  i                   `mod` 0x40)
          ]
    | i >= 0x04000000 && i <= 0x7FFFFFFF        -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits)
        = [ toEnum (0xFC +  i `div` 0x40000000)
          , toEnum (0x80 + (i `div`  0x1000000) `mod` 0x40)
          , toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)
          , toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)
          , toEnum (0x80 + (i `div`       0x40) `mod` 0x40)
          , toEnum (0x80 +  i                   `mod` 0x40)
          ]
    | otherwise                                 -- other values not supported
        = error ("unicodeCharToUtf8: illegal integer argument " ++ show i)
    where
    i = fromEnum c

-- ------------------------------------------------------------

-- |
-- code conversion from latin1 to Unicode

latin1ToUnicode :: String -> UString
latin1ToUnicode = id

latinToUnicode  :: [(Char, Char)] -> String -> UString
latinToUnicode tt
    = map charToUni
    where
    charToUni c =
       foldr (\(src,dst) r ->
          case compare c src of
             EQ -> dst
             LT -> c {- not found in table -}
             GT -> r) c tt

-- | conversion from ASCII to unicode with check for legal ASCII char set
--
-- Structure of decoding function copied from 'Data.Char.UTF8.decode'.

decodeAscii     :: DecodingFct
decodeAscii
    = swap . partitionEither . decodeAsciiEmbedErrors

decodeAsciiEmbedErrors  :: String -> UStringWithErrors
decodeAsciiEmbedErrors str
    = map (\(c,pos) -> if isValid c
                         then Right c
                         else Left (toErrStr c pos)) posStr
    where
    posStr = zip str [(0::Int)..]
    toErrStr errChr pos
        = " at input position " ++ show pos ++ ": none ASCII char " ++ show errChr
    isValid x = x < '\x80'

-- |
-- UCS-2 big endian to Unicode conversion

ucs2BigEndianToUnicode  :: String -> UString

ucs2BigEndianToUnicode (b : l : r)
    = toEnum (fromEnum b * 256 + fromEnum l) : ucs2BigEndianToUnicode r

ucs2BigEndianToUnicode []
    = []

ucs2BigEndianToUnicode _
    = []                                -- error "illegal UCS-2 byte input sequence with odd length"
                                        -- is ignored (garbage in, garbage out)

-- ------------------------------------------------------------

-- |
-- UCS-2 little endian to Unicode conversion

ucs2LittleEndianToUnicode       :: String -> UString

ucs2LittleEndianToUnicode (l : b : r)
    = toEnum (fromEnum b * 256 + fromEnum l) : ucs2LittleEndianToUnicode r

ucs2LittleEndianToUnicode []
    = []

ucs2LittleEndianToUnicode [_]
    = []                                -- error "illegal UCS-2 byte input sequence with odd length"
                                        -- is ignored

-- ------------------------------------------------------------

-- |
-- UCS-2 to UTF-8 conversion with byte order mark analysis

ucs2ToUnicode           :: String -> UString

ucs2ToUnicode ('\xFE':'\xFF':s)         -- 2 byte mark for big endian encoding
    = ucs2BigEndianToUnicode s

ucs2ToUnicode ('\xFF':'\xFE':s)         -- 2 byte mark for little endian encoding
    = ucs2LittleEndianToUnicode s

ucs2ToUnicode s
    = ucs2BigEndianToUnicode s          -- default: big endian

-- ------------------------------------------------------------

-- |
-- UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1

utf8ToUnicode           :: DecodingFct

utf8ToUnicode ('\xEF':'\xBB':'\xBF':s)  -- remove byte order mark ( XML standard F.1 )
    = decodeUtf8 s

utf8ToUnicode s
    = decodeUtf8 s

utf8ToUnicodeEmbedErrors        :: DecodingFctEmbedErrors

utf8ToUnicodeEmbedErrors ('\xEF':'\xBB':'\xBF':s)       -- remove byte order mark ( XML standard F.1 )
    = decodeUtf8EmbedErrors s

utf8ToUnicodeEmbedErrors s
    = decodeUtf8EmbedErrors s

-- ------------------------------------------------------------

-- |
-- UTF-16 big endian to UTF-8 conversion with removal of byte order mark

utf16beToUnicode                :: String -> UString

utf16beToUnicode ('\xFE':'\xFF':s)              -- remove byte order mark
    = ucs2BigEndianToUnicode s

utf16beToUnicode s
    = ucs2BigEndianToUnicode s

-- ------------------------------------------------------------

-- |
-- UTF-16 little endian to UTF-8 conversion with removal of byte order mark

utf16leToUnicode                :: String -> UString

utf16leToUnicode ('\xFF':'\xFE':s)              -- remove byte order mark
    = ucs2LittleEndianToUnicode s

utf16leToUnicode s
    = ucs2LittleEndianToUnicode s


-- ------------------------------------------------------------

-- |
-- substitute all Unicode characters, that are not legal 1-byte
-- UTF-8 XML characters by a character reference.
--
-- This function can be used to translate all text nodes and
-- attribute values into pure ascii.
--
-- see also : 'unicodeToLatin1'

unicodeToXmlEntity      :: UString -> String
unicodeToXmlEntity
    = escape isXml1ByteChar (intToCharRef . fromEnum)

-- |
-- substitute all Unicode characters, that are not legal latin1
-- UTF-8 XML characters by a character reference.
--
-- This function can be used to translate all text nodes and
-- attribute values into ISO latin1.
--
-- see also : 'unicodeToXmlEntity'

unicodeToLatin1 :: UString -> String
unicodeToLatin1
    = escape isXmlLatin1Char (intToCharRef . fromEnum)


-- |
-- substitute selected characters
-- The @check@ function returns 'True' whenever a character needs to substitution
-- The function @esc@ computes a substitute.

escape :: (Unicode -> Bool) -> (Unicode -> String) -> UString -> String
escape check esc =
    concatMap (\uc -> if check uc then [uc] else esc uc)

-- |
-- removes all non ascii chars, may be used to transform
-- a document into a pure ascii representation by removing
-- all non ascii chars from tag and attibute names
--
-- see also : 'unicodeRemoveNoneLatin1', 'unicodeToXmlEntity'

unicodeRemoveNoneAscii  :: UString -> String
unicodeRemoveNoneAscii
    = filter isXml1ByteChar

-- |
-- removes all non latin1 chars, may be used to transform
-- a document into a pure ascii representation by removing
-- all non ascii chars from tag and attibute names
--
-- see also : 'unicodeRemoveNoneAscii', 'unicodeToLatin1'

unicodeRemoveNoneLatin1 :: UString -> String
unicodeRemoveNoneLatin1
    = filter isXmlLatin1Char

-- ------------------------------------------------------------

-- |
-- convert an Unicode into a XML character reference.
--
-- see also : 'intToCharRefHex'

intToCharRef            :: Int -> String
intToCharRef i
    = "&#" ++ show i ++ ";"

-- |
-- convert an Unicode into a XML hexadecimal character reference.
--
-- see also: 'intToCharRef'

intToCharRefHex         :: Int -> String
intToCharRefHex i
    = "&#x" ++ h2 ++ ";"
      where
      h1 = intToHexString i
      h2 = if length h1 `mod` 2 == 1
           then '0': h1
           else h1

-- ------------------------------------------------------------

intToHexString          :: Int -> String
intToHexString i
    | i == 0
        = "0"
    | i > 0
        = intToStr i
    | otherwise
        = error ("intToHexString: negative argument " ++ show i)
    where
    intToStr 0  = ""
    intToStr i' = intToStr (i' `div` 16) ++ [fourBitsToChar (i' `mod` 16)]

fourBitsToChar          :: Int -> Char
fourBitsToChar i        = "0123456789ABCDEF" !! i
{-# INLINE fourBitsToChar #-}

-- ------------------------------------------------------------
--
-- | White Space (XML Standard 2.3) and
-- end of line handling (2.11)
--
-- \#x0D and \#x0D\#x0A are mapped to \#x0A

normalizeNL     :: String -> String
normalizeNL ('\r' : '\n' : rest)        = '\n' : normalizeNL rest
normalizeNL ('\r' : rest)               = '\n' : normalizeNL rest
normalizeNL (c : rest)                  = c    : normalizeNL rest
normalizeNL []                          = []


-- ------------------------------------------------------------

-- |
-- the table of supported character encoding schemes and the associated
-- conversion functions into Unicode:q

{-
This table could be derived from decodingTableEither,
but this way it is certainly more efficient.
-}

decodingTable   :: [(String, DecodingFct)]
decodingTable
    = [ (utf8,          utf8ToUnicode                           )
      , (isoLatin1,     liftDecFct latin1ToUnicode              )
      , (usAscii,       decodeAscii                             )
      , (ucs2,          liftDecFct ucs2ToUnicode                )
      , (utf16,         liftDecFct ucs2ToUnicode                )
      , (utf16be,       liftDecFct utf16beToUnicode             )
      , (utf16le,       liftDecFct utf16leToUnicode             )
      , (iso8859_2,     liftDecFct (latinToUnicode iso_8859_2)  )
      , (iso8859_3,     liftDecFct (latinToUnicode iso_8859_3)  )
      , (iso8859_4,     liftDecFct (latinToUnicode iso_8859_4)  )
      , (iso8859_5,     liftDecFct (latinToUnicode iso_8859_5)  )
      , (iso8859_6,     liftDecFct (latinToUnicode iso_8859_6)  )
      , (iso8859_7,     liftDecFct (latinToUnicode iso_8859_7)  )
      , (iso8859_8,     liftDecFct (latinToUnicode iso_8859_8)  )
      , (iso8859_9,     liftDecFct (latinToUnicode iso_8859_9)  )
      , (iso8859_10,    liftDecFct (latinToUnicode iso_8859_10) )
      , (iso8859_11,    liftDecFct (latinToUnicode iso_8859_11) )
      , (iso8859_13,    liftDecFct (latinToUnicode iso_8859_13) )
      , (iso8859_14,    liftDecFct (latinToUnicode iso_8859_14) )
      , (iso8859_15,    liftDecFct (latinToUnicode iso_8859_15) )
      , (iso8859_16,    liftDecFct (latinToUnicode iso_8859_16) )
      , (unicodeString, liftDecFct id                           )
      , ("",            liftDecFct id                           )       -- default
      ]
    where
    liftDecFct df = \ s -> (df s, [])

-- |
-- the lookup function for selecting the decoding function

getDecodingFct          :: String -> Maybe DecodingFct
getDecodingFct enc
    = lookup (map toUpper enc) decodingTable


-- |
-- Similar to 'decodingTable' but it embeds errors
-- in the string of decoded characters.

decodingTableEmbedErrors        :: [(String, DecodingFctEmbedErrors)]
decodingTableEmbedErrors
    = [ (utf8,          utf8ToUnicodeEmbedErrors                )
      , (isoLatin1,     liftDecFct latin1ToUnicode              )
      , (usAscii,       decodeAsciiEmbedErrors                  )
      , (ucs2,          liftDecFct ucs2ToUnicode                )
      , (utf16,         liftDecFct ucs2ToUnicode                )
      , (utf16be,       liftDecFct utf16beToUnicode             )
      , (utf16le,       liftDecFct utf16leToUnicode             )
      , (iso8859_2,     liftDecFct (latinToUnicode iso_8859_2)  )
      , (iso8859_3,     liftDecFct (latinToUnicode iso_8859_3)  )
      , (iso8859_4,     liftDecFct (latinToUnicode iso_8859_4)  )
      , (iso8859_5,     liftDecFct (latinToUnicode iso_8859_5)  )
      , (iso8859_6,     liftDecFct (latinToUnicode iso_8859_6)  )
      , (iso8859_7,     liftDecFct (latinToUnicode iso_8859_7)  )
      , (iso8859_8,     liftDecFct (latinToUnicode iso_8859_8)  )
      , (iso8859_9,     liftDecFct (latinToUnicode iso_8859_9)  )
      , (iso8859_10,    liftDecFct (latinToUnicode iso_8859_10) )
      , (iso8859_11,    liftDecFct (latinToUnicode iso_8859_11) )
      , (iso8859_13,    liftDecFct (latinToUnicode iso_8859_13) )
      , (iso8859_14,    liftDecFct (latinToUnicode iso_8859_14) )
      , (iso8859_15,    liftDecFct (latinToUnicode iso_8859_15) )
      , (iso8859_16,    liftDecFct (latinToUnicode iso_8859_16) )
      , (unicodeString, liftDecFct id                           )
      , ("",            liftDecFct id                           )       -- default
      ]
    where
    liftDecFct df = map Right . df

-- |
-- the lookup function for selecting the decoding function

getDecodingFctEmbedErrors       :: String -> Maybe DecodingFctEmbedErrors
getDecodingFctEmbedErrors enc
    = lookup (map toUpper enc) decodingTableEmbedErrors



-- |
-- the table of supported output encoding schemes and the associated
-- conversion functions from Unicode

outputEncodingTable     :: [(String, (UString -> String))]
outputEncodingTable
    = [ (utf8,          unicodeToUtf8           )
      , (isoLatin1,     unicodeToLatin1         )
      , (usAscii,       unicodeToXmlEntity      )
      , (unicodeString, id                      )
      , ("",            unicodeToUtf8           )       -- default
      ]

-- |
-- the lookup function for selecting the encoding function

getOutputEncodingFct            :: String -> Maybe (String -> UString)
getOutputEncodingFct enc
    = lookup (map toUpper enc) outputEncodingTable

-- ------------------------------------------------------------
--

guessEncoding           :: String -> String

guessEncoding ('\xFF':'\xFE':'\x00':'\x00':_)   = "UCS-4LE"             -- with byte order mark
guessEncoding ('\xFF':'\xFE':_)                 = "UTF-16LE"            -- with byte order mark

guessEncoding ('\xFE':'\xFF':'\x00':'\x00':_)   = "UCS-4-3421"          -- with byte order mark
guessEncoding ('\xFE':'\xFF':_)                 = "UTF-16BE"            -- with byte order mark

guessEncoding ('\xEF':'\xBB':'\xBF':_)          = utf8                  -- with byte order mark

guessEncoding ('\x00':'\x00':'\xFE':'\xFF':_)   = "UCS-4BE"             -- with byte order mark
guessEncoding ('\x00':'\x00':'\xFF':'\xFE':_)   = "UCS-4-2143"          -- with byte order mark

guessEncoding ('\x00':'\x00':'\x00':'\x3C':_)   = "UCS-4BE"             -- "<" of "<?xml"
guessEncoding ('\x3C':'\x00':'\x00':'\x00':_)   = "UCS-4LE"             -- "<" of "<?xml"
guessEncoding ('\x00':'\x00':'\x3C':'\x00':_)   = "UCS-4-2143"          -- "<" of "<?xml"
guessEncoding ('\x00':'\x3C':'\x00':'\x00':_)   = "UCS-4-3412"          -- "<" of "<?xml"

guessEncoding ('\x00':'\x3C':'\x00':'\x3F':_)   = "UTF-16BE"            -- "<?" of "<?xml"
guessEncoding ('\x3C':'\x00':'\x3F':'\x00':_)   = "UTF-16LE"            -- "<?" of "<?xml"

guessEncoding ('\x4C':'\x6F':'\xA7':'\x94':_)   = "EBCDIC"              -- "<?xm" of "<?xml"

guessEncoding _                                 = ""                    -- no guess

-- ------------------------------------------------------------

swap :: (a,b) -> (b,a)
swap (x,y) = (y,x)
{-# INLINE swap #-}

partitionEither :: [Either a b] -> ([a], [b])
partitionEither =
   foldr (\x ~(ls,rs) -> either (\l -> (l:ls,rs)) (\r -> (ls,r:rs)) x) ([],[])
{-# INLINE partitionEither #-}

-- ------------------------------------------------------------

-- output encoding for bytestrings

-- |
-- the table of supported output encoding schemes and the associated
-- conversion functions from Unicode

type StringFct		= String -> String

outputEncodingTable'     :: [(String, (Char -> StringFct))]
outputEncodingTable'
    = [ (utf8,          unicodeCharToUtf8'           )
      , (isoLatin1,     unicodeCharToLatin1'         )
      , (usAscii,       unicodeCharToXmlEntity'      )
      , ("",            unicodeCharToUtf8'           )       -- default
      ]

-- |
-- the lookup function for selecting the encoding function

getOutputEncodingFct'            :: String -> Maybe (Char -> StringFct)
getOutputEncodingFct' enc
    = lookup (map toUpper enc) outputEncodingTable'


-- ------------------------------------------------------------

-- |
-- conversion from Unicode (Char) to a UTF8 encoded string.

unicodeCharToUtf8'      :: Char -> StringFct
unicodeCharToUtf8' c
    | i >= 0          && i <= 0x0000007F        -- 1 byte UTF8 (7 bits)
        = (c :)

    | i >= 0x00000080 && i <= 0x000007FF        -- 2 byte UTF8 (5 + 6 bits)
        = ((toEnum (0xC0 + i `div` 0x40)                   ) :) .
          ((toEnum (0x80 + i                    `mod` 0x40)) :)

    | i >= 0x00000800 && i <= 0x0000FFFF        -- 3 byte UTF8 (4 + 6 + 6 bits)
        = ((toEnum (0xE0 +  i `div`     0x1000)            ) :) .
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)

    | i >= 0x00010000 && i <= 0x001FFFFF        -- 4 byte UTF8 (3 + 6 + 6 + 6 bits) -- extension to encode 21 bit values
        = ((toEnum (0xF0 +  i `div`    0x40000)            ) :) .
          ((toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)) :) .
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)

    | i >= 0x00200000 && i <= 0x03FFFFFF        -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits) -- extension to encode 26 bit values
        = ((toEnum (0xF8 +  i `div`  0x1000000)            ) :) .
          ((toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)) :) .
          ((toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)) :) .
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)

    | i >= 0x04000000 && i <= 0x7FFFFFFF        -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits) -- extension to encode 31 bit values
        = ((toEnum (0xFC +  i `div` 0x40000000)            ) :) .
          ((toEnum (0x80 + (i `div`  0x1000000) `mod` 0x40)) :) .
          ((toEnum (0x80 + (i `div`    0x40000) `mod` 0x40)) :) .
          ((toEnum (0x80 + (i `div`     0x1000) `mod` 0x40)) :) .
          ((toEnum (0x80 + (i `div`       0x40) `mod` 0x40)) :) .
          ((toEnum (0x80 +  i                   `mod` 0x40)) :)

    | otherwise                                 -- other values not supported
        = error ("unicodeCharToUtf8: illegal integer argument " ++ show i)
    where
    i = fromEnum c

-- ------------------------------------------------------------

-- |
-- substitute all Unicode characters, that are not legal 1-byte
-- UTF-8 XML characters by a character reference.

unicodeCharToXmlEntity'	:: Char -> StringFct
unicodeCharToXmlEntity' c
    | isXml1ByteChar c	= (c :)
    | otherwise		= ((intToCharRef . fromEnum $ c) ++)

-- ------------------------------------------------------------

-- |
-- substitute all Unicode characters, that are not legal latin1
-- UTF-8 XML characters by a character reference.

unicodeCharToLatin1' 	:: Char -> StringFct
unicodeCharToLatin1' c
    | isXmlLatin1Char c	= (c :)
    | otherwise		= ((intToCharRef . fromEnum $ c) ++)

-- ------------------------------------------------------------