-- --------------------------------------------------------------------------
--  $Revision: 974 $ $Date: 2010-06-20 12:44:41 +0200 (Sun, 20 Jun 2010) $
-- --------------------------------------------------------------------------

-- |
--
-- Module      :  Encode
-- Copyright   :  Otakar Smrz 2005-2010
-- License     :  BSD-style
--
-- Maintainer  :  otakar.smrz mff.cuni.cz
-- Stability   :  provisional
-- Portability :  portable
--
-- The Haskell analogue of the /Encode/ module in Perl:
-- <http://search.cpan.org/dist/Encode/>
--
-- "Encode.Arabic" "Encode.Mapper" "Encode.Unicode"


module Encode (

        -- * Classes

        Encoding,

        -- * Types

        UPoint, CSpace,

        -- * Methods

        encode, decode,

        -- * Version

        version

    ) where


import Data.Word

import Version

-- | Version of this module, built as @1.3.n@. The last component @n@ is the
--   number obtained by applying @revised@ to the revision keyword string,
--   capped at 4 by 'min'. The lazy pattern below expects @revised@ to yield
--   a @Version@ holding exactly one number and an empty second list.

version =
    let Version [build] [] = revised "$Revision: 974 $"
    in  Version [1, 3, min build 4] []


-- | The datatype introduced for the internal representation of Unicode code
--   points is currently defined as @newtype 'UPoint' = UPoint CSpace@. The
--   shift to code points @UPoint@ from characters @Char@ is intentional, as
--   Unicode support in Haskell is not yet fully implemented, and code points
--   are, anyway, different entities. Since the 'UPoint' type is an instance
--   of the @Enum@ class, the type's constructor and destructor functions are
--   available as 'toEnum' and 'fromEnum', respectively. The data constructor
--   itself is not part of this module's export list.
--
--   The 'UPoint' datatype should be the transfer point on the way from one
--   encoding into another. It should not be the terminal stop, though. The
--   'encode' method should be used systematically, and not @show@, even if
--   @show@ might temporarily produce somewhat appealing results.

newtype UPoint = UPoint CSpace

    -- Derived equality and ordering compare the wrapped 'CSpace' values.
    deriving (Eq, Ord)


-- | The 'CSpace' type denotes the code space, and it is a synonym for 'Word'.
--   It is the type wrapped by the 'UPoint' newtype.

type CSpace = Word


-- | Encodings are represented as distinct datatypes of the 'Encoding' class,
--   which defines two essential methods:
--
-- ['encode'] turning a list of 'internal code points' into a @String@, and
--
-- ['decode'] converting the lists in the opposite direction.
--
--   Developing a new encoding means writing a new module with a structure
--   similar to this:
--
-- @
--    module /MyEncModule/ (/MyEncType/ (..)) where
-- /  /
--    import "Encode"
-- /  /
--    data /MyEncType/ = /MyEncName | MyEncAlias deriving (Enum, Show)/
-- /  /
--    instance 'Encoding' /MyEncType/ where
-- /  /
--        'encode' /enc data/ = /show data/         /-- your choices .../
-- /  /
--        'decode' /enc data/ = /map (toEnum . fromEnum) data/
-- @
--
--   "Encode.Unicode.UTF8" is one concrete implementation that realizes
--   and illustrates this template. "Encode.Arabic.Buckwalter" implements
--   symmetric recoding using finite maps, and "Encode.Arabic.ArabTeX"
--   makes use of monadic parsing and the "PureFP" library.

class Encoding e where

    -- | Turn a list of internal code points into a @String@ in the
    --   encoding denoted by the first argument.
    encode :: e -> [UPoint] -> [Char]

    -- | Turn a @String@ in the encoding denoted by the first argument into
    --   a list of internal code points.
    decode :: e -> [Char] -> [UPoint]

    -- Default implementations: the encoding value is ignored, and every
    -- element is carried over one-to-one through its 'Int' value, using
    -- 'fromEnum' on the source type and 'toEnum' on the target type.
    encode _ points = [ toEnum (fromEnum point) | point <- points ]
    decode _ chars  = [ toEnum (fromEnum char) | char <- chars ]


-- | Rendering of a single code point: the value 38 is shown as the string
--   @&amp;@, values from 32 through 255 are shown as the corresponding
--   'Char', and all other values are shown as a string of the form @&#N;@
--   with @N@ being the shown numeric value. Every case is delegated to the
--   standard 'Show' instances of 'String' and 'Char', so the output carries
--   the quotes and escapes those instances produce.

instance Show UPoint where

    showsPrec prec (UPoint code) = case code of
        38               -> literal "&amp;"
        _ | showsAsChar  -> showsPrec prec (toEnum (fromIntegral code) :: Char)
          | otherwise    -> literal ("&#" ++ showsPrec prec code ";")

      where

        -- Strings are handed over to the 'Show' instance of 'String'.
        literal :: String -> ShowS
        literal = showsPrec prec

        -- The value 38 is matched before this test is ever consulted.
        showsAsChar = 32 <= code && code <= 255


-- | Conversion between 'UPoint' and 'Int' wraps or unwraps the 'CSpace'
--   value and converts it with 'fromIntegral'. No range check is made in
--   either direction.

instance Enum UPoint where

    toEnum = UPoint . fromIntegral

    fromEnum (UPoint code) = fromIntegral code