{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
#endif
--
-- |
-- Module      :  Data.String.UTF8
-- Copyright   :  (c) Iavor S. Diatchki 2009
-- License     :  BSD3-style (see LICENSE)
--
-- Maintainer  :  emertens@galois.com
-- Stability   :  experimental
-- Portability :  portable
--
{-# LANGUAGE MultiParamTypeClasses, UndecidableInstances #-}
module Data.String.UTF8
  ( -- * Representation
    UTF8
  , UTF8Bytes()
  , fromString
  , toString
  , fromRep
  , toRep
  , G.replacement_char

  -- * Character based operations
  , uncons
  , splitAt
  , take
  , drop
  , span
  , break
  , foldl
  , foldr
  , length
  , lines
  , lines'

  -- * Representation based operations
  , null
  , decode
  , byteSplitAt
  , byteTake
  , byteDrop
  ) where

import Prelude hiding (null,take,drop,span,break
                      ,foldl,foldr,length,lines,splitAt)
import qualified Codec.Binary.UTF8.Generic as G
import Codec.Binary.UTF8.Generic (UTF8Bytes)
import qualified Data.String as S

-- | The type of strings that are represented using the UTF8 encoding.
-- The parameter is the type of the container for the representation.
newtype UTF8 string = Str string deriving (UTF8 string -> UTF8 string -> Bool
(UTF8 string -> UTF8 string -> Bool)
-> (UTF8 string -> UTF8 string -> Bool) -> Eq (UTF8 string)
forall string. Eq string => UTF8 string -> UTF8 string -> Bool
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
/= :: UTF8 string -> UTF8 string -> Bool
$c/= :: forall string. Eq string => UTF8 string -> UTF8 string -> Bool
== :: UTF8 string -> UTF8 string -> Bool
$c== :: forall string. Eq string => UTF8 string -> UTF8 string -> Bool
Eq,Eq (UTF8 string)
Eq (UTF8 string)
-> (UTF8 string -> UTF8 string -> Ordering)
-> (UTF8 string -> UTF8 string -> Bool)
-> (UTF8 string -> UTF8 string -> Bool)
-> (UTF8 string -> UTF8 string -> Bool)
-> (UTF8 string -> UTF8 string -> Bool)
-> (UTF8 string -> UTF8 string -> UTF8 string)
-> (UTF8 string -> UTF8 string -> UTF8 string)
-> Ord (UTF8 string)
UTF8 string -> UTF8 string -> Bool
UTF8 string -> UTF8 string -> Ordering
UTF8 string -> UTF8 string -> UTF8 string
forall a.
Eq a
-> (a -> a -> Ordering)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> Bool)
-> (a -> a -> a)
-> (a -> a -> a)
-> Ord a
forall string. Ord string => Eq (UTF8 string)
forall string. Ord string => UTF8 string -> UTF8 string -> Bool
forall string. Ord string => UTF8 string -> UTF8 string -> Ordering
forall string.
Ord string =>
UTF8 string -> UTF8 string -> UTF8 string
min :: UTF8 string -> UTF8 string -> UTF8 string
$cmin :: forall string.
Ord string =>
UTF8 string -> UTF8 string -> UTF8 string
max :: UTF8 string -> UTF8 string -> UTF8 string
$cmax :: forall string.
Ord string =>
UTF8 string -> UTF8 string -> UTF8 string
>= :: UTF8 string -> UTF8 string -> Bool
$c>= :: forall string. Ord string => UTF8 string -> UTF8 string -> Bool
> :: UTF8 string -> UTF8 string -> Bool
$c> :: forall string. Ord string => UTF8 string -> UTF8 string -> Bool
<= :: UTF8 string -> UTF8 string -> Bool
$c<= :: forall string. Ord string => UTF8 string -> UTF8 string -> Bool
< :: UTF8 string -> UTF8 string -> Bool
$c< :: forall string. Ord string => UTF8 string -> UTF8 string -> Bool
compare :: UTF8 string -> UTF8 string -> Ordering
$ccompare :: forall string. Ord string => UTF8 string -> UTF8 string -> Ordering
$cp1Ord :: forall string. Ord string => Eq (UTF8 string)
Ord)   -- XXX: Is this OK?

instance UTF8Bytes string index => Show (UTF8 string) where
  show :: UTF8 string -> String
show UTF8 string
x = ShowS
forall a. Show a => a -> String
show (UTF8 string -> String
forall string index.
UTF8Bytes string index =>
UTF8 string -> String
toString UTF8 string
x)

instance UTF8Bytes string index => S.IsString (UTF8 string) where
  fromString :: String -> UTF8 string
fromString = String -> UTF8 string
forall string index.
UTF8Bytes string index =>
String -> UTF8 string
fromString

fromRep :: string -> UTF8 string
fromRep :: string -> UTF8 string
fromRep = string -> UTF8 string
forall string. string -> UTF8 string
Str

toRep :: UTF8 string -> string
toRep :: UTF8 string -> string
toRep (Str string
x) = string
x

-- | Converts a Haskell string into a UTF8 encoded string.
-- Complexity: linear.
fromString :: UTF8Bytes string index => String -> UTF8 string
fromString :: String -> UTF8 string
fromString String
xs = string -> UTF8 string
forall string. string -> UTF8 string
Str (String -> string
forall b s. UTF8Bytes b s => String -> b
G.fromString String
xs)

-- | Convert a UTF8 encoded string into a Haskell string.
-- Invalid characters are replaced by 'G.replacement_char'.
-- Complexity: linear.
toString :: UTF8Bytes string index => UTF8 string -> String
toString :: UTF8 string -> String
toString (Str string
xs) = string -> String
forall b s. UTF8Bytes b s => b -> String
G.toString string
xs

-- | Checks if there are no more bytes in the underlying representation.
null :: UTF8Bytes string index => UTF8 string -> Bool
null :: UTF8 string -> Bool
null (Str string
x) = string -> Bool
forall b s. UTF8Bytes b s => b -> Bool
G.null string
x

-- | Split after a given number of characters.
-- Negative values are treated as if they are 0.
splitAt :: UTF8Bytes string index
        => index -> UTF8 string -> (UTF8 string, UTF8 string)
splitAt :: index -> UTF8 string -> (UTF8 string, UTF8 string)
splitAt index
x (Str string
bs)  = case index -> string -> (string, string)
forall b s. UTF8Bytes b s => s -> b -> (b, b)
G.splitAt index
x string
bs of
                        (string
s1,string
s2) -> (string -> UTF8 string
forall string. string -> UTF8 string
Str string
s1, string -> UTF8 string
forall string. string -> UTF8 string
Str string
s2)

-- | Split after a given number of bytes in the underlying representation.
-- See also 'splitAt'.
byteSplitAt :: UTF8Bytes string index
             => index -> UTF8 string -> (UTF8 string, UTF8 string)
byteSplitAt :: index -> UTF8 string -> (UTF8 string, UTF8 string)
byteSplitAt index
n (Str string
x) = case index -> string -> (string, string)
forall b s. UTF8Bytes b s => s -> b -> (b, b)
G.bsplit index
n string
x of
                          (string
as,string
bs) -> (string -> UTF8 string
forall string. string -> UTF8 string
Str string
as, string -> UTF8 string
forall string. string -> UTF8 string
Str string
bs)

-- | Take only the given number of bytes from the underlying representation.
-- See also 'take'.
byteTake :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
byteTake :: index -> UTF8 string -> UTF8 string
byteTake index
n (Str string
x) = string -> UTF8 string
forall string. string -> UTF8 string
Str ((string, string) -> string
forall a b. (a, b) -> a
fst (index -> string -> (string, string)
forall b s. UTF8Bytes b s => s -> b -> (b, b)
G.bsplit index
n string
x))

-- | Drop the given number of bytes from the underlying representation.
-- See also 'drop'.
byteDrop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
byteDrop :: index -> UTF8 string -> UTF8 string
byteDrop index
n (Str string
x) = string -> UTF8 string
forall string. string -> UTF8 string
Str (index -> string -> string
forall b s. UTF8Bytes b s => s -> b -> b
G.bdrop index
n string
x)


-- | @take n s@ returns the first @n@ characters of @s@.
-- If @s@ has less than @n@ characters, then we return the whole of @s@.
take :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
take :: index -> UTF8 string -> UTF8 string
take index
n (Str string
bs) = string -> UTF8 string
forall string. string -> UTF8 string
Str (index -> string -> string
forall b s. UTF8Bytes b s => s -> b -> b
G.take index
n string
bs)

-- | @drop n s@ returns the @s@ without its first @n@ characters.
-- If @s@ has less than @n@ characters, then we return an empty string.
drop :: UTF8Bytes string index => index -> UTF8 string -> UTF8 string
drop :: index -> UTF8 string -> UTF8 string
drop index
n (Str string
bs) = string -> UTF8 string
forall string. string -> UTF8 string
Str (index -> string -> string
forall b s. UTF8Bytes b s => s -> b -> b
G.drop index
n string
bs)

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as @\'\\0xFFFD\'@ to the predicate.
span :: UTF8Bytes string index
     => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
span :: (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
span Char -> Bool
p (Str string
bs) = case (Char -> Bool) -> string -> (string, string)
forall b s. UTF8Bytes b s => (Char -> Bool) -> b -> (b, b)
G.span Char -> Bool
p string
bs of
                    (string
s1,string
s2) -> (string -> UTF8 string
forall string. string -> UTF8 string
Str string
s1, string -> UTF8 string
forall string. string -> UTF8 string
Str string
s2)

-- | Split a string into two parts:  the first is the longest prefix
-- that contains only characters that do not satisfy the predicate; the second
-- part is the rest of the string.
-- Invalid characters are passed as 'G.replacement_char' to the predicate.
break :: UTF8Bytes string index
      => (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
break :: (Char -> Bool) -> UTF8 string -> (UTF8 string, UTF8 string)
break Char -> Bool
p (Str string
bs)  = case (Char -> Bool) -> string -> (string, string)
forall b s. UTF8Bytes b s => (Char -> Bool) -> b -> (b, b)
G.break Char -> Bool
p string
bs of
                      (string
s1,string
s2) -> (string -> UTF8 string
forall string. string -> UTF8 string
Str string
s1, string -> UTF8 string
forall string. string -> UTF8 string
Str string
s2)

-- | Get the first character of a byte string, if any.
-- Invalid characters are replaced by 'G.replacement_char'.
uncons :: UTF8Bytes string index
       => UTF8 string -> Maybe (Char, UTF8 string)
uncons :: UTF8 string -> Maybe (Char, UTF8 string)
uncons (Str string
x)  = do (Char
c,string
y) <- string -> Maybe (Char, string)
forall b s. UTF8Bytes b s => b -> Maybe (Char, b)
G.uncons string
x
                     (Char, UTF8 string) -> Maybe (Char, UTF8 string)
forall (m :: * -> *) a. Monad m => a -> m a
return (Char
c, string -> UTF8 string
forall string. string -> UTF8 string
Str string
y)

-- | Extract the first character for the underlying representation,
-- if one is available.  It also returns the number of bytes used
-- in the representation of the character.
-- See also 'uncons'.
decode :: UTF8Bytes string index => UTF8 string -> Maybe (Char, index)
decode :: UTF8 string -> Maybe (Char, index)
decode (Str string
x)  = string -> Maybe (Char, index)
forall b s. UTF8Bytes b s => b -> Maybe (Char, s)
G.decode string
x

-- | Traverse a bytestring (right biased).
foldr :: UTF8Bytes string index => (Char -> a -> a) -> a -> UTF8 string -> a
foldr :: (Char -> a -> a) -> a -> UTF8 string -> a
foldr Char -> a -> a
cons a
nil (Str string
cs) = (Char -> a -> a) -> a -> string -> a
forall b s a. UTF8Bytes b s => (Char -> a -> a) -> a -> b -> a
G.foldr Char -> a -> a
cons a
nil string
cs

-- | Traverse a bytestring (left biased).
-- This function is strict in the accumulator.
foldl :: UTF8Bytes string index => (a -> Char -> a) -> a -> UTF8 string -> a
foldl :: (a -> Char -> a) -> a -> UTF8 string -> a
foldl a -> Char -> a
add a
acc (Str string
cs)  = (a -> Char -> a) -> a -> string -> a
forall b s a. UTF8Bytes b s => (a -> Char -> a) -> a -> b -> a
G.foldl a -> Char -> a
add a
acc string
cs

-- | Counts the number of characters encoded in the bytestring.
-- Note that this includes replacement characters.
-- The function is linear in the number of bytes in the representation.
length :: UTF8Bytes string index => UTF8 string -> index
length :: UTF8 string -> index
length (Str string
b) = string -> index
forall b s. UTF8Bytes b s => b -> s
G.length string
b

-- | Split a string into a list of lines.
-- Lines are terminated by @\'\\n\'@ or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- See also 'lines''.
lines :: UTF8Bytes string index => UTF8 string -> [UTF8 string]
lines :: UTF8 string -> [UTF8 string]
lines (Str string
b) = (string -> UTF8 string) -> [string] -> [UTF8 string]
forall a b. (a -> b) -> [a] -> [b]
map string -> UTF8 string
forall string. string -> UTF8 string
Str (string -> [string]
forall b s. UTF8Bytes b s => b -> [b]
G.lines string
b)   -- XXX: unnecessary map

-- | Split a string into a list of lines.
-- Lines are terminated by @\'\\n\'@ or the end of the string.
-- Empty lines may not be terminated by the end of the string.
-- This function preserves the terminators.
-- See also 'lines'.
lines' :: UTF8Bytes string index => UTF8 string -> [UTF8 string]
lines' :: UTF8 string -> [UTF8 string]
lines' (Str string
x)  = (string -> UTF8 string) -> [string] -> [UTF8 string]
forall a b. (a -> b) -> [a] -> [b]
map string -> UTF8 string
forall string. string -> UTF8 string
Str (string -> [string]
forall b s. UTF8Bytes b s => b -> [b]
G.lines' string
x)  -- XXX: unnecessary map