{-# LANGUAGE ForeignFunctionInterface #-} -- Copyright (C) 2009 John Millikin -- -- This program is free software: you can redistribute it and/or modify -- it under the terms of the GNU General Public License as published by -- the Free Software Foundation, either version 3 of the License, or -- any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program. If not, see . module CPython.Types.Unicode ( -- * Unicode objects Unicode , Encoding , ErrorHandling (..) , unicodeType , toUnicode , fromUnicode , length , fromEncodedObject , fromObject , encode , decode -- * Methods and slot functions , append , split , splitLines , translate , join , MatchDirection (..) , tailMatch , FindDirection (..) , find , count , replace , format , contains ) where #include import Prelude hiding (length) import Control.Exception (ErrorCall (..), throwIO) import qualified Data.Text as T #ifdef Py_UNICODE_WIDE import Data.Char (chr, ord) #else import qualified Data.Text.Foreign as TF #endif import CPython.Internal import CPython.Types.Bytes (Bytes) newtype Unicode = Unicode (ForeignPtr Unicode) instance Object Unicode where toObject (Unicode x) = SomeObject x fromForeignPtr = Unicode instance Concrete Unicode where concreteType _ = unicodeType type Encoding = T.Text data ErrorHandling = Strict | Replace | Ignore deriving (Show, Eq) withErrors :: ErrorHandling -> (CString -> IO a) -> IO a withErrors errors = withCString $ case errors of Strict -> "strict" Replace -> "replace" Ignore -> "ignore" {# fun pure unsafe hscpython_PyUnicode_Type as unicodeType {} -> `Type' peekStaticObject* #} toUnicode :: T.Text -> IO Unicode toUnicode str = withBuffer toPython >>= stealObject where toPython ptr len = let len' = fromIntegral len ptr' = castPtr ptr in {# call hscpython_PyUnicode_FromUnicode #} ptr' len' #ifdef Py_UNICODE_WIDE ords = map (fromIntegral . ord) (T.unpack str) :: [CUInt] withBuffer = withArrayLen ords . flip #else withBuffer = TF.useAsPtr str #endif fromUnicode :: Unicode -> IO T.Text fromUnicode obj = withObject obj $ \ptr -> do buffer <- {# call hscpython_PyUnicode_AsUnicode #} ptr size <- {# call hscpython_PyUnicode_GetSize #} ptr #ifdef Py_UNICODE_WIDE raw <- peekArray (fromIntegral size) buffer return . T.pack $ map (chr . fromIntegral) raw #else TF.fromPtr (castPtr buffer) (fromIntegral size) #endif {# fun hscpython_PyUnicode_GetSize as length { withObject* `Unicode' } -> `Integer' checkIntReturn* #} -- | Coerce an encoded object /obj/ to an Unicode object. -- -- 'Bytes' and other char buffer compatible objects are decoded according to -- the given encoding and error handling mode. -- -- All other objects, including 'Unicode' objects, cause a @TypeError@ to be -- thrown. {# fun hscpython_PyUnicode_FromEncodedObject as fromEncodedObject `Object obj' => { withObject* `obj' , withText* `Encoding' , withErrors* `ErrorHandling' } -> `Unicode' stealObject* #} -- | Shortcut for @'fromEncodedObject' \"utf-8\" 'Strict'@ fromObject :: Object obj => obj -> IO Unicode fromObject obj = fromEncodedObject obj (T.pack "utf-8") Strict -- | Encode a 'Unicode' object and return the result as 'Bytes' object. -- The encoding and error mode have the same meaning as the parameters of -- the the @str.encode()@ method. The codec to be used is looked up using -- the Python codec registry. {# fun hscpython_PyUnicode_AsEncodedString as encode { withObject* `Unicode' , withText* `Encoding' , withErrors* `ErrorHandling' } -> `Bytes' stealObject* #} -- | Create a 'Unicode' object by decoding a 'Bytes' object. The encoding and -- error mode have the same meaning as the parameters of the the -- @str.encode()@ method. The codec to be used is looked up using the Python -- codec registry. decode :: Bytes -> Encoding -> ErrorHandling -> IO Unicode decode bytes enc errors = withObject bytes $ \bytesPtr -> withText enc $ \encPtr -> withErrors errors $ \errorsPtr -> alloca $ \bufferPtr -> alloca $ \lenPtr -> do {# call PyBytes_AsStringAndSize as ^ #} bytesPtr bufferPtr lenPtr >>= checkStatusCode buffer <- peek bufferPtr len <- peek lenPtr {# call hscpython_PyUnicode_Decode #} buffer len encPtr errorsPtr >>= stealObject {# fun hscpython_PyUnicode_Concat as append { withObject* `Unicode' , withObject* `Unicode' } -> `Unicode' stealObject* #} -- | Split a string giving a 'List' of 'Unicode' objects. If the separator is -- 'Nothing', splitting will be done at all whitespace substrings. Otherwise, -- splits occur at the given separator. Separators are not included in the -- resulting list. split :: Unicode -> Maybe Unicode -- ^ Separator -> Maybe Integer -- ^ Maximum splits -> IO List split s sep maxsplit = withObject s $ \sPtr -> maybeWith withObject sep $ \sepPtr -> let max' = maybe (- 1) fromInteger maxsplit in {# call hscpython_PyUnicode_Split #} sPtr sepPtr max' >>= stealObject -- | Split a 'Unicode' string at line breaks, returning a list of 'Unicode' -- strings. CRLF is considered to be one line break. If the second parameter -- is 'False', the line break characters are not included in the resulting -- strings. {# fun hscpython_PyUnicode_Splitlines as splitLines { withObject* `Unicode' , `Bool' } -> `List' stealObject* #} -- | Translate a string by applying a character mapping table to it. -- -- The mapping table must map Unicode ordinal integers to Unicode ordinal -- integers or @None@ (causing deletion of the character). -- -- Mapping tables need only provide the @__getitem__()@ interface; -- dictionaries and sequences work well. Unmapped character ordinals (ones -- which cause a @LookupError@) are left untouched and are copied as-is. -- -- The error mode has the usual meaning for codecs. {# fun hscpython_PyUnicode_Translate as translate `Object table' => { withObject* `Unicode' , withObject* `table' , withErrors* `ErrorHandling' } -> `Unicode' stealObject* #} -- | Join a sequence of strings using the given separator. {# fun hscpython_PyUnicode_Join as join `Sequence seq' => { withObject* `Unicode' , withObject* `seq' } -> `Unicode' stealObject* #} data MatchDirection = Prefix | Suffix deriving (Show, Eq) -- | Return 'True' if the substring matches @string*[*start:end]@ at the -- given tail end (either a 'Prefix' or 'Suffix' match), 'False' otherwise. tailMatch :: Unicode -- ^ String -> Unicode -- ^ Substring -> Integer -- ^ Start -> Integer -- ^ End -> MatchDirection -> IO Bool tailMatch str substr start end dir = withObject str $ \strPtr -> withObject substr $ \substrPtr -> let start' = fromInteger start in let end' = fromInteger end in let dir' = case dir of Prefix -> -1 Suffix -> 1 in {# call hscpython_PyUnicode_Tailmatch #} strPtr substrPtr start' end' dir' >>= checkBoolReturn data FindDirection = Forwards | Backwards deriving (Show, Eq) -- | Return the first position of the substring in @string*[*start:end]@ -- using the given direction. The return value is the index of the first -- match; a value of 'Nothing' indicates that no match was found. find :: Unicode -- ^ String -> Unicode -- ^ Substring -> Integer -- ^ Start -> Integer -- ^ End -> FindDirection -> IO (Maybe Integer) find str substr start end dir = withObject str $ \strPtr -> withObject substr $ \substrPtr -> do let start' = fromInteger start let end' = fromInteger end let dir' = case dir of Forwards -> 1 Backwards -> -1 cRes <- {# call hscpython_PyUnicode_Find #} strPtr substrPtr start' end' dir' exceptionIf $ cRes == -2 case cRes of -1 -> return Nothing x | x >= 0 -> return . Just . toInteger $ x x -> throwIO . ErrorCall $ "Invalid return code: " ++ show x -- | Return the number of non-overlapping occurrences of the substring in -- @string[start:end]@. count :: Unicode -- ^ String -> Unicode -- ^ Substring -> Integer -- ^ Start -> Integer -- ^ End -> IO Integer count str substr start end = withObject str $ \str' -> withObject substr $ \substr' -> let start' = fromInteger start in let end' = fromInteger end in {# call hscpython_PyUnicode_Count #} str' substr' start' end' >>= checkIntReturn -- | Replace occurrences of the substring with a given replacement. If the -- maximum count is 'Nothing', replace all occurences. replace :: Unicode -- ^ String -> Unicode -- ^ Substring -> Unicode -- ^ Replacement -> Maybe Integer -- ^ Maximum count -> IO Unicode replace str substr replstr maxcount = withObject str $ \strPtr -> withObject substr $ \substrPtr -> withObject replstr $ \replstrPtr -> let maxcount' = case maxcount of Nothing -> -1 Just x -> fromInteger x in {# call hscpython_PyUnicode_Replace #} strPtr substrPtr replstrPtr maxcount' >>= stealObject -- | Return a new 'Unicode' object from the given format and args; this is -- analogous to @format % args@. {# fun hscpython_PyUnicode_Format as format { withObject* `Unicode' , withObject* `Tuple' } -> `Unicode' stealObject* #} -- | Check whether /element/ is contained in a string. -- -- /element/ has to coerce to a one element string. {# fun hscpython_PyUnicode_Contains as contains `Object element' => { withObject* `Unicode' , withObject* `element' } -> `Bool' checkBoolReturn* #}