{-# LANGUAGE TypeSynonymInstances, FlexibleInstances, Trustworthy #-} -- | -- Copyright: Julian Fleischer -- License: MIT -- -- Maintainer: Julian Fleischer -- Stability: provisional -- Portability: portable -- -- Functions for working with strings, including 'Text', 'ByteString', etc. -- -- This module aims at offering a consistent interface across all the available -- string types. It currently offers instances for the ordinary Haskell 'String' -- type, 'Text', lazy 'LazyT.Text', 'ByteString', and lazy 'LazyB.ByteString'. -- -- It especially provides functions for some types, which are otherwise not -- available nativly (such as 'breakOnSubstring' which is not available for the -- lazy 'LazyT.Text' type, is offered by 'sBreak' and 'strBreak'). module Data.Strings ( text, lazyText, bytes, lazyBytes, charToByte, byteToChar, Str (..), Strings (..) ) where import Data.Int import Data.Word import Data.String hiding (fromString) import qualified Data.String as S import qualified Data.Char as C import qualified Data.ByteString.Char8 as B8 import qualified Data.ByteString as B import Data.ByteString (ByteString) import qualified Data.ByteString.Lazy.Char8 as LazyB8 import qualified Data.ByteString.Lazy as LazyB import qualified Data.Text as T import Data.Text (Text) import qualified Data.Text.Lazy as LazyT import qualified Data.List as L import qualified Data.Text.Encoding as Enc import qualified Data.Text.Lazy.Encoding as LazyEnc -- | Returns @True@ exactly for space (0x20), and horizontal tab (0x09). isSpace x | x == ' ' = True | x == '\t' = True | otherwise = False -- | Returns @True@ exactly for space (0x20), horizontal tab (0x09), carriage return (0x0D), and line feed (0x0A). isWhiteSpace x | x == ' ' = True | x == '\t' = True | x == '\r' = True | x == '\n' = True | otherwise = False charToByte :: Char -> Word8 -- ^ Convert a 'Char' into a 'Word8'. charToByte = fromIntegral . C.ord byteToChar :: Word8 -> Char -- ^ Convert a 'Word8' into a 'Char'. byteToChar = C.chr . fromIntegral text :: String -> Text -- ^ Create a 'Text' object form an ordinary Haskell 'String'. text = S.fromString lazyText :: String -> LazyT.Text -- ^ Create a lazy 'LazyT.Text' object from an ordinary Haskell 'String'. lazyText = S.fromString bytes :: String -> ByteString -- ^ Create a 'ByteString' object form an ordinary Haskell 'String'. -- This function will encode a String using the UTF-8 character encoding. bytes = sFromUnicodeString -- | Create a lazy 'LazyB.ByteString' object from an ordinary Haskell 'String'. -- This function will encode a String using the UTF-8 character encoding. lazyBytes :: String -> LazyB.ByteString lazyBytes = sFromUnicodeString -- | The 'Str' class provides functions for working with arbitrary Strings. -- It is basically the same interface as provided by the 'Strings' class. -- However, every input string is a Haskell String here, thus easing the -- usage of different string types with native Haskell String literals. -- -- For example @strAppend "suffix"@ works with any string type for which -- an instance of @Str@ is defined. In order to maximize the ease of use -- of this library, the functions are prefixed with @str@. -- -- The complexity and efficiency of these functions depends on the underlying -- string type being used. -- -- Minimal complete definition: It suffices to provide instances for -- 'Eq' and 'Strings'. class (Eq a, Strings a) => Str a where -- | Check whether the given string is empty or not. 'null' generalised. strNull :: a -> Bool -- | 'length' generalised. strLen :: a -> Int -- | 'head' generalised. This function is undefined if 'strNull' would return @True@. strHead :: a -> Char -- | 'last' generalised. This function is undefined if 'strNull' would return @True@. strLast :: a -> Char -- | 'init' generalised. This function is undefined if 'strNull' would return @True@. strInit :: a -> a -- | 'tail' generalised. This function is undefined if 'strNull' would return @True@. strTail :: a -> a -- | 'take' generalised. strTake :: Int -> a -> a -- | 'drop' generalised. strDrop :: Int -> a -> a -- | Replace a substring with another string. strReplace :: String -- ^ Needle. -> String -- ^ Replacement. -> a -- ^ Haystack. -> a -- ^ Result: @Haystack@ with @Needle@ replaced by @Replacement@. -- | Breaks the string on the first occurence of the given substring. -- -- > strBreak "xx" "1x2xx3xx4" = ("1x2", "xx3xx4") strBreak :: String -> a -> (a, a) -- | Like 'strBreak', but the string to break on is excluded from the result. -- -- > strSplit "xx" "1x2xx3xx4" = ("1x2", "3xx4") strSplit :: String -> a -> (a, a) -- | Split a string into multiple fragments, separated by the given substring. -- -- > strSplitAll "xx" "1x2xx3xx4" = ["1x2", "3", "4"] strSplitAll :: String -> a -> [a] -- | Turn all characters in the string to upper case. strToUpper :: a -> a -- | Turn all characters in the string to lower case. strToLower :: a -> a -- | Turn the first character in the string to upper case. strCapitalize :: a -> a -- | @map@ generalised. strMap :: (Char -> Char) -> a -> a -- | @concat@ generalised. strConcat :: [a] -> a -- | Glue together multiple strings by a given Haskell 'String'. -- -- > strJoin x = concat . intersperse x strJoin :: String -> [a] -> a -- | Appends the given Haskell 'String' to the string. @++@ generalised. -- -- > strAppend " world" "hello" = "hello world" strAppend :: String -- The String to append. -> a -- The string to append to. -> a -- The concatenation of the two strings. -- | Cons generalised. strCons :: Char -> a -> a -- | Strips white space characters off both ends of the string. strTrim :: a -> a -- | Appends the given character n times to the left, such that -- the resulting string has the given length. -- -- > strPadLeft '0' 8 "4711" == "00004711" strPadLeft :: Char -> Int -> a -> a -- | Appends the given character n times to the right, such that -- the resulting string has the given length. -- -- > strPadRight '0' 8 "4711" == "47110000" strPadRight :: Char -> Int -> a -> a -- | Appends the given character n times to both sides, such that -- the resulting string has the given length. -- -- > strPadBoth '0' 8 "4711" == "00471100" strPadBoth :: Char -> Int -> a -> a -- | Reverse the string. strReverse :: a -> a -- | Check if the given Haskell String equals the string. strEq :: String -- ^ The Haskell String. -> a -- ^ The string. -> Bool -- @True@ iff @fromString haskellString == string@ -- | Check if the string starts with the given Haskell String. strStartsWith :: a -> String -> Bool -- | Check if the string ends with the given Haskell String. strEndsWith :: a -> String -> Bool -- | Create a string from a Haskell String. fromString :: String -> a -- | Create a string from a Haskell String. If the string does not support unicode, -- the Haskell String is encoded using UTF-8. fromUnicodeString :: String -> a -- | Convert the string into a Haskell String. toString :: a -> String -- | Convert the string into a list of bytes. toWord8 :: a -> [Word8] strNull = sNull strLen = sLen strHead = sHead strLast = sLast strInit = sInit strTail = sTail strTake = sTake strDrop = sDrop strReplace n r = sReplace (fromString n) (fromString r) strBreak n = sBreak (fromString n) strSplit d s = (a, b) where (a, b, _) = sSplit (fromString d) s strSplitAll n = sSplitAll (fromString n) strToLower = strMap C.toLower strToUpper = strMap C.toUpper strCapitalize str = strCons (C.toUpper (strHead str)) (strTail str) strCons = sCons strMap = sMap strConcat = sConcat strJoin d = let d' = fromUnicodeString d in strConcat . L.intersperse d' strAppend x xs = sConcat [xs, fromString x] strTrim = sTrim strPadLeft = sPadLeft strPadRight = sPadRight strPadBoth = sPadBoth strReverse = sReverse strStartsWith s pref = sStartsWith s (fromString pref) strEndsWith s suff = sEndsWith s (fromString suff) strEq s1 s2 = fromUnicodeString s1 == s2 fromString = sFromString fromUnicodeString = sFromUnicodeString toString = sToString toWord8 = sToWord8 {- instance IsString Data.ByteString.ByteString where fromString = Enc.encodeUtf8 . Data.Text.pack instance IsString Data.ByteString.Lazy.ByteString where fromString = LazyEnc.encodeUtf8 . Data.Text.Lazy.pack -} instance Str Text where instance Str LazyT.Text where instance Str ByteString where instance Str LazyB.ByteString where instance Str String where -- | The goal of this class is to offer the same interface for various -- types of strings ('ByteString', 'Text', Haskell 'String', etc.). -- If a certain type offers a native implementation for a given function, -- 'Strings' uses it. If not, a default implementation is given. -- -- All of these functions are prefixed with @s@ to avoid nameclashes -- with existing functions from the prelude. -- -- The complexity and efficiency of these functions depends on the underlying -- string type being used. class Strings a where -- | Check whether the given string is empty or not. @null@ generalised. sNull :: a -> Bool -- | The empty string. sEmpty :: a -- | 'length' generalised. sLen :: a -> Int -- | 'head' generalised. This function is undefined if 'strNull' would return @True@. sHead :: a -> Char -- | 'last' generalised. This function is undefined if 'strNull' would return @True@. sLast :: a -> Char -- | 'init' generalised. This function is undefined if 'strNull' would return @True@. sInit :: a -> a -- | 'tail' generalised. This function is undefined if 'strNull' would return @True@. sTail :: a -> a -- | 'take' generalised. sTake :: Int -> a -> a -- | 'drop' generalised. sDrop :: Int -> a -> a -- | 'takeWhile' generalised. sTakeWhile :: (Char -> Bool) -> a -> a -- | 'dropWhile' generalised. sDropWhile :: (Char -> Bool) -> a -> a -- | Replace a substring with another string. sReplace :: a -> a -> a -> a -- | Breaks the string on the first occurence of the given substring. -- -- > strBreak "xx" "1x2xx3xx4" = ("1x2", "xx3xx4") sBreak :: a -> a -> (a, a) -- | Like 'sBreak', but the string to break on is excluded from the result. -- -- > strSplit "xx" "1x2xx3xx4" = ("1x2", "3xx4") sSplit :: a -> a -> (a, a, Bool) -- | Split a string into multiple fragments, separated by the given substring. -- -- > strSplitAll "xx" "1x2xx3xx4" = ["1x2", "3", "4"] sSplitAll :: a -> a -> [a] -- | Check if the string starts with the given string. sStartsWith :: a -> a -> Bool -- | Check if the string ends with the given string. sEndsWith :: a -> a -> Bool -- | Cons sCons :: Char -> a -> a -- | Turn the first character into an upper case character. sCapitalize :: a -> a -- | Map a function over all characters. sMap :: (Char -> Char) -> a -> a -- | Concatenate all the strings in the list to a single string. sConcat :: [a] -> a -- | Strips white space characters off both ends of the string. sTrim :: a -> a -- | Appends the given character n times to the left, such that -- the resulting string has the given length. -- -- > strPadLeft '0' 8 "4711" == "00004711" sPadLeft :: Char -> Int -> a -> a -- | Appends the given character n times to the right, such that -- the resulting string has the given length. -- -- > strPadRight '0' 8 "4711" == "47110000" sPadRight :: Char -> Int -> a -> a -- | Appends the given character n times to both sides, such that -- the resulting string has the given length. -- -- > strPadBoth '0' 8 "4711" == "00471100" sPadBoth :: Char -> Int -> a -> a -- 'reverse' the string. sReverse :: a -> a -- | Create a string from a Haskell String. sFromString :: String -> a -- | Create a string from a Haskell String. If the string does not support unicode, -- the Haskell String is encoded using UTF-8. sFromUnicodeString :: String -> a -- | Convert the string into a Haskell String. sToString :: a -> String -- | Convert the string into a list of bytes. sToWord8 :: a -> [Word8] sCapitalize xs | sNull xs = xs | otherwise = sCons (C.toUpper (sHead xs)) (sTail xs) sBreak d src = search 0 src where search a b | a `seq` b `seq` False = undefined search n s | sNull s = (src, sEmpty) | sStartsWith s d = (sTake n src, s) | otherwise = search (n+1) (sTail s) sReplace n r = sConcat . replace n r where replace n r h = if sNull b then (if c then [a, r] else [a]) else a : r : replace n r b where (a, b, c) = sSplit n h sSplit d s = (a, sDrop (sLen d) b, not $ sNull b) where (a, b) = sBreak d s sSplitAll d s = if sNull b then (if c then [a, b] else [a]) else a : sSplitAll d b where (a, b, c) = sSplit d s sPadLeft c n s = let len = sLen s padLen = max 0 (n - len) padStr = sFromString $ replicate padLen c in sConcat [padStr, s] sPadRight c n s = let len = sLen s padLen = max 0 (n - len) padStr = sFromString $ replicate padLen c in sConcat [s, padStr] sPadBoth c n s = let len = sLen s padLen = (max 0 (n - len)) padLenR = padLen `quot` 2 padLenL = padLen - padLenR padStrL = sFromString $ replicate padLenL c padStrR = sFromString $ replicate padLenR c in sConcat [padStrL, s, padStrR] sFromString = sFromUnicodeString sToWord8 = sToWord8 . sToString instance Strings ByteString where sNull = B.null sEmpty = B.empty sHead = toEnum . fromIntegral . B.head sLast = toEnum . fromIntegral . B.last sInit = B.init sTail = B.tail sTake = B.take sTakeWhile f = B.takeWhile (f . chr) sDrop = B.drop sDropWhile f = B.dropWhile (f . chr) sBreak = B.breakSubstring sStartsWith = flip B.isPrefixOf sEndsWith = flip B.isSuffixOf sCons = B.cons . charToByte sMap f = B.map (charToByte . f . byteToChar) sConcat = B.concat sLen = B.length sTrim s = let (a, b) = B.span (isWhiteSpace . chr) s (c, d) = B.spanEnd (isWhiteSpace . chr) b in c sReverse = B.reverse sFromString = B8.pack sFromUnicodeString = Enc.encodeUtf8 . T.pack sToString = B8.unpack sToWord8 = B.unpack instance Strings LazyB.ByteString where sNull = LazyB.null sEmpty = LazyB.empty sHead = toEnum . fromIntegral . LazyB.head sLast = toEnum . fromIntegral . LazyB.last sInit = LazyB.init sTail = LazyB.tail sTake = LazyB.take . fromIntegral sTakeWhile f = LazyB.takeWhile (f . chr) sDrop = LazyB.drop . fromIntegral sDropWhile f = LazyB.dropWhile (f . chr) sStartsWith = flip LazyB.isPrefixOf sEndsWith = flip LazyB.isSuffixOf sCons = LazyB.cons . charToByte sMap f = LazyB.map (charToByte . f . byteToChar) sConcat = LazyB.concat sLen = fromIntegral . LazyB.length sTrim = LazyB.fromChunks . (:[]) . sTrim . B.concat . LazyB.toChunks sReverse = LazyB.reverse sFromString = LazyB8.pack sFromUnicodeString = LazyEnc.encodeUtf8 . LazyT.pack sToString = LazyB8.unpack sToWord8 = LazyB.unpack instance Strings Text where sNull = T.null sEmpty = T.empty sHead = T.head sLast = T.last sInit = T.init sTail = T.tail sTake = T.take sTakeWhile = T.takeWhile sDrop = T.drop sDropWhile = T.dropWhile sReplace = T.replace sBreak = T.breakOn sStartsWith = flip T.isPrefixOf sEndsWith = flip T.isSuffixOf sCons = T.cons sMap = T.map sConcat = T.concat sLen = T.length sTrim = T.strip sPadLeft = flip T.justifyLeft sPadRight = flip T.justifyRight sPadBoth = flip T.center sReverse = T.reverse sFromUnicodeString = S.fromString sFromString = S.fromString sToString = T.unpack sToWord8 = B.unpack . Enc.encodeUtf8 instance Strings LazyT.Text where sNull = LazyT.null sEmpty = LazyT.empty sHead = LazyT.head sLast = LazyT.last sInit = LazyT.init sTail = LazyT.tail sTake = LazyT.take . fromIntegral sTakeWhile = LazyT.takeWhile sDrop = LazyT.drop . fromIntegral sDropWhile = LazyT.dropWhile sBreak = LazyT.breakOn sStartsWith = flip LazyT.isPrefixOf sEndsWith = flip LazyT.isSuffixOf sCons = LazyT.cons sMap = LazyT.map sConcat = LazyT.concat sLen = fromIntegral . LazyT.length sTrim = LazyT.strip sPadLeft = flip (LazyT.justifyLeft . fromIntegral) sPadRight = flip (LazyT.justifyRight . fromIntegral) sPadBoth = flip (LazyT.center . fromIntegral) sReverse = LazyT.reverse sFromUnicodeString = S.fromString sFromString = S.fromString sToString = LazyT.unpack sToWord8 = LazyB.unpack . LazyEnc.encodeUtf8 instance Strings String where sNull = null sEmpty = [] sHead = head sLast = last sInit = init sTail = tail sTake = take sTakeWhile = takeWhile sDrop = drop sDropWhile = dropWhile sStartsWith = flip L.isPrefixOf sEndsWith = flip L.isSuffixOf sCapitalize (x:xs) = C.toUpper x : xs sCapitalize _ = [] sCons = (:) sMap = map sConcat = concat sLen = length sTrim = T.unpack . T.strip . T.pack sReverse = reverse sFromString = id sFromUnicodeString = id sToString = id sToWord8 = B.unpack . Enc.encodeUtf8 . T.pack -- | Utility class for working with characters. class Chars a where -- | Retrieve the value of the given character. ord :: Char -> a -- | Construct a character form the given value. chr :: a -> Char instance Chars Int where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Word8 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Word16 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Word32 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Word64 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Int8 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Int16 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Int32 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Int64 where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral instance Chars Integer where ord = fromIntegral . fromEnum chr = toEnum . fromIntegral