module Pugs.Compat.String (
split,
split_n,
breakOnGlue,
afterPrefix,
decodeUTF8,
encodeUTF8,
toQuoteMeta
) where
import Debug.Trace
import Pugs.Compat.Monads
import Data.Char
split :: (Eq a) => [a] -> [a] -> [[a]]
split [] _ = internalError "splitting by an empty list"
split sep str =
case breakOnGlue sep str of
Just (before, after) -> before : split sep after
Nothing -> [str]
split_n :: (Eq a) => [a] -> [a] -> Int -> [[a]]
split_n [] _ _ = internalError "splitting by an empty list"
split_n sep str n
| n == 1 = [str]
| otherwise =
case breakOnGlue sep str of
Just (before, after) -> before : split_n sep after (n1)
Nothing -> [str]
breakOnGlue :: (Eq a) => [a] -> [a] -> Maybe ([a], [a])
breakOnGlue _ [] = Nothing
breakOnGlue glue list@(x:xs) =
case afterPrefix glue list of
Just rest -> Just ([], rest)
Nothing -> case breakOnGlue glue xs of
Just (before, after) -> Just (x : before, after)
Nothing -> Nothing
afterPrefix :: (Eq a) => [a] -> [a] -> Maybe [a]
afterPrefix [] list = Just list
afterPrefix _ [] = Nothing
afterPrefix (p:ps) (x:xs)
| p == x = afterPrefix ps xs
| otherwise = Nothing
decodeUTF8 :: String -> String
decodeUTF8 xs = concatMap decodeUTF8' (chunkDec4096 xs)
decodeUTF8' :: String -> String
decodeUTF8' [] = []
decodeUTF8' (c:cs)
| c < '\x80'
= let rest = decodeUTF8' cs
in seq rest
(c:rest)
decodeUTF8' (c:d:cs)
| '\xC0' <= c, c <= '\xDF'
, '\x80' <= d, d <= '\xBF'
= let rest = decodeUTF8' cs
in seq rest
( toEnum ( (fromEnum c `mod` 0x20) * 0x40
+ fromEnum d `mod` 0x40
)
: rest
)
decodeUTF8' (c:d:e:cs)
| '\xE0' <= c, c <= '\xEF'
, '\x80' <= d, d <= '\xBF'
, '\x80' <= e, e <= '\xBF'
= let rest = decodeUTF8' cs
in seq rest
( toEnum ( (fromEnum c `mod` 0x10 * 0x1000)
+ (fromEnum d `mod` 0x40) * 0x40
+ fromEnum e `mod` 0x40
)
: rest
)
decodeUTF8' (c:d:e:f:cs)
| '\xF0' <= c, c <= '\xF7'
, '\x80' <= d, d <= '\xBF'
, '\x80' <= e, e <= '\xBF'
, '\x80' <= f, f <= '\xBF'
= let rest = decodeUTF8' cs
in seq rest
( toEnum ( (fromEnum c `mod` 0x10 * 0x40000)
+ (fromEnum d `mod` 0x40) * 0x1000
+ (fromEnum e `mod` 0x40) * 0x40
+ fromEnum f `mod` 0x40
)
: rest
)
decodeUTF8' (x:xs) = trace ("decodeUTF8': bad data: " ++ show x) (x:decodeUTF8' xs)
chunkDec4096 :: [Char] -> [[Char]]
chunkDec4096 xs = doChunk (splitAt 4096 xs)
where
doChunk ([], _) = []
doChunk (pre, post@(c:_))
| c < '\x80' = pre : chunkDec4096 post
| c > '\xBF' = pre : chunkDec4096 post
| otherwise = doChunk (init pre, last pre : post)
doChunk (pre, _) = [pre]
chunk :: Int -> [Char] -> [[Char]]
chunk _ [] = []
chunk size xs = case splitAt size xs of (xs', xs'') -> xs' : chunk size xs''
encodeUTF8 :: String -> String
encodeUTF8 xs = concatMap encodeUTF8' (chunk 4096 xs)
encodeUTF8' :: String -> String
encodeUTF8' [] = []
encodeUTF8' ('\0':cs)
= let rest = encodeUTF8' cs
in seq rest
('\xC0':'\x80':rest)
encodeUTF8' (c:cs)
| c < '\x80'
= let rest = encodeUTF8' cs
in seq rest
(c:rest)
| c < '\x800'
= let i = fromEnum c
rest = encodeUTF8' cs
in seq rest
( toEnum (0xC0 + i `div` 0x40)
: toEnum (0x80 + i `mod` 0x40)
: rest
)
| c < '\x10000'
= let i = fromEnum c
rest = encodeUTF8' cs
in seq rest
( toEnum (0xE0 + i `div` 0x1000)
: toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
: toEnum (0x80 + i `mod` 0x40)
: rest
)
| otherwise
= let i = fromEnum c
rest = encodeUTF8' cs
in seq rest
( toEnum (0xF0 + i `div` 0x40000)
: toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)
: toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
: toEnum (0x80 + i `mod` 0x40)
: rest
)
toQuoteMeta :: Char -> String
toQuoteMeta c =
if not (isLatin1 c)
|| isAsciiUpper c || isAsciiLower c || isDigit c || c == '_'
then [ c ]
else [ '\\', c ]