Haskell Code by HsColour

module Pugs.Compat.String (
    split,
    split_n,
    breakOnGlue,
    afterPrefix,
    decodeUTF8,
    encodeUTF8,
    toQuoteMeta
) where

import Debug.Trace
import Pugs.Compat.Monads
import Data.Char

split :: (Eq a) => [a] -> [a] -> [[a]]
split []  _   = internalError "splitting by an empty list"
split sep str =
   case breakOnGlue sep str of
     Just (before, after) -> before : split sep after
     Nothing -> [str]

split_n :: (Eq a) => [a] -> [a] -> Int -> [[a]]
split_n [] _ _ = internalError "splitting by an empty list"
split_n sep str n
   | n == 1 = [str]
   | otherwise =
   case breakOnGlue sep str of
       Just (before, after) -> before : split_n sep after (n-1)
       Nothing -> [str]

-- returns Nothing if the glue isn't there
breakOnGlue :: (Eq a) => [a] -> [a] -> Maybe ([a], [a])
breakOnGlue _    [] = Nothing
breakOnGlue glue list@(x:xs) =
   case afterPrefix glue list of
      Just rest -> Just ([], rest)
      Nothing -> case breakOnGlue glue xs of
                    Just (before, after) -> Just (x : before, after)
                    Nothing -> Nothing

afterPrefix :: (Eq a) => [a] -> [a] -> Maybe [a]
afterPrefix []     list = Just list
afterPrefix _      []   = Nothing  -- non-empty prefix of an empty list
afterPrefix (p:ps) (x:xs)
   | p == x = afterPrefix ps xs
   | otherwise = Nothing

{-# INLINE decodeUTF8 #-}
decodeUTF8 :: String -> String
decodeUTF8 xs = concatMap decodeUTF8' (chunkDec4096 xs)

{-# INLINE decodeUTF8' #-}
decodeUTF8' :: String -> String
decodeUTF8' [] = []
decodeUTF8' (c:cs)
    | c < '\x80'
    = let rest = decodeUTF8' cs
       in seq rest
          (c:rest)
decodeUTF8' (c:d:cs)
    | '\xC0' <= c, c <= '\xDF'
    , '\x80' <= d, d <= '\xBF'
    = let rest = decodeUTF8' cs
       in seq rest
          ( toEnum ( (fromEnum c `mod` 0x20) * 0x40
                   + fromEnum d `mod` 0x40
                   )
          : rest
          )
decodeUTF8' (c:d:e:cs)
    | '\xE0' <= c, c <= '\xEF'
    , '\x80' <= d, d <= '\xBF'
    , '\x80' <= e, e <= '\xBF'
    = let rest = decodeUTF8' cs
       in seq rest
          ( toEnum ( (fromEnum c `mod` 0x10 * 0x1000)
                   + (fromEnum d `mod` 0x40) * 0x40
                   + fromEnum e `mod` 0x40
                   )
          : rest
          )
decodeUTF8' (c:d:e:f:cs)
    | '\xF0' <= c, c <= '\xF7'
    , '\x80' <= d, d <= '\xBF'
    , '\x80' <= e, e <= '\xBF'
    , '\x80' <= f, f <= '\xBF'
    = let rest = decodeUTF8' cs
       in seq rest
          ( toEnum ( (fromEnum c `mod` 0x10 * 0x40000)
                   + (fromEnum d `mod` 0x40) * 0x1000
                   + (fromEnum e `mod` 0x40) * 0x40
                   + fromEnum f `mod` 0x40
                   )
          : rest
          )
decodeUTF8' (x:xs) = trace ("decodeUTF8': bad data: " ++ show x) (x:decodeUTF8' xs)

{-# INLINE chunkDec4096 #-}
chunkDec4096 :: [Char] -> [[Char]]
chunkDec4096 xs = doChunk (splitAt 4096 xs)
    where
    doChunk ([], _)  = []
    doChunk (pre, post@(c:_))
        | c < '\x80' = pre : chunkDec4096 post
        | c > '\xBF' = pre : chunkDec4096 post
        | otherwise  = doChunk (init pre, last pre : post)
    doChunk (pre, _) = [pre]

{-# INLINE chunk #-}
chunk :: Int -> [Char] -> [[Char]]
chunk _    [] = []
chunk size xs = case splitAt size xs of (xs', xs'') -> xs' : chunk size xs''

{-# INLINE encodeUTF8 #-}
encodeUTF8 :: String -> String
encodeUTF8 xs = concatMap encodeUTF8' (chunk 4096 xs)

{-# INLINE encodeUTF8' #-}
encodeUTF8' :: String -> String
encodeUTF8' [] = []
-- In the \0 case, we diverge from the Unicode standard to remove any trace
-- of embedded nulls in our bytestrings, to allow the use of Judy.StrMap
-- and to make passing CString around easier.  See Java for the same treatment:
-- http://java.sun.com/j2se/1.5.0/docs/api/java/io/DataInput.html#modified-utf-8
encodeUTF8' ('\0':cs)
    = let rest = encodeUTF8' cs
       in seq rest
          ('\xC0':'\x80':rest)
encodeUTF8' (c:cs)
    | c < '\x80'
    = let rest = encodeUTF8' cs
       in seq rest
          (c:rest)
    | c < '\x800'
    = let i     = fromEnum c
          rest  = encodeUTF8' cs
       in seq rest
          ( toEnum (0xC0 + i `div` 0x40)
          : toEnum (0x80 + i `mod` 0x40)
          : rest
          )
    | c < '\x10000'
    = let i     = fromEnum c
          rest  = encodeUTF8' cs
       in seq rest
          ( toEnum (0xE0 + i `div` 0x1000)
          : toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
          : toEnum (0x80 + i `mod` 0x40)
          : rest
          )
    | otherwise
    = let i     = fromEnum c
          rest  = encodeUTF8' cs
       in seq rest
          ( toEnum (0xF0 + i `div` 0x40000)
          : toEnum (0x80 + (i `div` 0x1000) `mod` 0x40)
          : toEnum (0x80 + (i `div` 0x40) `mod` 0x40)
          : toEnum (0x80 + i `mod` 0x40)
          : rest
          )

-- perform char quotation according to original Perl 5 quotemeta
-- have to return a string because of the quote, this requires
-- concat in quotemeta above.
toQuoteMeta :: Char -> String
toQuoteMeta c =
   if not (isLatin1 c) -- Ignore Unicode characters beyond the 256-th
      || isAsciiUpper c || isAsciiLower c || isDigit c || c == '_'
      then [ c ]
      else [ '\\', c ]