{-# OPTIONS_GHC -cpp -fno-warn-orphans #-}
-- |
-- Module      : Data.CompactString
-- License     : BSD-style
-- Maintainer  : twanvl@gmail.com
-- Stability   : experimental
-- Portability : untested
-- 
-- A time and space-efficient implementation of strings using
-- packed Word8 arrays, suitable for high performance use, both in terms
-- of large data quantities, or high speed requirements.
--
-- This module is intended to be imported @qualified@, to avoid name
-- clashes with "Prelude" functions.  eg.
--
-- > import qualified Data.CompactString as C
--
-- Internally, CompactStrings are encoded 'ByteString's.
--
module Data.CompactString (
        
        -- * The @CompactString@ type
        Encoding,
        CompactString,          -- abstract, instances: Eq, Ord, Show, Monoid
        
        -- * Introducing and eliminating 'CompactString's
        empty,                  -- :: CompactString
        singleton,              -- :: Char   -> CompactString
        pack,                   -- :: String -> CompactString
        unpack,                 -- :: CompactString -> String
        
        -- * Basic interface
        cons,                   -- :: Char -> CompactString -> CompactString
        snoc,                   -- :: CompactString -> Char -> CompactString
        append,                 -- :: CompactString -> CompactString -> CompactString
        head,                   -- :: CompactString -> Char
        last,                   -- :: CompactString -> Char
        tail,                   -- :: CompactString -> CompactString
        init,                   -- :: CompactString -> CompactString
        headView,               -- :: CompactString -> Maybe (Char, CompactString)
        lastView,               -- :: CompactString -> Maybe (CompactString, Char)
        null,                   -- :: CompactString -> Bool
        length,                 -- :: CompactString -> Int
        
        -- * Transforming 'CompactString's
        map,                    -- :: (Char -> Char) -> CompactString -> CompactString
        reverse,                -- :: CompactString -> CompactString
        intersperse,            -- :: Char -> CompactString -> CompactString
        intercalate,            -- :: CompactString -> [CompactString] -> CompactString
        transpose,              -- :: [CompactString] -> [CompactString]
        
        -- * Reducing 'CompactString's (folds)
        foldl,                  -- :: (a -> Char -> a) -> a -> CompactString -> a
        foldl',                 -- :: (a -> Char -> a) -> a -> CompactString -> a
        foldl1,                 -- :: (Char -> Char -> Char) -> CompactString -> Char
        foldl1',                -- :: (Char -> Char -> Char) -> CompactString -> Char
        
        foldr,                  -- :: (Char -> a -> a) -> a -> CompactString -> a
        foldr',                 -- :: (Char -> a -> a) -> a -> CompactString -> a
        foldr1,                 -- :: (Char -> Char -> Char) -> CompactString -> Char
        foldr1',                -- :: (Char -> Char -> Char) -> CompactString -> Char
        
        -- ** Special folds
        concat,                 -- :: [CompactString] -> CompactString
        concatMap,              -- :: (Char -> CompactString) -> CompactString -> CompactString
        any,                    -- :: (Char -> Bool) -> CompactString -> Bool
        all,                    -- :: (Char -> Bool) -> CompactString -> Bool
        maximum,                -- :: CompactString -> Char
        minimum,                -- :: CompactString -> Char
        
        -- * Building CompactStrings
        -- ** Scans
        scanl,                  -- :: (Char -> Char -> Char) -> Char -> CompactString -> CompactString
        scanl1,                 -- :: (Char -> Char -> Char) ->         CompactString -> CompactString
        scanr,                  -- :: (Char -> Char -> Char) -> Char -> CompactString -> CompactString
        scanr1,                 -- :: (Char -> Char -> Char) ->         CompactString -> CompactString
        
        -- ** Accumulating maps
        mapAccumL,              -- :: (acc -> Char -> (acc, Char)) -> acc -> CompactString -> (acc, CompactString)
        mapAccumR,              -- :: (acc -> Char -> (acc, Char)) -> acc -> CompactString -> (acc, CompactString)
        mapIndexed,             -- :: (Int -> Char -> Char) -> CompactString -> CompactString
        
        -- ** Unfolding CompactStrings
        replicate,              -- :: Int -> Char -> CompactString
        unfoldr,                -- :: (a -> Maybe (Char, a)) -> a -> CompactString
        unfoldrN,               -- :: Int -> (a -> Maybe (Char, a)) -> a -> (CompactString, Maybe a)
        
        -- * Substrings
        
        -- ** Breaking strings
        take,                   -- :: Int -> CompactString -> CompactString
        drop,                   -- :: Int -> CompactString -> CompactString
        splitAt,                -- :: Int -> CompactString -> (CompactString, CompactString)
        takeWhile,              -- :: (Char -> Bool) -> CompactString -> CompactString
        dropWhile,              -- :: (Char -> Bool) -> CompactString -> CompactString
        span,                   -- :: (Char -> Bool) -> CompactString -> (CompactString, CompactString)
        spanEnd,                -- :: (Char -> Bool) -> CompactString -> (CompactString, CompactString)
        break,                  -- :: (Char -> Bool) -> CompactString -> (CompactString, CompactString)
        breakEnd,               -- :: (Char -> Bool) -> CompactString -> (CompactString, CompactString)
        group,                  -- :: CompactString -> [CompactString]
        groupBy,                -- :: (Char -> Char -> Bool) -> CompactString -> [CompactString]
        inits,                  -- :: CompactString -> [CompactString]
        tails,                  -- :: CompactString -> [CompactString]
        
        -- ** Breaking into many substrings
        split,                  -- :: Char -> CompactString -> [CompactString]
        splitWith,              -- :: (Char -> Bool) -> CompactString -> [CompactString]
        
        -- ** Breaking into lines and words
        lines,                  -- :: CompactString -> [CompactString]
        words,                  -- :: CompactString -> [CompactString]
        unlines,                -- :: [CompactString] -> CompactString
        unwords,                -- :: CompactString -> [CompactString]
        
        -- * Predicates
        isPrefixOf,             -- :: CompactString -> CompactString -> Bool
        isSuffixOf,             -- :: CompactString -> CompactString -> Bool
        isInfixOf,              -- :: CompactString -> CompactString -> Bool
        
        -- ** Search for arbitrary substrings
        findSubstring,          -- :: CompactString -> CompactString -> Maybe Int
        findSubstrings,         -- :: CompactString -> CompactString -> [Int]
        
        -- * Searching CompactStrings
        
        -- ** Searching by equality
        elem,                   -- :: Char -> CompactString -> Bool
        notElem,                -- :: Char -> CompactString -> Bool
        
        -- ** Searching with a predicate
        find,                   -- :: (Char -> Bool) -> CompactString -> Maybe Char
        filter,                 -- :: (Char -> Bool) -> CompactString -> CompactString
        partition,              -- :: (Char -> Bool) -> CompactString -> (CompactString, CompactString)
        
        -- * Indexing CompactStrings
        index,                  -- :: CompactString -> Int -> Char
        elemIndex,              -- :: Char -> CompactString -> Maybe Int
        elemIndices,            -- :: Char -> CompactString -> [Int]
        elemIndexEnd,           -- :: Char -> CompactString -> Maybe Int
        findIndex,              -- :: (Char -> Bool) -> CompactString -> Maybe Int
        findIndexEnd,           -- :: (Char -> Bool) -> CompactString -> Maybe Int
        findIndices,            -- :: (Char -> Bool) -> CompactString -> [Int]
        count,                  -- :: Char -> CompactString -> Int
        
        -- * Zipping and unzipping CompactStrings
        zip,                    -- :: CompactString -> CompactString -> [(Char,Char)]
        zipWith,                -- :: (Char -> Char -> c) -> CompactString -> CompactString -> [c]
        zipWith',               -- :: (Char -> Char -> Char) -> CompactString -> CompactString -> CompactString
        unzip,                  -- :: [(Char,Char)] -> (CompactString,CompactString)
        
        -- * Ordered CompactStrings
        sort,                   -- :: CompactString -> CompactString
        compare',               -- :: CompactString a -> CompactString b -> Ordering
        
        -- * Encoding
        toByteString,           -- :: Encoding a => CompactString a -> ByteString
        fromByteString,         -- :: Encoding a => ByteString -> m (CompactString a)
        fromByteString_,        -- :: Encoding a => ByteString -> CompactString a
        validate,               -- :: Encoding a => CompactString a -> m (CompactString a)
        validate_,              -- :: Encoding a => CompactString a -> CompactString a
        -- ** Encoding conversion
        module Data.CompactString.Encodings,
        recode,                 -- :: (Encoding a, Encoding b) => CompactString a -> m (CompactString b)
        recode_,                -- :: (Encoding a, Encoding b) => CompactString a -> CompactString b
        encode,                 -- :: (Encoding a, Encoding b) => a -> CompactString b -> m (ByteString)
        encode_,                -- :: (Encoding a, Encoding b) => a -> CompactString b -> ByteString
        decode,                 -- :: (Encoding a, Encoding b) => a -> ByteString -> m (CompactString b)
        decode_,                -- :: (Encoding a, Encoding b) => a -> ByteString -> CompactString b
        encodeBOM,              -- :: Encoding a => CompactString a -> m (ByteString)
        encodeBOM_,             -- :: Encoding a => CompactString a -> ByteString
        decodeBOM,              -- :: Encoding a => ByteString -> m (CompactString a)
        decodeBOM_,             -- :: Encoding a => ByteString -> CompactString a
        
        -- * I\/O with 'CompactString's
        
        -- ** Standard input and output
        getLine,                -- :: IO CompactString
        getContents,            -- :: IO CompactString
        putStr,                 -- :: CompactString -> IO ()
        putStrLn,               -- :: CompactString -> IO ()
        interact,               -- :: (CompactString -> CompactString) -> IO ()
        
        -- ** Files
        readFile,               -- :: FilePath -> IO CompactString
        readFile',              -- :: FilePath -> IO CompactString
        writeFile,              -- :: FilePath -> CompactString -> IO ()
        writeFile',             -- :: FilePath -> CompactString -> IO ()
        appendFile,             -- :: FilePath -> CompactString -> IO ()
        appendFile',            -- :: FilePath -> CompactString -> IO ()
        
        -- ** I\/O with Handles
        hGetLine,               -- :: Handle -> IO CompactString
        hGetContents,           -- :: Handle -> IO CompactString
        hGetContents',          -- :: Handle -> IO CompactString
        hGet,                   -- :: Handle -> Int -> IO CompactString
        hGetNonBlocking,        -- :: Handle -> Int -> IO CompactString
        hPut,                   -- :: Handle -> CompactString -> IO ()
        hPutStr,                -- :: Handle -> CompactString -> IO ()
        hPutStrLn,              -- :: Handle -> CompactString -> IO ()
        
        ) where

import Prelude hiding
        (length, head, tail, last, init, null, 
         map, reverse, foldl, foldr, foldl1, foldr1, concat, concatMap,
         scanl, scanl1, scanr, scanr1, replicate,
         take, drop, splitAt, takeWhile, dropWhile,
         span, break, any, all, elem, notElem,
         maximum, minimum, filter, zip, zipWith, unzip,
         lines, unlines, words, unwords,
         putStr, putStrLn, getContents, getLine, interact,
         readFile, writeFile, appendFile)

import Data.Monoid
import qualified Data.List as L
import Data.Maybe               (isJust, isNothing, listToMaybe)
import Data.String              (IsString(..))

import Foreign.Ptr              (minusPtr)
import Foreign.ForeignPtr       (withForeignPtr)

import System.IO                (Handle,openFile,hClose,IOMode(..),
                                 hSeek,hTell,SeekMode(..),stdin,stdout)
import qualified System.IO      (hGetLine)
import System.IO.Unsafe         (unsafePerformIO)

import Control.Monad            (liftM,MonadPlus)
import Control.Exception        (bracket)

import Data.Char                (isSpace)

import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Unsafe   as B
import qualified Data.ByteString          as B
import qualified Data.ByteString.Char8    as C8

import Data.CompactString.Internal
import Data.CompactString.Fusion
import Data.CompactString.Unsafe
import Data.CompactString.Encodings

-- -----------------------------------------------------------------------------
--
-- Type signatures & documentation
--

#define COMPACTSTRING CompactString a
#define DESCRIPTION the encoding @a@
#define CONTEXT Encoding a =>
#define CONTEXT_ Encoding a, 
#define IF_FIXED(a,b) b
#define IF_NOT_REPRESENT(a) a
#include "signatures.include"

-- -----------------------------------------------------------------------------
--
-- Useful macros, until we have bang patterns
--

#define STRICT1(f) f _a             | _a                                     `seq` False = undefined
#define STRICT2(f) f _a _b          | _a `seq` _b                            `seq` False = undefined
#define STRICT3(f) f _a _b _c       | _a `seq` _b `seq` _c                   `seq` False = undefined
#define STRICT4(f) f _a _b _c _d    | _a `seq` _b `seq` _c `seq` _d          `seq` False = undefined
#define STRICT5(f) f _a _b _c _d _e | _a `seq` _b `seq` _c `seq` _d `seq` _e `seq` False = undefined

-- -----------------------------------------------------------------------------
--
-- Comparison
--

instance Encoding a => Eq (CompactString a) where
        (CS a) == (CS b) = a == b

instance Encoding a => Monoid (CompactString a) where
        mempty  = empty
        mappend = append
        mconcat = concat

instance Encoding a => Ord (CompactString a) where
        compare = doCompare

doCompare :: Encoding a => CompactString a -> CompactString a -> Ordering
doCompare a b
    | validOrdering (encoding a) = compare (unCS a) (unCS b)
     -- We can't compare the ByteStrings, because the encoding results in a different ordering
    | otherwise                  = compare' a b

-- -----------------------------------------------------------------------------
--
-- Construction/destruction
--

empty = CS $ B.empty
{-# INLINE empty #-}

singleton c = cs
    where
        cs = CS $ B.unsafeCreate l p
        (l,p) = pokeCharFun (encoding cs) c
{-# SPECIALIZE singleton :: Char -> CompactString UTF8 #-}

pack str = cs
    where
        cs = CS $ B.unsafeCreate (L.sum . L.map (pokeCharLen (encoding cs)) $ str) $ \p -> go p str
        go _ []     = return ()
        go p (x:xs) = do  l <- pokeChar (encoding cs) p x
                          go (p `plusPtr` l) xs
{-# SPECIALIZE pack :: String -> CompactString UTF8 #-}

unpack cs@(CS (PS ps ss ls)) = inlinePerformIO $ withForeignPtr ps $ \p -> return (loop p ss ls)
    where
        STRICT3(loop)
        loop _ _ 0 = []
        loop _ _ l | l < 0 = error $ "string length incorrect in " ++ show (PS ps ss ls) -- This really shouldn't happen!
        loop p s l = case inlinePerformIO $ peekChar (encoding cs) (p `plusPtr` s) of
                        (l',c) -> c : loop p (s + l') (l - l')
{-# SPECIALIZE unpack :: CompactString UTF8 -> String #-}

{-# RULES

"CompactString pack/unpack" forall cs.
    pack (unpack cs) = cs
"CompactString unpack/pack" forall ls.
    unpack (pack ls) = ls

  #-}

instance Encoding a => Show (CompactString a) where
        showsPrec p = showsPrec p . unpack

instance Encoding a => IsString (CompactString a) where
        fromString = pack

-- -----------------------------------------------------------------------------
--
-- Basic interface
--

null  = B.null . unCS
{-# INLINE null #-}

length cs@(CS (PS _ _ l)) = unsafeWithBuffer cs (length_ 0 l)
    where
        STRICT3(length_)
        length_ acc 0 _ = return acc
        length_ acc n p = do  l' <- peekCharLen (encoding cs) p
                              length_ (acc + 1) (n - l') (p `plusPtr` l')
{-# INLINE length #-}

cons c cs@(CS (PS _ _ l)) = unsafeWithBuffer cs $ \src -> create (l+l') $ \dst -> do
        pokeC dst
        memcpy (dst `plusPtr` l') src (fromIntegral l)
 where (l',pokeC) = pokeCharFun (encoding cs) c
{-# INLINE cons #-}

snoc cs@(CS (PS _ _ l)) c = unsafeWithBuffer cs $ \src -> create (l+l') $ \dst -> do
        memcpy dst src (fromIntegral l)
        pokeC (dst `plusPtr` l)
 where (l',pokeC) = pokeCharFun (encoding cs) c
{-# INLINE snoc #-}

append (CS xs) (CS ys) = CS (B.append xs ys)
{-# INLINE append #-}

-- -----------------------------------------------------------------------------
--
-- Head & tail and friends
--

head cs
    | null cs   = errorEmptyList "head"
    | otherwise = unsafeHead cs
{-# INLINE head #-}

tail cs
    | null cs   = errorEmptyList "tail"
    | otherwise = unsafeTail cs
{-# INLINE tail #-}

last cs
    | null cs   = errorEmptyList "last"
    | otherwise = unsafeLast cs
{-# INLINE last #-}

init cs
    | null cs   = errorEmptyList "init"
    | otherwise = unsafeInit cs
{-# INLINE init #-}

headView cs@(CS (PS x s l))
    | null cs   = Nothing
    | otherwise = let (headlen,c) = unsafeWithBuffer cs $ peekChar (encoding cs)
                  in  Just (c, CS (PS x (s+headlen) (l-headlen)))
{-# INLINE headView #-}

lastView cs@(CS (PS x s l))
    | null cs   = Nothing
    | otherwise = let (lastlen,c) = unsafeWithBuffer cs $ peekCharRev (encoding cs)
                  in  Just (CS (PS x s (l-lastlen)), c)
{-# INLINE lastView #-}


-- -----------------------------------------------------------------------------
--
-- List functions
--

reverse cs@(CS (PS _ _ l)) = unsafeWithBufferEnd cs $ create l . reverse_ l
    where
        STRICT3(reverse_)
        reverse_ 0 _   _   = return ()
        reverse_ n src dst = do
            i <- copyCharRev (encoding cs) src dst
            reverse_ (n-i) (src `plusPtr` negate i) (dst `plusPtr` i)
{-# SPECIALIZE reverse :: CompactString UTF8 -> CompactString UTF8 #-}

intersperse c cs@(CS (PS _ _ l))
 | l == 0    = cs
 | otherwise = unsafeWithBuffer cs $ create (l+(len-1)*lc) . intersperse_copy l
   where
       (lc,pokeC) = pokeCharFun (encoding cs) c
       len        = length cs
       STRICT3(intersperse_copy)
       intersperse_copy  0 _   _   = return ()
       intersperse_copy  n src dst = do  l' <- copyChar (encoding cs) src dst
                                         intersperse_inter (n-l') (src `plusPtr` l') (dst `plusPtr` l')
       STRICT3(intersperse_inter)
       intersperse_inter 0 _   _   = return ()
       intersperse_inter n src dst = do  pokeC dst
                                         intersperse_copy  n       src               (dst `plusPtr` lc)
{-# SPECIALIZE intersperse :: Char -> CompactString UTF8 -> CompactString UTF8 #-}

transpose ps = L.map pack (L.transpose (L.map unpack ps))

-- -----------------------------------------------------------------------------
--
-- Simple loops
--

--map f = loopArr . loopUp (mapEFL f) NoAcc
map f cs@(CS (PS _ _ l)) = result
  where
    result = CS $ inlinePerformIO $ withBuffer cs $ \p1 -> B.createAndTrim (newSize (encoding cs) l) $
             map_ p1
    map_ p1 p2 = go 0 0
      where
        STRICT2(go)
        go i j
           | i >= l     = return j
           | otherwise  = do
                (l1,c1) <- peekChar (encoding cs) (p1 `plusPtr` i)
                l2      <- pokeChar (encoding cs) (p2 `plusPtr` j) (f c1)
                go (i+l1) (j+l2)
{-# INLINE map #-}

--filter p = loopArr . loopUpC (filterEFL p) NoAcc
filter predicate cs@(CS (PS _ _ l))
    | null cs   = cs
    | otherwise = result
  where
    result = CS $ unsafeWithBuffer cs $ \p1 -> B.createAndTrim l $ filter_ p1
    filter_ p1 p2 = go 0 0
      where
        STRICT2(go)
        go i j | i >= l  = return j
               | otherwise = do
                    (l1,c1) <- peekChar (encoding cs) (p1 `plusPtr` i)
                    if predicate c1
                        then do l2 <- pokeChar (encoding cs) (p2 `plusPtr` j) c1
                                go (i+l1) (j+l2)
                        else    go (i+l1) j
{-# INLINE filter #-}

partition p cs = (filter p cs, filter (not . p) cs)

-- -----------------------------------------------------------------------------
--
-- Folds
--

foldl f z = loopUpFold f z
foldl' = foldl
foldr  f z = loopDownFold (flip f) z -- TODO : Is this too strict?
foldr' f z = loopDownFold (flip f) z


foldl1 f cs
    | null cs   = errorEmptyList "foldl1"
    | otherwise = foldl f (unsafeHead cs) (unsafeTail cs)
{-# INLINE foldl1 #-}

foldl1' f cs
    | null cs   = errorEmptyList "foldl1'"
    | otherwise = foldl' f (unsafeHead cs) (unsafeTail cs)
{-# INLINE foldl1' #-}

foldr1 f cs
    | null cs        = errorEmptyList "foldr1"
    | otherwise      = foldr f (unsafeLast cs) (unsafeInit cs)
{-# INLINE foldr1 #-}

foldr1' f cs
    | null cs        = errorEmptyList "foldr1'"
    | otherwise      = foldr' f (unsafeLast cs) (unsafeInit cs)
{-# INLINE [1] foldr1' #-}

-- ---------------------------------------------------------------------
-- Special folds

concat = CS . B.concat . L.map unCS

concatMap f = concat . foldr ((:) . f) []

intercalate s = concat . (L.intersperse s)
{-# INLINE [1] intercalate #-}

any p = isJust . find p
all p = isNothing . find (not . p)

maximum cs
    | null cs   = errorEmptyList "maximum"
    | otherwise = foldl1' max cs
minimum cs
    | null cs   = errorEmptyList "minimum"
    | otherwise = foldl1' min cs

-- ---------------------------------------------------------------------
-- Building CompactString : scans

scanl f z ps = loopArr . loopUp (scanEFL f) z $ (ps `snoc` '\0')
{-# INLINE scanl #-}

scanl1 f ps
    | null ps   = empty
    | otherwise = scanl f (unsafeHead ps) (unsafeTail ps)
{-# INLINE scanl1 #-}

scanr f z ps = loopArr . loopDown (scanEFL (flip f)) z $ ('\0' `cons` ps) -- extra space
{-# INLINE scanr #-}

scanr1 f cs
    | null cs   = empty
    | otherwise = scanr f (unsafeLast cs) (unsafeInit cs)
{-# INLINE scanr1 #-}

-- ---------------------------------------------------------------------
-- Building CompactString : Accumulating maps

mapAccumL f z = unSP . loopUp (mapAccumEFL f) z
{-# INLINE mapAccumL #-}
mapAccumR f z = unSP . loopDown (mapAccumEFL f) z
{-# INLINE mapAccumR #-}
mapIndexed f = loopArr . loopUp (mapIndexEFL f) 0
{-# INLINE mapIndexed #-}

-- ---------------------------------------------------------------------
-- Building CompactString : unfolding

replicate w c
    | w <= 0    = empty
    | otherwise = cs
 where
    cs = CS $ B.unsafeCreate (l*w) (go w)
    (l,pokeC) = pokeCharFun (encoding cs) c
    STRICT2(go)
    go 0 _   = return ()
    go n ptr = pokeC ptr >> go (n-1) (ptr `plusPtr` l)
{-# SPECIALIZE replicate :: Int -> Char -> CompactString UTF8 #-}

unfoldr f = concat . unfoldChunk 32 64
  where unfoldChunk n n' x =
          case unfoldrN n f x of
            (s, Nothing) -> s : []
            (s, Just x') -> s : unfoldChunk n' (n+n') x'


unfoldrN i f x0
    | i <= 0    = (empty, Just x0)
    | otherwise = result
  where
   result = (\(b,s) -> (CS b,s)) $ unsafePerformIO $ B.createAndTrim' (byteCount (encoding (fst result)) i) go_
   go_ start_p = go start_p x0 0
    where
        STRICT3(go)
        go p x n
         | n == i    = return (0, p `minusPtr` start_p, Just x)
         | otherwise = case f x of
                          Nothing     -> return (0, p `minusPtr` start_p, Nothing)
                          Just (w,x') -> do l <- pokeChar (encoding (fst result)) p w
                                            go (p `plusPtr` l) x' (n+1)


-- ---------------------------------------------------------------------
-- Substrings

take n cs@(CS (PS x s l))
    | n <= 0    = empty
    | idx == l  = cs
    | otherwise = CS $ PS x s idx
    where idx = charIndex cs n
{-# INLINE take #-}

drop n cs@(CS (PS x s l))
    | n <= 0    = cs
    | idx == l  = empty
    | otherwise = CS $ PS x (s+idx) (l-idx)
    where idx = charIndex cs n
{-# INLINE drop #-}

splitAt n cs@(CS (PS x s l))
    | n <= 0    = (empty, cs)
    | idx == l  = (cs, empty)
    | otherwise = (CS $ PS x s idx, CS $ PS x (s+idx) (l-idx))
    where idx = charIndex cs n
{-# INLINE splitAt #-}

takeWhile f cs@(CS bs) = CS $ B.unsafeTake (findIndexOrEnd (not . f) cs) bs
dropWhile f cs@(CS bs) = CS $ B.unsafeDrop (findIndexOrEnd (not . f) cs) bs
{-# INLINE takeWhile #-}
{-# INLINE dropWhile #-}

break p cs@(CS bs) = (CS $ B.unsafeTake n bs, CS $ B.unsafeDrop n bs)
    where n = findIndexOrEnd p cs
span p ps = break (not . p) ps
{-# INLINE [1] break #-}
{-# INLINE [1] span #-}

breakEnd p cs@(CS bs) = (CS $ B.unsafeTake n bs, CS $ B.unsafeDrop n bs)
    where n = findIndexOrBeginRev p cs
spanEnd p cs = breakEnd (not . p) cs
{-# INLINE [1] breakEnd #-}
{-# INLINE [1] spanEnd #-}

-- ---------------------------------------------------------------------
-- To list

group = groupBy (==)

groupBy k cs@(CS bs) = case headView cs of
     Nothing     -> []
     Just (x,xs) -> let n = pokeCharLen (encoding cs) x  +  findIndexOrEnd (not . k x) xs
                    in  CS (B.unsafeTake n bs) : groupBy k (CS (B.unsafeDrop n bs))


inits cs@(CS (PS x s l)) = inits_ 0
    where
        STRICT1(inits_)
        inits_ n
          | n >= l    = [cs]
          | otherwise = unsafeWithBuffer cs $ \ptr -> do
                          len <- peekCharLen (encoding cs) (ptr `plusPtr` n)
                          return (CS (PS x s n) : inits_ (n + len))

tails p | null p    = [empty]
        | otherwise = p : tails (unsafeTail p)


split x = splitWith (x==)

splitWith _ (CS (PS _ _ 0)) = []
splitWith p cs = loop p cs
    where
        STRICT2(loop)
        loop q qs = if null rest then [chunk]
                                 else chunk : loop q (unsafeTail rest)
            where (chunk,rest) = break q qs


lines = dropEmptyLast . split '\n'
    where  dropEmptyLast []           = []
           dropEmptyLast [x] | null x = []
           dropEmptyLast (x:xs)       = x : dropEmptyLast xs
{-# INLINE lines #-}

unlines [] = empty
unlines ss = (concat $ L.intersperse nl ss) `append` nl
    where nl = singleton '\n'

words = L.filter (not . null) . splitWith isSpace
{-# INLINE words #-}

unwords = intercalate (singleton ' ')
{-# INLINE unwords #-}

-- ---------------------------------------------------------------------
-- Searching

isPrefixOf (CS a) (CS b) = B.isPrefixOf a b
isSuffixOf (CS a) (CS b) = B.isSuffixOf a b

isInfixOf p s
 | validSubstring (encoding p) = B.isInfixOf (unCS p) (unCS s)
 | otherwise                   = not $ L.null $ findSubstrings p s
{-# INLINE isInfixOf #-}

findSubstring = (listToMaybe .) . findSubstrings

-- NOTE: We can't just call the ByteString version, because a three byte
--       encoded char has a valid 1 byte encoded char as a substring.
--       TODO: copy the KMP algorithm from Data.ByteString here
findSubstrings pat@(CS (PS _ _ m)) = search 0
    where
        STRICT2(search)
        search i str@(CS (PS _ _ n))
            | n < m                = []
            | pat `isPrefixOf` str = i : search (i+1) (unsafeTail str)
            | otherwise            =     search (i+1) (unsafeTail str)

-- ---------------------------------------------------------------------
-- Indexing CompactString

index cs@(CS (PS _ _ l)) n
 | n < 0     = moduleError "index" ("negative index: " ++ show n)
 | idx == l  = moduleError "index" ("index too large: " ++ show n ++ ", length = " ++ show (length cs))
 | otherwise = snd $ unsafeWithBuffer cs $ peekChar (encoding cs) . (`plusPtr` idx)
 where idx = charIndex cs n
{-# INLINE index #-}

-- ---------------------------------------------------------------------
-- Element searching

elem    c = any (c==)
notElem c = not . elem c
{-# INLINE elem #-}
{-# INLINE notElem #-}

elemIndex    c = findIndex    (c==)
elemIndexEnd c = findIndexEnd (c==)
elemIndices  c = findIndices  (c==)
{-# INLINE elemIndex #-}


find f cs@(CS (PS _ _ l)) = unsafeWithBuffer cs $ \ptr -> go ptr (ptr `plusPtr` l)
    where
        STRICT2(go)
        go p q | p == q    = return Nothing
               | otherwise = do (l',c) <- peekChar (encoding cs) p
                                if f c then return (Just c) 
                                       else go (p `plusPtr` l') q
{-# INLINE find #-}

findIndex f cs@(CS (PS _ _ l)) = unsafeWithBuffer cs $ \ptr -> go 0 ptr (ptr `plusPtr` l)
    where
        STRICT3(go)
        go n p q | p == q    = return Nothing
                 | otherwise = do (l',c) <- peekChar (encoding cs) p
                                  if f c then return (Just n) 
                                         else go (n+1) (p `plusPtr` l') q
{-# INLINE findIndex #-}

findIndexEnd f cs@(CS (PS _ _ l)) = unsafeWithBufferEnd cs $ \ptr -> go 1 ptr (ptr `plusPtr` (-l))
    where
        STRICT3(go)
        go n p q | p == q    = return Nothing
                 | otherwise = do (l',c) <- peekCharRev (encoding cs) p
                                  if f c then return $! Just $! length cs - n
                                         else go (n+1) (p `plusPtr` (-l')) q

findIndices p cs = loop 0 cs
    where
        STRICT2(loop)
        loop n qs = case headView qs of
                     Nothing       -> []
                     Just (q',qs')
                      | p q'       -> n : loop (n+1) qs'
                      | otherwise  ->     loop (n+1) qs'


count c = foldl' (\l x -> if x == c then l + 1 else l) 0


-- ---------------------------------------------------------------------
-- Zipping

zip xxs yys = case (headView xxs, headView yys) of
    (Nothing,     _          ) -> []
    (_,           Nothing    ) -> []
    (Just (x,xs), Just (y,ys)) -> (x,y) : zip xs ys

zipWith f xxs yys = case (headView xxs, headView yys) of
    (Nothing,     _          ) -> []
    (_,           Nothing    ) -> []
    (Just (x,xs), Just (y,ys)) -> f x y : zipWith f xs ys
#if defined(__GLASGOW_HASKELL__)
{-# INLINE [1] zipWith #-}
#endif

zipWith' f a@(CS (PS _ _ l)) b@(CS (PS _ _ m)) = c
 where
  c = CS $ inlinePerformIO $
      withBuffer a $ \p1 ->
      withBuffer b $ \p2 ->
      B.createAndTrim (byteCount (encoding c) (charCount (encoding a) l `min` charCount (encoding b) m)) $
      zipWith_ p1 p2
  zipWith_ p1 p2 p3 = go 0 0 0
   where
    STRICT3(go)
    go i j k
       | i >= l || j >= m = return k
       | otherwise        = do
            (l1,c1) <- peekChar (encoding a) (p1 `plusPtr` i)
            (l2,c2) <- peekChar (encoding b) (p2 `plusPtr` j)
            l3      <- pokeChar (encoding c) (p3 `plusPtr` k) (f c1 c2)
            go (i+l1) (j+l2) (k+l3)
{-# INLINE zipWith' #-}

{-# RULES

"CompactString specialise zipWith"
 forall (f :: Char -> Char -> Char) p q .
    zipWith f p q = unpack (zipWith' f p q)

  #-}

unzip ls = (pack (L.map fst ls), pack (L.map snd ls))
{-# INLINE unzip #-}


-- ---------------------------------------------------------------------
-- Ordered CompactStrings

-- Implementation for lazy programmers
-- Maybe QuickSort would be appropriate here?
sort = pack . L.sort . unpack

-- | Compare two bytestrings, possibly with a different encoding.
compare' :: (Encoding a, Encoding b) => CompactString a -> CompactString b -> Ordering
compare' a@(CS (PS _ _ l1)) b@(CS (PS _ _ l2))
    = B.inlinePerformIO $
        withBuffer a $ \p1 ->
        withBuffer b $ \p2 -> comp p1 p2
  where
   comp p1 p2 = comp_ 0 0
    where
        STRICT2(comp_)
        comp_ pos1 pos2
         | pos1 >= l1  =  return $! if l1 < l2 then LT else EQ
         | pos2 >= l2  =  return $! GT
         | otherwise  =  do
                 (lc1,c1) <- peekChar (encoding a) (p1 `plusPtr` pos1)
                 (lc2,c2) <- peekChar (encoding b) (p2 `plusPtr` pos2)
                 if c1 /= c2
                   then return $! (c1 `compare` c2)
                   else comp_ (pos1 + lc1) (pos2 + lc2)

{-# SPECIALIZE doCompare :: CompactString UTF8 -> CompactString UTF8 -> Ordering #-}
{-# RULES
"CompactString: compare' UTF8"
  compare' = doCompare :: CompactString UTF8 -> CompactString UTF8 -> Ordering
"CompactString: compare' UTF32BE"
  compare' = doCompare :: CompactString UTF32BE -> CompactString UTF32BE -> Ordering
"CompactString: compare' ASCII"
  compare' = doCompare :: CompactString ASCII -> CompactString ASCII -> Ordering
"CompactString: compare' Latin1"
  compare' = doCompare :: CompactString Latin1 -> CompactString Latin1 -> Ordering
   #-}

-- -----------------------------------------------------------------------------
--
-- Encoding
--

-- for type inference
toByteStringAs :: Encoding a => a -> CompactString a -> ByteString
toByteStringAs _ = toByteString
unsafeFromByteStringAs :: Encoding a => a -> ByteString -> CompactString a
unsafeFromByteStringAs _ = unsafeFromByteString


toByteString = unCS

fromByteString   = validate   . unsafeFromByteString
fromByteString_  = validate_  . unsafeFromByteString
fromByteStringIO :: Encoding a => ByteString -> IO (CompactString a)
fromByteStringIO = validateIO . unsafeFromByteString

validate  = unsafeTry       . validateIO
validate_ = unsafePerformIO . validateIO

-- | Convert between two different encodings, fails if conversion is not possible.
recode :: (Encoding a, Encoding b, MonadPlus m) => CompactString a -> m (CompactString b)
recode  = unsafeTry       . recodeIO
-- | Convert between two different encodings, raises an error if conversion is not possible.
recode_ :: (Encoding a, Encoding b) => CompactString a -> CompactString b
recode_ = unsafePerformIO . recodeIO

-- | recode =<< validate
recodeV :: (Encoding a, Encoding b, MonadPlus m) => CompactString a -> m (CompactString b)
recodeV  = unsafeTry       . recodeVIO
-- | recode_ . validate_
recodeV_ :: (Encoding a, Encoding b) => CompactString a -> CompactString b
recodeV_ = unsafePerformIO . recodeVIO

encode  e = liftM (toByteStringAs e) . recode
encode_ e =       (toByteStringAs e) . recode_

decode  e = recodeV  . (unsafeFromByteStringAs e)
decode_ e = recodeV_ . (unsafeFromByteStringAs e)

encodeBOM  e = encode  e . cons '\xFEFF'
encodeBOM_ e = encode_ e . cons '\xFEFF'

decodeBOM  = unsafeTry       . decodeBOM_IO
decodeBOM_ = unsafePerformIO . decodeBOM_IO

{-# INLINE[1] validate   #-}
{-# INLINE[1] validate_  #-}
{-# INLINE[1] recode     #-}
{-# INLINE[1] recode_    #-}
{-# INLINE[1] recodeV    #-}
{-# INLINE[1] recodeV_   #-}
{-# INLINE[1] decodeBOM  #-}
{-# INLINE[1] decodeBOM_ #-}
{-# RULES

"CompactString: to/fromByteString"
  forall s.
  toByteString (unsafeFromByteString s) = s
"CompactString: from/toByteString"
  forall s.
  unsafeFromByteString (toByteString s) = s

"CompactString: recode  -> return"
  recode  = return
"CompactString: recode_ -> id"
  recode_ = id

"CompactString: recode/recode -> recode"
  forall s.
  recode s >>= recode = recode s
"CompactString: recode_/recode_ -> recode_"
  forall s.
  recode_ (recode_ s) = recode_ s

"CompactString: recode/validate"
  forall s.
  validate s >>= recode = recodeV s
"CompactString: recode_/validate_"
  forall s.
  recode_ (validate_ s) = recodeV_ s

"CompactString: recodeV  -> validate"
  recodeV   = validate
"CompactString: recodeV_ -> validate_"
  recodeV_  = validate_
"CompactString: recodeVIO -> validateIO"
  recodeVIO = validateIO
  
{-
-- TODO: Make these rules work
"CompactString: recode/fromByteString"
  forall bs.
  fromByteString bs >>= (recode :: (Encoding a, Encoding b, MonadPlus m) => CompactString a -> m (CompactString b))
  = decode (undefined::a) bs :: m (CompactString b)
"CompactString: recode_/fromByteString_"
  forall bs.
  recode_ (fromByteString_ bs :: Encoding a => CompactString a)       = decode_ (undefined::a) bs
-}

  #-}

decodeBOM_IO :: Encoding a => ByteString -> IO (CompactString a)
decodeBOM_IO bs
 | t2 == [0,0] && t4 == [0xFE,0xFF] = decodeIO (UTF32 BE) (B.drop 4 bs)
 | t2 == [0xFF,0xFE] && t4 == [0,0] = decodeIO (UTF32 LE) (B.drop 4 bs)
 | t2 == [0xFE,0xFF]                = decodeIO (UTF16 BE) (B.drop 2 bs)
 | t2 == [0xFF,0xFE]                = decodeIO (UTF16 LE) (B.drop 2 bs)
 | t3 == [0xEF,0xBB,0xBF]           = decodeIO UTF8       (B.drop 3 bs)
 | otherwise                        = decodeIO UTF8        bs -- no BOM
 where t2 = B.unpack (B.take 2 bs)
       t3 = B.unpack (B.take 3 bs)
       t4 = B.unpack (B.take 2 (B.drop 2 bs))
       decodeIO e = recodeVIO . (unsafeFromByteStringAs e)

-- | Validate encoding, convert to normal form
validateIO :: Encoding a => CompactString a -> IO (CompactString a)
validateIO cs@(CS (PS fp s l))
 | validEquality (encoding cs) = validateLength (encoding cs) l
                              >> withForeignPtr fp (\p -> check (p `plusPtr` s))
 | otherwise = recodeVIO_ cs -- There are multiple representations of the same string, convert to a normal form
 where
   check src = loop 0
     where
        STRICT1(loop)
        loop src_off
         | src_off == l = return cs
         | src_off >  l = failMessage "validate" "Incomplete character"
         | otherwise    = do (l',_) <- peekCharSafe (encoding cs) (l - src_off) (src `plusPtr` src_off)
                             loop (src_off+l')
{-# SPECIALIZE validateIO :: CompactString UTF8 -> IO (CompactString UTF8) #-}

-- | Convert between encodings
recodeIO :: (Encoding a, Encoding b) => CompactString a -> IO (CompactString b)
recodeIO a@(CS (PS fp s l))
 | l == 0    = return empty
 | otherwise = result
 where
   len    = byteCount (encoding_b) (charCount (encoding a) l)
   result = liftM CS $
               withForeignPtr fp $ \p -> 
               B.createAndTrim len $ doRecode (p `plusPtr` s)
   encoding_b = (undefined :: IO (CompactString a) -> Proxy a) result
   doRecode src dest = loop 0 0
     where
        STRICT2(loop)
        loop src_off dest_off
         | src_off >= l  =    return dest_off
         | otherwise     = do (l',c) <- peekChar (encoding a) (src  `plusPtr` src_off)
                              l''    <- pokeChar (encoding_b) (dest `plusPtr` dest_off) c
                              loop (src_off+l') (dest_off+l'')

-- | Validate encoding, convert to normal form
--   Can be rewritten by rules, in particular: recodeVIO -> validateIO
recodeVIO :: (Encoding a, Encoding b) => CompactString a -> IO (CompactString b)
recodeVIO = recodeVIO_
{-# INLINE[1] validateIO #-}

-- | Convert between encodings, use peekCharSafe
recodeVIO_ :: (Encoding a, Encoding b) => CompactString a -> IO (CompactString b)
recodeVIO_ a@(CS (PS fp s l))
 | l == 0    = return empty
 | otherwise = result
 where
   len    = byteCount (encoding_b) (charCount (encoding a) l)
   result = validateLength (encoding a) l
            >> (liftM CS $
                withForeignPtr fp $ \p -> 
                B.createAndTrim len $ doRecode (p `plusPtr` s))
   encoding_b = (undefined :: IO (CompactString a) -> Proxy a) result
   doRecode src dest = loop 0 0
     where
        STRICT2(loop)
        loop src_off dest_off
         | src_off >= l  =    return dest_off
         | otherwise     = do (l',c) <- peekCharSafe (encoding a) (l - src_off) (src  `plusPtr` src_off)
                              l''    <- pokeChar     (encoding_b)               (dest `plusPtr` dest_off) c
                              loop (src_off+l') (dest_off+l'')


-- ---------------------------------------------------------------------
-- Standard IO

getLine = hGetLine stdin

getContents = hGetContents stdin

putStr = hPut stdout

putStrLn = hPutStrLn stdout

interact transformer = putStr . transformer =<< getContents

-- ---------------------------------------------------------------------
-- File IO

readFile  f = C8.readFile f >>= fromByteStringIO
readFile' f = C8.readFile f >>= decodeBOM_IO

writeFile  f txt = C8.writeFile f (toByteString txt)
writeFile' f txt = C8.writeFile f (toByteString ('\xFEFF' `cons` txt))

appendFile  f txt = C8.appendFile f (toByteString txt)
appendFile' f txt = bracket (openFile f AppendMode) hClose
    (\h -> appendHandle h txt)

-- | Append a 'ByteString' to a file.
-- appendFile :: FilePath -> ByteString -> IO ()
-- TODO : Determine encoding used by the file, then append using the same encoding

appendHandle :: Encoding a => Handle -> CompactString a -> IO ()
appendHandle h cs = do
        pos <- hTell h
        hSeek h AbsoluteSeek 0
        enc <- findEncoding h
        bs <- enc cs
        hSeek h AbsoluteSeek pos
        B.hPut h bs

-- | Determine the encoding to use for a handle by examining the Byte Order Mark.
--   The handle should be positioned at the start of the file.
findEncoding :: Encoding a => Handle -> IO (CompactString a -> IO ByteString)
findEncoding h = do
        bs <- B.hGet h 4
        return (encodingOf bs)
 where encodingOf bs
        | B.null bs                        = return . toByteString . cons '\xFEFF' -- empty file, start with a BOM
        | t2 == [0,0] && t4 == [0xFE,0xFF] = encodeIO (UTF32 BE)
        | t2 == [0xFF,0xFE] && t4 == [0,0] = encodeIO (UTF32 LE)
        | t2 == [0xFE,0xFF]                = encodeIO (UTF16 BE)
        | t2 == [0xFF,0xFE]                = encodeIO (UTF16 LE)
        | otherwise                        = encodeIO UTF8 -- no BOM or UTF8 BOM
        where t2 = B.unpack (B.take 2 bs)
              t4 = B.unpack (B.take 2 (B.drop 2 bs))
              encodeIO e = liftM (toByteStringAs e) . recodeIO

-- ---------------------------------------------------------------------
-- Handle IO

hGetLine h = System.IO.hGetLine h >>= return . pack

hGetContents  h = B.hGetContents h >>= fromByteStringIO
hGetContents' h = B.hGetContents h >>= decodeBOM_IO

hGet            h i = B.hGet            h i >>= fromByteStringIO
hGetNonBlocking h i = B.hGetNonBlocking h i >>= fromByteStringIO

hPut h = B.hPut h . toByteString

hPutStr = hPut

hPutStrLn h cs@(CS bs)
    | B.length bs < 1024 = hPut h (cs `snoc` '\n')
    | otherwise          = hPut h cs >> hPut h (singleton '\n' `asTypeOf` cs) -- don't copy


-- ---------------------------------------------------------------------
-- Internal utilities

-- | Find the byte position corresponding to the given character index,
--   the index must be positive.
charIndex :: Encoding a => CompactString a -> Int -> Int
charIndex cs@(CS (PS _ _ l)) n = unsafeWithBuffer cs $ \src -> (go src n 0)
    where
        STRICT3(go)
        go _   0 p = return p
        go src i p
         | p >= l     = return l
         | otherwise  = do  l' <- peekCharLen (encoding cs) (src `plusPtr` p)
                            go src (i-1) (p+l')

-- | 'findIndexOrEnd' is a variant of findIndex, that returns the length
-- in bytes of the string if no element is found, rather than Nothing.
findIndexOrEnd :: Encoding a => (Char -> Bool) -> CompactString a -> Int
findIndexOrEnd f cs@(CS (PS _ _ l)) = unsafeWithBuffer cs $ \ptr -> go ptr 0
    where
        STRICT2(go)
        go ptr n | n >= l    = return l
                 | otherwise = do (l',c) <- peekChar (encoding cs) ptr
                                  if f c then return n
                                         else go (ptr `plusPtr` l') (n+l')
{-# INLINE findIndexOrEnd #-}

-- | 'findIndexOrBeginRev' is a variant of findIndexOrEnd, that searches
-- from the end instead of from the start
findIndexOrBeginRev :: Encoding a => (Char -> Bool) -> CompactString a -> Int
findIndexOrBeginRev f cs@(CS (PS _ _ l)) = unsafeWithBufferEnd cs $ \ptr -> go ptr l
    where
        STRICT2(go)
        go ptr n | n <= 0    = return 0
                 | otherwise = do (l',c) <- peekCharRev (encoding cs) ptr
                                  if f c then return n
                                         else go (ptr `plusPtr` (-l')) (n-l')
{-# INLINE findIndexOrBeginRev #-}