Haskell Code by HsColour

{-# LANGUAGE BangPatterns, ForeignFunctionInterface, CPP #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  ByteStringUtils
-- Copyright   :  (c) The University of Glasgow 2001,
--                    David Roundy 2003-2005
-- License : GPL (I'm happy to also license this file BSD style but don't
--           want to bother distributing two license files with darcs.
--
-- Maintainer  :  droundy@abridgegame.org
-- Stability   :  experimental
-- Portability :  portable
--
-- GZIp and MMap IO for ByteStrings, and miscellaneous functions for Data.ByteString
--

module ByteStringUtils (

        unsafeWithInternals,
        unpackPSfromUTF8,

        -- IO with mmap or gzip
        gzReadFilePS,
        mmapFilePS,
        gzWriteFilePS,
        gzWriteFilePSs,

        -- list utilities
        ifHeadThenTail,
        dropSpace,
        breakSpace,
        linesPS,
        unlinesPS,
        hashPS,
        breakFirstPS,
        breakLastPS,
        substrPS,
        readIntPS,
        is_funky,
        fromHex2PS,
        fromPS2Hex,
        betweenLinesPS,
        break_after_nth_newline,
        break_before_nth_newline,
        intercalate
    ) where

import Autoconf                 ( use_mmap )

import qualified Data.ByteString            as B
import qualified Data.ByteString.Char8      as BC
#if __GLASGOW_HASKELL__ > 606
import qualified Data.ByteString.Internal   as BI
import Data.ByteString (intercalate, uncons)
import Data.ByteString.Internal (fromForeignPtr)
#else
import qualified Data.ByteString.Base     as BI
#endif

import System.IO
import System.IO.Unsafe         ( unsafePerformIO )

#if __GLASGOW_HASKELL__ > 606
import Foreign.Storable         ( peekElemOff, peek )
#else
import Foreign.Storable         ( peekElemOff, peek, peekByteOff )
import Data.List                ( intersperse )
#endif
import Foreign.Marshal.Alloc    ( free )
import Foreign.Marshal.Array    ( mallocArray, peekArray, advancePtr )
import Foreign.C.Types          ( CInt, CSize )

import Data.Bits                ( rotateL )
import Data.Char                ( chr, ord, isSpace )
import Data.Word                ( Word8 )
import Data.Int                 ( Int32 )
import Control.Monad            ( when )

import Foreign.Ptr              ( nullPtr, plusPtr, Ptr )
import Foreign.ForeignPtr       ( ForeignPtr, withForeignPtr )

#if defined(__GLASGOW_HASKELL__)
import qualified Foreign.Concurrent as FC ( newForeignPtr )
import System.Posix             ( handleToFd )
#endif

#ifdef DEBUG_PS
import Foreign.ForeignPtr       ( addForeignPtrFinalizer )
import Foreign.Ptr              ( FunPtr )
#endif

#if HAVE_HASKELL_ZLIB
import qualified Data.ByteString.Lazy as BL
import qualified Codec.Compression.GZip as GZ
#else
import Foreign.C.String ( CString, withCString )
#endif

-- -----------------------------------------------------------------------------
-- obsolete debugging code

debugForeignPtr :: ForeignPtr a -> String -> IO ()
#ifdef DEBUG_PS
foreign import ccall unsafe "static fpstring.h debug_alloc" debug_alloc
    :: Ptr a -> CString -> IO ()
foreign import ccall unsafe "static fpstring.h & debug_free" debug_free
    :: FunPtr (Ptr a -> IO ())
debugForeignPtr fp n =
    withCString n $ \cname-> withForeignPtr fp $ \p->
    do debug_alloc p cname
       addForeignPtrFinalizer debug_free fp
#else
debugForeignPtr _ _ = return ()
#endif

-- -----------------------------------------------------------------------------
-- unsafeWithInternals

-- | Do something with the internals of a PackedString. Beware of
-- altering the contents!
unsafeWithInternals :: B.ByteString -> (Ptr Word8 -> Int -> IO a) -> IO a
unsafeWithInternals ps f
 = case BI.toForeignPtr ps of
   (fp,s,l) -> withForeignPtr fp $ \p -> f (p `plusPtr` s) l

-- | readIntPS skips any whitespace at the beginning of its argument, and
-- reads an Int from the beginning of the PackedString.  If there is no
-- integer at the beginning of the string, it returns Nothing, otherwise it
-- just returns the int read, along with a B.ByteString containing the
-- remainder of its input.

readIntPS :: B.ByteString -> Maybe (Int, B.ByteString)
readIntPS = BC.readInt . BC.dropWhile isSpace

-- -----------------------------------------------------------------------------
-- Destructor functions (taking PackedStrings apart)

unpackPSfromUTF8 :: B.ByteString -> String
unpackPSfromUTF8 ps =
 case BI.toForeignPtr ps of
   (_,_, 0) -> ""
   (x,s,l)  ->
    unsafePerformIO $ withForeignPtr x $ \p->
    do outbuf <- mallocArray l
       lout <- fromIntegral `fmap`
               utf8_to_ints outbuf (p `plusPtr` s) (fromIntegral l)
       when (lout < 0) $ error "Bad UTF8!"
       str <- (map (chr . fromIntegral)) `fmap` peekArray lout outbuf
       free outbuf
       return str

foreign import ccall unsafe "static fpstring.h utf8_to_ints" utf8_to_ints
    :: Ptr Int -> Ptr Word8 -> CInt -> IO CInt

-- -----------------------------------------------------------------------------
-- List-mimicking functions for PackedStrings

{-# INLINE ifHeadThenTail #-}
ifHeadThenTail :: Word8 -> B.ByteString -> Maybe B.ByteString
ifHeadThenTail c s = case uncons s of
    Just (w, t) | w == c    -> Just t
    _                       -> Nothing

#if __GLASGOW_HASKELL__ <= 606
-- | /O(1)/ Extract the head and tail of a ByteString, returning Nothing
-- if it is empty.
uncons :: B.ByteString -> Maybe (Word8, B.ByteString)
uncons (BI.PS x s l)
    | l <= 0    = Nothing
    | otherwise = Just (BI.inlinePerformIO $ withForeignPtr x
                                        $ \p -> peekByteOff p s,
                        BI.PS x (s+1) (l-1))
{-# INLINE uncons #-}
-- | /O(1)/ Build a ByteString from a ForeignPtr
fromForeignPtr :: ForeignPtr Word8
               -> Int -- ^ Offset
               -> Int -- ^ Length
               -> B.ByteString
fromForeignPtr fp s l = BI.PS fp s l
{-# INLINE fromForeignPtr #-}
-- | /O(n)/ The 'intercalate' function takes a 'ByteString' and a list of
-- 'ByteString's and concatenates the list after interspersing the first
-- argument between each element of the list.
intercalate :: B.ByteString -> [B.ByteString] -> B.ByteString
intercalate s = B.concat . (intersperse s)
{-# INLINE [1] intercalate #-}
#endif


------------------------------------------------------------------------
-- A reimplementation of Data.ByteString.Char8.dropSpace, but
-- specialised to darcs' need for a 4 way isspace.
--
-- TODO: if it is safe to use the expanded definition of isSpaceWord8
-- provided by Data.ByteString.Char8, then all this can go.

-- A locale-independent isspace(3) so patches are interpreted the same everywhere.
-- ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\r')
isSpaceWord8 :: Word8 -> Bool
isSpaceWord8 w =
    w == 0x20 ||    -- ' '
    w == 0x09 ||    -- '\t'
    w == 0x0A ||    -- '\n'
    w == 0x0D       -- '\r'
{-# INLINE isSpaceWord8 #-}

firstnonspace :: Ptr Word8 -> Int -> Int -> IO Int
firstnonspace !ptr !n !m
    | n >= m    = return n
    | otherwise = do w <- peekElemOff ptr n
                     if isSpaceWord8 w then firstnonspace ptr (n+1) m else return n

firstspace :: Ptr Word8 -> Int -> Int -> IO Int
firstspace !ptr !n !m
    | n >= m    = return n
    | otherwise = do w <- peekElemOff ptr n
                     if (not . isSpaceWord8) w then firstspace ptr (n+1) m else return n

-- | 'dropSpace' efficiently returns the 'ByteString' argument with
-- white space Chars removed from the front. It is more efficient than
-- calling dropWhile for removing whitespace. I.e.
-- 
-- > dropWhile isSpace == dropSpace
--
dropSpace :: B.ByteString -> B.ByteString
dropSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do
    i <- firstnonspace (p `plusPtr` s) 0 l
    return $! if i == l then B.empty else BI.PS x (s+i) (l-i)
{-# INLINE dropSpace #-}

-- | 'breakSpace' returns the pair of ByteStrings when the argument is
-- broken at the first whitespace byte. I.e.
-- 
-- > break isSpace == breakSpace
--
breakSpace :: B.ByteString -> (B.ByteString,B.ByteString)
breakSpace (BI.PS x s l) = BI.inlinePerformIO $ withForeignPtr x $ \p -> do
    i <- firstspace (p `plusPtr` s) 0 l
    return $! case () of {_
        | i == 0    -> (B.empty, BI.PS x s l)
        | i == l    -> (BI.PS x s l, B.empty)
        | otherwise -> (BI.PS x s i, BI.PS x (s+i) (l-i))
    }
{-# INLINE breakSpace #-}

------------------------------------------------------------------------

{-# INLINE is_funky #-}
is_funky :: B.ByteString -> Bool
is_funky ps = case BI.toForeignPtr ps of
   (x,s,l) ->
    unsafePerformIO $ withForeignPtr x $ \p->
    (/=0) `fmap` has_funky_char (p `plusPtr` s) (fromIntegral l)

foreign import ccall unsafe "fpstring.h has_funky_char" has_funky_char
    :: Ptr Word8 -> CInt -> IO CInt

------------------------------------------------------------------------

-- ByteString rewrites break (=='x') to breakByte 'x'
--  break ((==) x) = breakChar x
--  break (==x) = breakChar x
--

{-
{-# INLINE breakOnPS #-}
breakOnPS :: Char -> B.ByteString -> (B.ByteString, B.ByteString)
breakOnPS c p = case BC.elemIndex c p of
                Nothing -> (p, BC.empty)
                Just n  -> (B.take n p, B.drop n p)
-}

{-# INLINE hashPS #-}
hashPS :: B.ByteString -> Int32
hashPS ps =
   case BI.toForeignPtr ps of
   (x,s,l) ->
    unsafePerformIO $ withForeignPtr x $ \p->
    do hash (p `plusPtr` s) l

hash :: Ptr Word8 -> Int -> IO Int32
hash ptr len = f (0 :: Int32) ptr len
 where f h _ 0 = return h
       f h p n = do x <- peek p
                    let !h' =  (fromIntegral x) + (rotateL h 8)
                    f h' (p `advancePtr` 1) (n-1)

{-# INLINE substrPS #-}
substrPS :: B.ByteString -> B.ByteString -> Maybe Int
substrPS tok str
    | B.null tok = Just 0
    | B.length tok > B.length str = Nothing
    | otherwise = do n <- BC.elemIndex (BC.head tok) str
                     let ttok = B.tail tok
                         reststr = B.drop (n+1) str
                     if ttok == B.take (B.length ttok) reststr
                        then Just n
                        else ((n+1)+) `fmap` substrPS tok reststr

------------------------------------------------------------------------

-- TODO: replace breakFirstPS and breakLastPS with definitions based on
-- ByteString's break/breakEnd
{-# INLINE breakFirstPS #-}
breakFirstPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
breakFirstPS c p = case BC.elemIndex c p of
                   Nothing -> Nothing
                   Just n -> Just (B.take n p, B.drop (n+1) p)

{-# INLINE breakLastPS #-}
breakLastPS :: Char -> B.ByteString -> Maybe (B.ByteString,B.ByteString)
breakLastPS c p = case BC.elemIndexEnd c p of
                  Nothing -> Nothing
                  Just n -> Just (B.take n p, B.drop (n+1) p)

-- TODO: rename
{-# INLINE linesPS #-}
linesPS :: B.ByteString -> [B.ByteString]
linesPS ps
     | B.null ps = [B.empty]
     | otherwise = BC.split '\n' ps

{- QuickCheck property:

import Test.QuickCheck
import qualified Data.ByteString.Char8 as BC
import Data.Char
instance Arbitrary BC.ByteString where
    arbitrary = fmap BC.pack arbitrary
instance Arbitrary Char where
  arbitrary = chr `fmap` choose (32,127)
deepCheck = check (defaultConfig { configMaxTest = 10000})
testLines =  deepCheck (\x -> (linesPS x == linesPSOld x))
linesPSOld ps = case  BC.elemIndex '\n' ps of
             Nothing -> [ps]
             Just n -> B.take n ps : linesPS (B.drop (n+1) ps) -}

{-| This function acts exactly like the "Prelude" unlines function, or like
"Data.ByteString.Char8" 'unlines', but with one important difference: it will
produce a string which may not end with a newline! That is:

> unlinesPS ["foo", "bar"]

evaluates to \"foo\\nbar\", not \"foo\\nbar\\n\"! This point should hold true for
'linesPS' as well.

TODO: rename this function. -}
unlinesPS :: [B.ByteString] -> B.ByteString
unlinesPS [] = BC.empty
unlinesPS x  = BC.init $ BC.unlines x
{-# INLINE unlinesPS #-}
{- QuickCheck property:

testUnlines = deepCheck (\x -> (unlinesPS x == unlinesPSOld x))
unlinesPSOld ss = BC.concat $ intersperse_newlines ss
    where intersperse_newlines (a:b:s) = a : newline : intersperse_newlines (b:s)
          intersperse_newlines s = s
          newline = BC.pack "\n" -}

-- -----------------------------------------------------------------------------
-- gzReadFilePS

-- | Read an entire file, which may or may not be gzip compressed, directly
-- into a 'B.ByteString'.

#ifndef HAVE_HASKELL_ZLIB
foreign import ccall unsafe "static zlib.h gzopen" c_gzopen
    :: CString -> CString -> IO (Ptr ())
foreign import ccall unsafe "static zlib.h gzclose" c_gzclose
    :: Ptr () -> IO ()
foreign import ccall unsafe "static zlib.h gzread" c_gzread
    :: Ptr () -> Ptr Word8 -> CInt -> IO CInt
foreign import ccall unsafe "static zlib.h gzwrite" c_gzwrite
    :: Ptr () -> Ptr Word8 -> CInt -> IO CInt
#endif

gzReadFilePS :: FilePath -> IO B.ByteString
gzReadFilePS f = do
    h <- openBinaryFile f ReadMode
    header <- B.hGet h 2
    if header /= BC.pack "\31\139"
       then do hClose h
               mmapFilePS f
       else do hSeek h SeekFromEnd (-4)
               len <- hGetLittleEndInt h
               hClose h
#ifdef HAVE_HASKELL_ZLIB
               -- Passing the length to GZ.decompressWith means
               -- that BL.toChunks only produces one chunk, which in turn
               -- means that B.concat won't need to copy data.
               -- If the length is wrong this will just affect efficiency, not correctness
               let decompress = GZ.decompressWith GZ.defaultDecompressParams {
                                  GZ.decompressBufferSize = len
                                }
               fmap (B.concat . BL.toChunks . decompress) $
#ifdef HAVE_OLD_BYTESTRING
                        -- bytestring < 0.9.1 had a bug where it did not know to close handles upon EOF
                        -- performance would be better with a newer bytestring and lazy
                        -- readFile below -- ratify readFile: comment
                        fmap (BL.fromChunks . (:[])) $
                        B.readFile f  -- ratify readFile: immediately consumed
#else
                        BL.readFile f -- ratify readFile: immediately consumed by the conversion to a strict bytestring
#endif
#else
               withCString f $ \fstr-> withCString "rb" $ \rb-> do
                 gzf <- c_gzopen fstr rb
                 when (gzf == nullPtr) $ fail $ "problem opening file "++f
                 fp <- BI.mallocByteString len
                 debugForeignPtr fp $ "gzReadFilePS "++f
                 lread <- withForeignPtr fp $ \p ->
                          c_gzread gzf p (fromIntegral len)
                 c_gzclose gzf
                 when (fromIntegral lread /= len) $
                      fail $ "problem gzreading file "++f
                 return $ fromForeignPtr fp 0 len
#endif

hGetLittleEndInt :: Handle -> IO Int
hGetLittleEndInt h = do
    b1 <- ord `fmap` hGetChar h
    b2 <- ord `fmap` hGetChar h
    b3 <- ord `fmap` hGetChar h
    b4 <- ord `fmap` hGetChar h
    return $ b1 + 256*b2 + 65536*b3 + 16777216*b4

gzWriteFilePS :: FilePath -> B.ByteString -> IO ()
gzWriteFilePS f ps = gzWriteFilePSs f [ps]

gzWriteFilePSs :: FilePath -> [B.ByteString] -> IO ()
gzWriteFilePSs f pss  =
#ifdef HAVE_HASKELL_ZLIB
    BL.writeFile f $ GZ.compress $ BL.fromChunks pss
#else
    withCString f $ \fstr -> withCString "wb" $ \wb -> do
    gzf <- c_gzopen fstr wb
    when (gzf == nullPtr) $ fail $ "problem gzopening file for write: "++f
    mapM_ (gzWriteToGzf gzf) pss `catch`
              \_ -> fail $ "problem gzwriting file: "++f
    c_gzclose gzf

gzWriteToGzf :: Ptr () -> B.ByteString -> IO ()
gzWriteToGzf gzf ps = case BI.toForeignPtr ps of
 (_,_,0) -> return () -- avoid calling gzwrite with 0 length this would
                      -- trouble on some versions of zlib, and is always
                      -- unnecessary.
 (x,s,l) -> do
    lw <- withForeignPtr x $ \p -> c_gzwrite gzf (p `plusPtr` s)
                                                 (fromIntegral l)
    when (fromIntegral lw /= l) $ fail $ "problem in gzWriteToGzf"
#endif

-- -----------------------------------------------------------------------------
-- mmapFilePS

-- | Like readFilePS, this reads an entire file directly into a
-- 'B.ByteString', but it is even more efficient.  It involves directly
-- mapping the file to memory.  This has the advantage that the contents of
-- the file never need to be copied.  Also, under memory pressure the page
-- may simply be discarded, wile in the case of readFilePS it would need to
-- be written to swap.  If you read many small files, mmapFilePS will be
-- less memory-efficient than readFilePS, since each mmapFilePS takes up a
-- separate page of memory.  Also, you can run into bus errors if the file
-- is modified.  NOTE: as with 'readFilePS', the string representation in
-- the file is assumed to be ISO-8859-1.

mmapFilePS :: FilePath -> IO B.ByteString
mmapFilePS f = if use_mmap
               then do (fp,l) <- mmap f
                       return $ fromForeignPtr fp 0 l
               else B.readFile f

#if defined(__GLASGOW_HASKELL__)
foreign import ccall unsafe "static fpstring.h my_mmap" my_mmap
    :: CSize -> CInt -> IO (Ptr Word8)
foreign import ccall unsafe "static sys/mman.h munmap" c_munmap
    :: Ptr Word8 -> CSize -> IO CInt
foreign import ccall unsafe "static unistd.h close" c_close
    :: CInt -> IO CInt
#endif

mmap :: FilePath -> IO (ForeignPtr Word8, Int)
mmap f = do
    h <- openBinaryFile f ReadMode
    l <- fromIntegral `fmap` hFileSize h
    -- Don't bother mmaping small files because each mmapped file takes up
    -- at least one full VM block.
    if l < mmap_limit
       then do thefp <- BI.mallocByteString l
               debugForeignPtr thefp $ "mmap short file "++f
               withForeignPtr thefp $ \p-> hGetBuf h p l
               hClose h
               return (thefp, l)
       else do
#if defined(__GLASGOW_HASKELL__)
               fd <- fromIntegral `fmap` handleToFd h
               p <- my_mmap (fromIntegral l) fd
               fp <- if p == nullPtr
                     then
#else
               fp <-
#endif
                          do thefp <- BI.mallocByteString l
                             debugForeignPtr thefp $ "mmap short file "++f
                             withForeignPtr thefp $ \p' -> hGetBuf h p' l
                             return thefp
#if defined(__GLASGOW_HASKELL__)
                     else do
                             fp <- FC.newForeignPtr p
                                   (do {c_munmap p $ fromIntegral l;
                                        return (); })
                             debugForeignPtr fp $ "mmap "++f
                             return fp
               c_close fd
#endif
               hClose h
               return (fp, l)
    where mmap_limit = 16*1024


-- -------------------------------------------------------------------------
-- fromPS2Hex

foreign import ccall unsafe "static fpstring.h conv_to_hex" conv_to_hex
    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()

fromPS2Hex :: B.ByteString -> B.ByteString
fromPS2Hex ps = case BI.toForeignPtr ps of
          (x,s,l) ->
           BI.unsafeCreate (2*l) $ \p -> withForeignPtr x $ \f ->
           conv_to_hex p (f `plusPtr` s) $ fromIntegral l

-- -------------------------------------------------------------------------
-- fromHex2PS

foreign import ccall unsafe "static fpstring.h conv_from_hex" conv_from_hex
    :: Ptr Word8 -> Ptr Word8 -> CInt -> IO ()

fromHex2PS :: B.ByteString -> B.ByteString
fromHex2PS ps = case BI.toForeignPtr ps of
          (x,s,l) ->
           BI.unsafeCreate (l `div` 2) $ \p -> withForeignPtr x $ \f ->
           conv_from_hex p (f `plusPtr` s) (fromIntegral $ l `div` 2)

-- -------------------------------------------------------------------------
-- betweenLinesPS

-- | betweenLinesPS returns the B.ByteString between the two lines given,
-- or Nothing if they do not appear.

betweenLinesPS :: B.ByteString -> B.ByteString -> B.ByteString
               -> Maybe (B.ByteString)
betweenLinesPS start end ps
 = case break (start ==) (linesPS ps) of
       (_, _:rest@(bs1:_)) ->
           case BI.toForeignPtr bs1 of
            (ps1,s1,_) ->
             case break (end ==) rest of
               (_, bs2:_) -> case BI.toForeignPtr bs2 of (_,s2,_) -> Just $ fromForeignPtr ps1 s1 (s2 - s1)
               _ -> Nothing
       _ -> Nothing

-- -------------------------------------------------------------------------
-- break_after_nth_newline

break_after_nth_newline :: Int -> B.ByteString
                        -> Maybe (B.ByteString, B.ByteString)
break_after_nth_newline 0 the_ps | B.null the_ps = Just (B.empty, B.empty)
break_after_nth_newline n the_ps =
  case BI.toForeignPtr the_ps of
  (fp,the_s,l) ->
   unsafePerformIO $ withForeignPtr fp $ \p ->
   do let findit 0 s | s == end = return $ Just (the_ps, B.empty)
          findit _ s | s == end = return Nothing
          findit 0 s = let left_l = s - the_s
                       in return $ Just (fromForeignPtr fp the_s left_l,
                                         fromForeignPtr fp s (l - left_l))
          findit i s = do w <- peekElemOff p s
                          if w == nl then findit (i-1) (s+1)
                                     else findit i (s+1)
          nl = BI.c2w '\n'
          end = the_s + l
      findit n the_s

-- -------------------------------------------------------------------------
-- break_before_nth_newline

break_before_nth_newline :: Int -> B.ByteString -> (B.ByteString, B.ByteString)
break_before_nth_newline 0 the_ps
 | B.null the_ps = (B.empty, B.empty)
break_before_nth_newline n the_ps =
 case BI.toForeignPtr the_ps of
 (fp,the_s,l) ->
   unsafePerformIO $ withForeignPtr fp $ \p ->
   do let findit _ s | s == end = return (the_ps, B.empty)
          findit i s = do w <- peekElemOff p s
                          if w == nl
                            then if i == 0
                                 then let left_l = s - the_s
                                      in return (fromForeignPtr fp the_s left_l,
                                                 fromForeignPtr fp s (l - left_l))
                                 else findit (i-1) (s+1)
                            else findit i (s+1)
          nl = BI.c2w '\n'
          end = the_s + l
      findit n the_s