| 1 | % |
|---|
| 2 | % (c) The University of Glasgow 2006 |
|---|
| 3 | % (c) The University of Glasgow, 1997-2006 |
|---|
| 4 | % |
|---|
| 5 | |
|---|
| 6 | Buffers for scanning string input stored in external arrays. |
|---|
| 7 | |
|---|
| 8 | \begin{code} |
|---|
| 9 | {-# LANGUAGE BangPatterns #-} |
|---|
| 10 | {-# OPTIONS_GHC -O -funbox-strict-fields #-} |
|---|
| 11 | -- We always optimise this, otherwise performance of a non-optimised |
|---|
| 12 | -- compiler is severely affected |
|---|
| 13 | |
|---|
| 14 | module StringBuffer |
|---|
| 15 | ( |
|---|
| 16 | StringBuffer(..), |
|---|
| 17 | -- non-abstract for vs\/HaskellService |
|---|
| 18 | |
|---|
| 19 | -- * Creation\/destruction |
|---|
| 20 | hGetStringBuffer, |
|---|
| 21 | hGetStringBufferBlock, |
|---|
| 22 | appendStringBuffers, |
|---|
| 23 | stringToStringBuffer, |
|---|
| 24 | |
|---|
| 25 | -- * Inspection |
|---|
| 26 | nextChar, |
|---|
| 27 | currentChar, |
|---|
| 28 | prevChar, |
|---|
| 29 | atEnd, |
|---|
| 30 | |
|---|
| 31 | -- * Moving and comparison |
|---|
| 32 | stepOn, |
|---|
| 33 | offsetBytes, |
|---|
| 34 | byteDiff, |
|---|
| 35 | |
|---|
| 36 | -- * Conversion |
|---|
| 37 | lexemeToString, |
|---|
| 38 | lexemeToFastString, |
|---|
| 39 | |
|---|
| 40 | -- * Parsing integers |
|---|
| 41 | parseUnsignedInteger, |
|---|
| 42 | ) where |
|---|
| 43 | |
|---|
| 44 | #include "HsVersions.h" |
|---|
| 45 | |
|---|
| 46 | import Encoding |
|---|
| 47 | import FastString hiding ( buf ) |
|---|
| 48 | import FastTypes |
|---|
| 49 | import FastFunctions |
|---|
| 50 | |
|---|
| 51 | import System.IO ( hGetBuf, hFileSize,IOMode(ReadMode), hClose |
|---|
| 52 | , Handle, hTell, openBinaryFile ) |
|---|
| 53 | import System.IO.Unsafe ( unsafePerformIO ) |
|---|
| 54 | |
|---|
| 55 | import GHC.Exts |
|---|
| 56 | |
|---|
| 57 | #if __GLASGOW_HASKELL__ >= 701 |
|---|
| 58 | import Foreign.Safe |
|---|
| 59 | #else |
|---|
| 60 | import Foreign hiding ( unsafePerformIO ) |
|---|
| 61 | #endif |
|---|
| 62 | |
|---|
| 63 | -- ----------------------------------------------------------------------------- |
|---|
| 64 | -- The StringBuffer type |
|---|
| 65 | |
|---|
| 66 | -- |A StringBuffer is an internal pointer to a sized chunk of bytes. |
|---|
| 67 | -- The bytes are intended to be *immutable*. There are pure |
|---|
| 68 | -- operations to read the contents of a StringBuffer. |
|---|
| 69 | -- |
|---|
| 70 | -- A StringBuffer may have a finalizer, depending on how it was |
|---|
| 71 | -- obtained. |
|---|
| 72 | -- |
|---|
| 73 | data StringBuffer |
|---|
| 74 | = StringBuffer { |
|---|
| 75 | buf :: {-# UNPACK #-} !(ForeignPtr Word8), |
|---|
| 76 | len :: {-# UNPACK #-} !Int, -- length |
|---|
| 77 | cur :: {-# UNPACK #-} !Int -- current pos |
|---|
| 78 | } |
|---|
| 79 | -- The buffer is assumed to be UTF-8 encoded, and furthermore |
|---|
| 80 | -- we add three '\0' bytes to the end as sentinels so that the |
|---|
| 81 | -- decoder doesn't have to check for overflow at every single byte |
|---|
| 82 | -- of a multibyte sequence. |
|---|
| 83 | |
|---|
| 84 | instance Show StringBuffer where |
|---|
| 85 | showsPrec _ s = showString "<stringbuffer(" |
|---|
| 86 | . shows (len s) . showString "," . shows (cur s) |
|---|
| 87 | . showString ")>" |
|---|
| 88 | |
|---|
| 89 | -- ----------------------------------------------------------------------------- |
|---|
| 90 | -- Creation / Destruction |
|---|
| 91 | |
|---|
| 92 | hGetStringBuffer :: FilePath -> IO StringBuffer |
|---|
| 93 | hGetStringBuffer fname = do |
|---|
| 94 | h <- openBinaryFile fname ReadMode |
|---|
| 95 | size_i <- hFileSize h |
|---|
| 96 | let size = fromIntegral size_i |
|---|
| 97 | buf <- mallocForeignPtrArray (size+3) |
|---|
| 98 | withForeignPtr buf $ \ptr -> do |
|---|
| 99 | r <- if size == 0 then return 0 else hGetBuf h ptr size |
|---|
| 100 | hClose h |
|---|
| 101 | if (r /= size) |
|---|
| 102 | then ioError (userError "short read of file") |
|---|
| 103 | else newUTF8StringBuffer buf ptr size |
|---|
| 104 | |
|---|
| 105 | hGetStringBufferBlock :: Handle -> Int -> IO StringBuffer |
|---|
| 106 | hGetStringBufferBlock handle wanted |
|---|
| 107 | = do size_i <- hFileSize handle |
|---|
| 108 | offset_i <- hTell handle |
|---|
| 109 | let size = min wanted (fromIntegral $ size_i-offset_i) |
|---|
| 110 | buf <- mallocForeignPtrArray (size+3) |
|---|
| 111 | withForeignPtr buf $ \ptr -> |
|---|
| 112 | do r <- if size == 0 then return 0 else hGetBuf handle ptr size |
|---|
| 113 | if r /= size |
|---|
| 114 | then ioError (userError $ "short read of file: "++show(r,size,size_i,handle)) |
|---|
| 115 | else newUTF8StringBuffer buf ptr size |
|---|
| 116 | |
|---|
| 117 | newUTF8StringBuffer :: ForeignPtr Word8 -> Ptr Word8 -> Int -> IO StringBuffer |
|---|
| 118 | newUTF8StringBuffer buf ptr size = do |
|---|
| 119 | pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] |
|---|
| 120 | -- sentinels for UTF-8 decoding |
|---|
| 121 | let |
|---|
| 122 | sb0 = StringBuffer buf size 0 |
|---|
| 123 | (first_char, sb1) = nextChar sb0 |
|---|
| 124 | -- skip the byte-order mark if there is one (see #1744) |
|---|
| 125 | -- This is better than treating #FEFF as whitespace, |
|---|
| 126 | -- because that would mess up layout. We don't have a concept |
|---|
| 127 | -- of zero-width whitespace in Haskell: all whitespace codepoints |
|---|
| 128 | -- have a width of one column. |
|---|
| 129 | return (if first_char == '\xfeff' then sb1 else sb0) |
|---|
| 130 | |
|---|
| 131 | appendStringBuffers :: StringBuffer -> StringBuffer -> IO StringBuffer |
|---|
| 132 | appendStringBuffers sb1 sb2 |
|---|
| 133 | = do newBuf <- mallocForeignPtrArray (size+3) |
|---|
| 134 | withForeignPtr newBuf $ \ptr -> |
|---|
| 135 | withForeignPtr (buf sb1) $ \sb1Ptr -> |
|---|
| 136 | withForeignPtr (buf sb2) $ \sb2Ptr -> |
|---|
| 137 | do copyArray ptr (sb1Ptr `advancePtr` cur sb1) sb1_len |
|---|
| 138 | copyArray (ptr `advancePtr` sb1_len) (sb2Ptr `advancePtr` cur sb2) sb2_len |
|---|
| 139 | pokeArray (ptr `advancePtr` size) [0,0,0] |
|---|
| 140 | return (StringBuffer newBuf size 0) |
|---|
| 141 | where sb1_len = calcLen sb1 |
|---|
| 142 | sb2_len = calcLen sb2 |
|---|
| 143 | calcLen sb = len sb - cur sb |
|---|
| 144 | size = sb1_len + sb2_len |
|---|
| 145 | |
|---|
| 146 | stringToStringBuffer :: String -> StringBuffer |
|---|
| 147 | stringToStringBuffer str = |
|---|
| 148 | unsafePerformIO $ do |
|---|
| 149 | let size = utf8EncodedLength str |
|---|
| 150 | buf <- mallocForeignPtrArray (size+3) |
|---|
| 151 | withForeignPtr buf $ \ptr -> do |
|---|
| 152 | utf8EncodeString ptr str |
|---|
| 153 | pokeArray (ptr `plusPtr` size :: Ptr Word8) [0,0,0] |
|---|
| 154 | -- sentinels for UTF-8 decoding |
|---|
| 155 | return (StringBuffer buf size 0) |
|---|
| 156 | |
|---|
| 157 | -- ----------------------------------------------------------------------------- |
|---|
| 158 | -- Grab a character |
|---|
| 159 | |
|---|
| 160 | -- Getting our fingers dirty a little here, but this is performance-critical |
|---|
| 161 | {-# INLINE nextChar #-} |
|---|
| 162 | nextChar :: StringBuffer -> (Char,StringBuffer) |
|---|
| 163 | nextChar (StringBuffer buf len (I# cur#)) = |
|---|
| 164 | inlinePerformIO $ do |
|---|
| 165 | withForeignPtr buf $ \(Ptr a#) -> do |
|---|
| 166 | case utf8DecodeChar# (a# `plusAddr#` cur#) of |
|---|
| 167 | (# c#, b# #) -> |
|---|
| 168 | let cur' = I# (b# `minusAddr#` a#) in |
|---|
| 169 | return (C# c#, StringBuffer buf len cur') |
|---|
| 170 | |
|---|
| 171 | currentChar :: StringBuffer -> Char |
|---|
| 172 | currentChar = fst . nextChar |
|---|
| 173 | |
|---|
| 174 | prevChar :: StringBuffer -> Char -> Char |
|---|
| 175 | prevChar (StringBuffer _ _ 0) deflt = deflt |
|---|
| 176 | prevChar (StringBuffer buf _ cur) _ = |
|---|
| 177 | inlinePerformIO $ do |
|---|
| 178 | withForeignPtr buf $ \p -> do |
|---|
| 179 | p' <- utf8PrevChar (p `plusPtr` cur) |
|---|
| 180 | return (fst (utf8DecodeChar p')) |
|---|
| 181 | |
|---|
| 182 | -- ----------------------------------------------------------------------------- |
|---|
| 183 | -- Moving |
|---|
| 184 | |
|---|
| 185 | stepOn :: StringBuffer -> StringBuffer |
|---|
| 186 | stepOn s = snd (nextChar s) |
|---|
| 187 | |
|---|
| 188 | offsetBytes :: Int -> StringBuffer -> StringBuffer |
|---|
| 189 | offsetBytes i s = s { cur = cur s + i } |
|---|
| 190 | |
|---|
| 191 | byteDiff :: StringBuffer -> StringBuffer -> Int |
|---|
| 192 | byteDiff s1 s2 = cur s2 - cur s1 |
|---|
| 193 | |
|---|
| 194 | atEnd :: StringBuffer -> Bool |
|---|
| 195 | atEnd (StringBuffer _ l c) = l == c |
|---|
| 196 | |
|---|
| 197 | -- ----------------------------------------------------------------------------- |
|---|
| 198 | -- Conversion |
|---|
| 199 | |
|---|
| 200 | lexemeToString :: StringBuffer -> Int {-bytes-} -> String |
|---|
| 201 | lexemeToString _ 0 = "" |
|---|
| 202 | lexemeToString (StringBuffer buf _ cur) bytes = |
|---|
| 203 | inlinePerformIO $ |
|---|
| 204 | withForeignPtr buf $ \ptr -> |
|---|
| 205 | utf8DecodeString (ptr `plusPtr` cur) bytes |
|---|
| 206 | |
|---|
| 207 | lexemeToFastString :: StringBuffer -> Int {-bytes-} -> FastString |
|---|
| 208 | lexemeToFastString _ 0 = nilFS |
|---|
| 209 | lexemeToFastString (StringBuffer buf _ cur) len = |
|---|
| 210 | inlinePerformIO $ |
|---|
| 211 | withForeignPtr buf $ \ptr -> |
|---|
| 212 | return $! mkFastStringBytes (ptr `plusPtr` cur) len |
|---|
| 213 | |
|---|
| 214 | -- ----------------------------------------------------------------------------- |
|---|
| 215 | -- Parsing integer strings in various bases |
|---|
| 216 | {- |
|---|
| 217 | byteOff :: StringBuffer -> Int -> Char |
|---|
| 218 | byteOff (StringBuffer buf _ cur) i = |
|---|
| 219 | inlinePerformIO $ withForeignPtr buf $ \ptr -> do |
|---|
| 220 | -- return $! cBox (indexWord8OffFastPtrAsFastChar |
|---|
| 221 | -- (pUnbox ptr) (iUnbox (cur+i))) |
|---|
| 222 | --or |
|---|
| 223 | -- w <- peek (ptr `plusPtr` (cur+i)) |
|---|
| 224 | -- return (unsafeChr (fromIntegral (w::Word8))) |
|---|
| 225 | -} |
|---|
| 226 | -- | XXX assumes ASCII digits only (by using byteOff) |
|---|
| 227 | parseUnsignedInteger :: StringBuffer -> Int -> Integer -> (Char->Int) -> Integer |
|---|
| 228 | parseUnsignedInteger (StringBuffer buf _ cur) len radix char_to_int |
|---|
| 229 | = inlinePerformIO $ withForeignPtr buf $ \ptr -> return $! let |
|---|
| 230 | --LOL, in implementations where the indexing needs slow unsafePerformIO, |
|---|
| 231 | --this is less (not more) efficient than using the IO monad explicitly |
|---|
| 232 | --here. |
|---|
| 233 | !ptr' = pUnbox ptr |
|---|
| 234 | byteOff i = cBox (indexWord8OffFastPtrAsFastChar ptr' (iUnbox (cur + i))) |
|---|
| 235 | go i x | i == len = x |
|---|
| 236 | | otherwise = case byteOff i of |
|---|
| 237 | char -> go (i + 1) (x * radix + toInteger (char_to_int char)) |
|---|
| 238 | in go 0 0 |
|---|
| 239 | |
|---|
| 240 | \end{code} |
|---|