{-# LANGUAGE ScopedTypeVariables #-} -- | Functions to represent a 'Vector' on disk in efficient, if -- unportable, ways. -- -- This module uses memory-mapping, a feature of all modern -- operating-systems, to mirror the disk contents in memory. There are -- quite a few advantages to memory-mapping files instead of reading -- the files traditionally: -- -- * Speed: memory-mapping is often much faster than traditional -- reading. -- -- * Memory efficiency: Memory-mapped files are loaded into RAM -- on-demand, and easily swapped out. The upside is that the -- program can work with data-sets larger than the available RAM, -- as long as they are accessed carefully. -- -- The caveat to using memory-mapping is that it makes the files -- specific to the current architecture because of the endianness of -- the data. For more information, see the description in -- "System.IO.MMap" -- -- If you wish to write the contents in a portable fashion, either use -- the ASCII load and save functions in "Numeric.Container", or use -- the binary serialization in "Data.Binary". module Data.Packed.Vector.MMap ( -- * Memory-mapping 'Vector' from disk unsafeMMapVector, unsafeLazyMMapVectors, -- * Writing 'Vector' to disk -- | These functions write the 'Vector' in a way suitable for -- reading back with 'unsafeMMapVector'. hPutVector, writeVector ) where import Control.Monad (when) import System.IO import System.IO.MMap import System.IO.Unsafe import Foreign.ForeignPtr import Foreign.Ptr import Foreign.Storable import qualified Data.Packed.Development as I import qualified Data.Packed.Vector as I import Data.Int --------------------------- -- Memory-Mapping 'Vector' from disk -- | Map a file into memory (read-only) as a 'Vector'. -- -- It is considered unsafe because changes to the underlying file may -- (or may not) be reflected in the 'Vector', which breaks referential -- transparency. unsafeMMapVector :: forall a. Storable a => FilePath -- ^ Path of the file to map -> Maybe (Int64, Int) -- ^ 'Nothing' to map entire file into memory, otherwise 'Just (fileOffset, elementCount)' -> IO (I.Vector a) unsafeMMapVector path range = do (foreignPtr, offset, size) <- mmapFileForeignPtr path ReadOnly $ case range of Nothing -> Nothing Just (start, length) -> Just (start, length * sizeOf (undefined :: a)) return $ I.unsafeFromForeignPtr foreignPtr offset (size `div` sizeOf (undefined :: a)) -- | Map a file into memory as a lazy-list of equal-sized 'Vector', -- even if they can't all fit in the address space at the same time. -- -- > (numVectors,vectors) <- unsafeLazyMMapVectors filename Nothing vectorSize -- -- Commonly, a data file will contain multiple vectors of equal length -- (matrix). This function is convenient for those uses, but it plays -- a more important role: supporting data-sets that cannot fit in the -- address space of the current machine. -- -- On 32-bit machines the address space is only 4GB, and it is -- actually pretty easy to find data-sets that are too large to be -- represented, even in virtual memory. -- -- This function loads the data in chunks, and as long as you drop -- your reference to the vectors as you consume the data, the old -- chunks will be unmapped before mapping the next chunk. -- -- The number of vectors in the list is returned because it's often -- needed, yet calculating it using 'length' would demand the whole -- list. unsafeLazyMMapVectors :: forall a. Storable a => FilePath -- ^ Path of the file to map -> Maybe (Int64, Int64) -- ^ 'Nothing' to map entire file into memory, -- otherwise @'Just' (fileOffset, totalElementCount)@ -> Int -- ^ The number of elements in each 'Vector' -> IO (Int64,[I.Vector a]) -- ^ Return @(numberOfVectors,vectors)@ unsafeLazyMMapVectors path range vsize = do when (vecSize > maxChunkSize) vecTooBigError filesize <- withFile path ReadMode hFileSize let filesize' :: Int64 filesize' = fI filesize imgs <- unsafeInterleaveIO $ unsafeLazyMMapVectors' filesize' path range vsize return (nimages range filesize', imgs) where nimages :: Maybe (Int64, Int64) -> Int64 -> Int64 nimages Nothing fsz = fsz `div` imageSize nimages (Just (_,sz)) _ = sz `div` imageSize imageSize = fI vsize * eltSize eltSize = fI (sizeOf (undefined :: a)) vecSize = fI vsize * eltSize vecTooBigError = fail "The requested vector size can't be mapped into memory" unsafeLazyMMapVectors' :: forall a. Storable a => Int64 -> FilePath -> Maybe (Int64, Int64) -> Int -> IO [I.Vector a] unsafeLazyMMapVectors' fileSize fileName fileRange numEltsPerVec | mapSize < maxChunkSize = mmapAll | otherwise = mmapChunks 0 where mapSize, eltSize, vecSize, chunkSize, baseOffset :: Int64 eltSize = fI $ sizeOf (undefined :: a) (baseOffset,mapSize) = case fileRange of Just (off,nelts) -> (off,nelts*eltSize) _ -> (0,fileSize) vecSize = fI numEltsPerVec * eltSize chunkSize = (maxChunkSize `div` vecSize) * vecSize fileRange' = do (offset, nelts) <- fileRange return (offset, fI nelts) splitVecs :: I.Vector a -> [I.Vector a] splitVecs bigVec = let nvecs = I.dim bigVec `div` numEltsPerVec in I.takesV (replicate nvecs numEltsPerVec) bigVec mmapAll :: IO [I.Vector a] mmapAll = do allVecs <- unsafeMMapVector fileName fileRange' return $ splitVecs allVecs mmapChunks :: Int64 -> IO [I.Vector a] mmapChunks offs | remaining <= 0 = return [] | otherwise = do chunk <- unsafeMMapVector fileName mmapRange rest <- unsafeInterleaveIO $ mmapChunks (offs+chunkSize') return $ splitVecs chunk ++ rest where mmapRange = Just (baseOffset+offs,fI (chunkSize' `div` eltSize)) remaining = mapSize-offs chunkSize' = min chunkSize remaining -- Maximum size for chunks maxChunkSize :: Int64 maxChunkSize = fI (maxBound `div` 256 :: Int) -- Handy alias for 'fromIntegral' fI :: (Integral a, Num b) => a -> b fI = fromIntegral --------------------------- -- Writing 'Vector' to disk -- | Write out a vector verbatim into an open file handle. hPutVector :: forall a. Storable a => Handle -> I.Vector a -> IO () hPutVector h v = withForeignPtr fp $ \p -> hPutBuf h (p `plusPtr` offset) sz where (fp, offset, n) = I.unsafeToForeignPtr v eltsize = sizeOf (undefined :: a) sz = n * eltsize -- | Write the vector verbatim to a file. writeVector :: forall a. Storable a => FilePath -> I.Vector a -> IO () writeVector fp v = withFile fp WriteMode $ \h -> hPutVector h v