-------------------------------------------------------------------------------- -- | -- Module : Data.IDX -- Copyright : Christof Schramm -- License : GPL v 3 -- -- Maintainer : Christof Schramm -- Stability : Experimental -- Portability : Should work in all common Haskell implementations -- -- A package for reading and writing data in the IDX format. -- This data format is used for machine-learning data sets like the -- MNIST database of handwritten digits () -------------------------------------------------------------------------------- module Data.IDX ( -- * Data types IDXData , IDXLabels , IDXContentType(..) -- * Accessing data , idxType , idxDimensions , isIDXReal , isIDXIntegral -- * Raw data , idxDoubleContent , idxIntContent -- * Labeled data , labeledIntData , labeledDoubleData -- * IO / Serialization -- ** IDXLabels -- *** ByteString serialization , encodeIDXLabels , decodeIDXLabels -- *** FileIO , encodeIDXLabelsFile , decodeIDXLabelsFile -- ** IDXData (e.g. images) -- *** ByteString serialization , encodeIDX , decodeIDX -- *** File IO , encodeIDXFile , decodeIDXFile )where -- For compatibility with versions of base < 4.8 import Control.Applicative ((<$>)) import Control.Monad import Data.Binary import Data.ByteString.Lazy (ByteString) import qualified Data.ByteString.Lazy as BL import Data.IDX.Internal import Data.Int import Data.Traversable import qualified Data.Vector.Unboxed as V import Data.Vector.Unboxed ((!)) import Data.Word -- | Partition a dataset and label each subpartition, return int values labeledIntData:: IDXLabels -> IDXData -> Maybe [(Int, V.Vector Int)] labeledIntData (IDXLabels v) dat = if (V.length v) == dim0 then Just $ do i <- [0 .. dim0 - 1] let lab = v ! i return $ (lab,V.slice (i*entrySize) entrySize content) else Nothing where dim0 = (idxDimensions dat) ! 0 content = idxIntContent dat entrySize = (V.product $ idxDimensions dat) `div` dim0 -- | Partition a dataset and label each subpartition, return double values labeledDoubleData:: IDXLabels -> IDXData -> Maybe [(Int, V.Vector Double)] labeledDoubleData (IDXLabels v) dat = if (V.length v) == dim0 then Just $ do i <- [0 .. dim0 - 1] let lab = v ! i return $ (lab,V.slice (i*entrySize) entrySize content) else Nothing where dim0 = (idxDimensions dat) ! 0 content = idxDoubleContent dat entrySize = (V.product $ idxDimensions dat) `div` dim0 -- | Read labels from a file, return 'Nothing' if something doesn't work decodeIDXLabelsFile :: FilePath -> IO (Maybe IDXLabels) decodeIDXLabelsFile path = BL.readFile path >>= return . decodeIDXLabels decodeIDXLabels :: BL.ByteString -> Maybe IDXLabels decodeIDXLabels content = case decodeOrFail content of Right (_,_,result) -> Just result Left _ -> Nothing -- | Read data from a file, return 'Nothing' if something doesn't work encodeIDXLabelsFile :: IDXLabels -> FilePath -> IO () encodeIDXLabelsFile labs path = encodeFile path labs encodeIDXLabels :: IDXLabels -> BL.ByteString encodeIDXLabels = encode decodeIDXFile :: FilePath -> IO (Maybe IDXData) decodeIDXFile path = BL.readFile path >>= return . decodeIDX decodeIDX :: BL.ByteString -> Maybe IDXData decodeIDX content = case decodeOrFail content of Right (_,_,result) -> Just result Left _ -> Nothing encodeIDXFile :: IDXData -> FilePath -> IO () encodeIDXFile idx path = encodeFile path idx encodeIDX :: IDXData -> BL.ByteString encodeIDX = encode