--------------------------------------------------------------------------------
-- |
-- Module    : Data.IDX
-- Copyright : Christof Schramm
-- License   : GPL v 3
--
-- Maintainer : Christof Schramm <christof.schramm@campus.lmu.de>
-- Stability : Experimental
-- Portability : Should work in all common Haskell implementations
--
-- A package for reading and writing data in the IDX format.
-- This data format is used for machine-learning data sets like the
-- MNIST database of handwritten digits (<http://yann.lecun.com/exdb/mnist/>)
--------------------------------------------------------------------------------
module Data.IDX (
                -- * Data types
                  IDXData
                , IDXLabels
                , IDXContentType(..)

                -- * Accessing data
                , idxType
                , idxDimensions

                , isIDXReal
                , isIDXIntegral

                -- ** Raw data
                , idxDoubleContent
                , idxIntContent

                -- ** Partitioned data
                , partitionedDoubleData
                , partitionedIntData

                -- ** Labeled data
                , labeledIntData
                , labeledDoubleData

                -- * IO / Serialization

                -- ** IDXLabels
                  
                -- *** ByteString serialization
                , encodeIDXLabels
                , decodeIDXLabels

                -- *** FileIO
                , encodeIDXLabelsFile
                , decodeIDXLabelsFile
                  
                -- ** IDXData (e.g. images)
                  
                -- *** ByteString serialization
                , encodeIDX
                , decodeIDX

                -- *** File IO
                , encodeIDXFile
                , decodeIDXFile
                )where

-- For compatibility with versions of base < 4.8
import           Control.Applicative ((<$>))
import           Control.Monad

import           Data.Binary
import           Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as BL
import           Data.IDX.Internal
import           Data.Int
import           Data.Traversable
import qualified Data.Vector.Unboxed as V
import           Data.Vector.Unboxed ((!))
import           Data.Word

-- | Partition a dataset and label each subpartition, return int values
labeledIntData :: IDXLabels -> IDXData -> Maybe [(Int, V.Vector Int)]
labeledIntData :: IDXLabels -> IDXData -> Maybe [(Int, Vector Int)]
labeledIntData (IDXLabels Vector Int
v) IDXData
dat =
  if forall a. Unbox a => Vector a -> Int
V.length Vector Int
v forall a. Eq a => a -> a -> Bool
== forall (t :: * -> *) a. Foldable t => t a -> Int
length [Vector Int]
partitionedData
  then forall a. a -> Maybe a
Just forall a b. (a -> b) -> a -> b
$ forall a b. [a] -> [b] -> [(a, b)]
zip (forall a. Unbox a => Vector a -> [a]
V.toList Vector Int
v) [Vector Int]
partitionedData
  else forall a. Maybe a
Nothing
  where
    partitionedData :: [Vector Int]
partitionedData = IDXData -> [Vector Int]
partitionedIntData IDXData
dat

-- | Partition a dataset and label each subpartition, return double values
labeledDoubleData :: IDXLabels -> IDXData -> Maybe [(Int, V.Vector Double)]
labeledDoubleData :: IDXLabels -> IDXData -> Maybe [(Int, Vector Double)]
labeledDoubleData (IDXLabels Vector Int
v) IDXData
dat =
  if forall a. Unbox a => Vector a -> Int
V.length Vector Int
v forall a. Eq a => a -> a -> Bool
== forall (t :: * -> *) a. Foldable t => t a -> Int
length [Vector Double]
partitionedData
  then forall a. a -> Maybe a
Just forall a b. (a -> b) -> a -> b
$ forall a b. [a] -> [b] -> [(a, b)]
zip (forall a. Unbox a => Vector a -> [a]
V.toList Vector Int
v) [Vector Double]
partitionedData
  else forall a. Maybe a
Nothing
  where
    partitionedData :: [Vector Double]
partitionedData = IDXData -> [Vector Double]
partitionedDoubleData IDXData
dat

-- | Partition a dataset along the first dimension. If the data set contains
-- images this means splitting the dataset up into a list of images where each
-- 'Double' represents one pixel.
partitionedDoubleData :: IDXData -> [V.Vector Double]
partitionedDoubleData :: IDXData -> [Vector Double]
partitionedDoubleData = forall a. Unbox a => (IDXData -> Vector a) -> IDXData -> [Vector a]
partitionedData IDXData -> Vector Double
idxDoubleContent

-- | Partition a dataset along the first dimension. If the data set contains
-- images this means splitting the dataset up into a list of images where each
-- 'Int' represents one pixel.
partitionedIntData :: IDXData -> [V.Vector Int]
partitionedIntData :: IDXData -> [Vector Int]
partitionedIntData = forall a. Unbox a => (IDXData -> Vector a) -> IDXData -> [Vector a]
partitionedData IDXData -> Vector Int
idxIntContent

-- | Read labels from a file, return 'Nothing' if something doesn't work
decodeIDXLabelsFile :: FilePath -> IO (Maybe IDXLabels)
decodeIDXLabelsFile :: FilePath -> IO (Maybe IDXLabels)
decodeIDXLabelsFile FilePath
path = FilePath -> IO ByteString
BL.readFile FilePath
path forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= forall (m :: * -> *) a. Monad m => a -> m a
return forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Maybe IDXLabels
decodeIDXLabels

decodeIDXLabels :: BL.ByteString -> Maybe IDXLabels
decodeIDXLabels :: ByteString -> Maybe IDXLabels
decodeIDXLabels ByteString
content = case forall a.
Binary a =>
ByteString
-> Either
     (ByteString, ByteOffset, FilePath) (ByteString, ByteOffset, a)
decodeOrFail ByteString
content of
                           Right (ByteString
_,ByteOffset
_,IDXLabels
result) -> forall a. a -> Maybe a
Just IDXLabels
result
                           Left (ByteString, ByteOffset, FilePath)
_             -> forall a. Maybe a
Nothing

-- | Read data from a file, return 'Nothing' if something doesn't work
encodeIDXLabelsFile :: IDXLabels -> FilePath -> IO ()
encodeIDXLabelsFile :: IDXLabels -> FilePath -> IO ()
encodeIDXLabelsFile IDXLabels
labs FilePath
path = forall a. Binary a => FilePath -> a -> IO ()
encodeFile FilePath
path IDXLabels
labs

encodeIDXLabels :: IDXLabels -> BL.ByteString
encodeIDXLabels :: IDXLabels -> ByteString
encodeIDXLabels = forall a. Binary a => a -> ByteString
encode

decodeIDXFile :: FilePath -> IO (Maybe IDXData)
decodeIDXFile :: FilePath -> IO (Maybe IDXData)
decodeIDXFile FilePath
path = FilePath -> IO ByteString
BL.readFile FilePath
path forall (m :: * -> *) a b. Monad m => m a -> (a -> m b) -> m b
>>= forall (m :: * -> *) a. Monad m => a -> m a
return forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Maybe IDXData
decodeIDX

decodeIDX :: BL.ByteString -> Maybe IDXData
decodeIDX :: ByteString -> Maybe IDXData
decodeIDX ByteString
content = case forall a.
Binary a =>
ByteString
-> Either
     (ByteString, ByteOffset, FilePath) (ByteString, ByteOffset, a)
decodeOrFail ByteString
content of
  Right (ByteString
_,ByteOffset
_,IDXData
result) -> forall a. a -> Maybe a
Just IDXData
result
  Left (ByteString, ByteOffset, FilePath)
_ -> forall a. Maybe a
Nothing

encodeIDXFile :: IDXData -> FilePath -> IO ()
encodeIDXFile :: IDXData -> FilePath -> IO ()
encodeIDXFile IDXData
idx FilePath
path = forall a. Binary a => FilePath -> a -> IO ()
encodeFile FilePath
path IDXData
idx

encodeIDX :: IDXData -> BL.ByteString
encodeIDX :: IDXData -> ByteString
encodeIDX = forall a. Binary a => a -> ByteString
encode