--------------------------------------------------------------------------------
-- |
-- Module    : Data.IDX
-- Copyright : Christof Schramm
-- License   : GPL v 3
--
-- Maintainer : Christof Schramm <christof.schramm@campus.lmu.de>
-- Stability : Experimental
-- Portability : Should work in all common Haskell implementations
--
-- A package for reading and writing data in the IDX format.
-- This data format is used for machine-learning data sets like the
-- MNIST database of handwritten digits (<http://yann.lecun.com/exdb/mnist/>)
--------------------------------------------------------------------------------
module Data.IDX (
                -- * Data types
                  IDXData
                , IDXLabels
                , IDXContentType(..)

                -- * Accessing data
                , idxType
                , idxDimensions

                , isIDXReal
                , isIDXIntegral

                -- ** Raw data
                , idxDoubleContent
                , idxIntContent

                -- ** Partitioned data
                , partitionedDoubleData
                , partitionedIntData

                -- ** Labeled data
                , labeledIntData
                , labeledDoubleData

                -- * IO / Serialization

                -- ** IDXLabels
                  
                -- *** ByteString serialization
                , encodeIDXLabels
                , decodeIDXLabels

                -- *** File IO
                , encodeIDXLabelsFile
                , decodeIDXLabelsFile
                  
                -- ** IDXData (e.g. images)
                  
                -- *** ByteString serialization
                , encodeIDX
                , decodeIDX

                -- *** File IO
                , encodeIDXFile
                , decodeIDXFile
                )where

-- For compatibility with versions of base < 4.8
import           Control.Applicative ((<$>))
import           Control.Monad

import           Data.Binary
import           Data.ByteString.Lazy (ByteString)
import qualified Data.ByteString.Lazy as BL
import           Data.IDX.Internal
import           Data.Int
import           Data.Traversable
import qualified Data.Vector.Unboxed as V
import           Data.Vector.Unboxed ((!))
import           Data.Word

-- | Partition a dataset and pair every partition with the label found at
-- the same position, yielding 'Int' values.
--
-- Returns 'Nothing' when the number of labels differs from the number of
-- partitions produced by 'partitionedIntData'.
labeledIntData :: IDXLabels -> IDXData -> Maybe [(Int, V.Vector Int)]
labeledIntData (IDXLabels labels) dat
  | V.length labels == length parts = Just (zip (V.toList labels) parts)
  | otherwise                       = Nothing
  where
    -- Deliberately not called @partitionedData@: that name is already used
    -- in this module for the generic partitioning helper.
    parts = partitionedIntData dat

-- | Partition a dataset and pair every partition with the label found at
-- the same position, yielding 'Double' values.
--
-- Returns 'Nothing' when the number of labels differs from the number of
-- partitions produced by 'partitionedDoubleData'.
labeledDoubleData :: IDXLabels -> IDXData -> Maybe [(Int, V.Vector Double)]
labeledDoubleData (IDXLabels labels) dat
  | V.length labels == length parts = Just (zip (V.toList labels) parts)
  | otherwise                       = Nothing
  where
    -- Deliberately not called @partitionedData@: that name is already used
    -- in this module for the generic partitioning helper.
    parts = partitionedDoubleData dat

-- | Partition a dataset along the first dimension, reading the content as
-- 'Double' values via 'idxDoubleContent'.
--
-- If the data set contains images, this splits it into a list of images in
-- which each 'Double' represents one pixel.
partitionedDoubleData :: IDXData -> [V.Vector Double]
partitionedDoubleData = partitionedData idxDoubleContent

-- | Partition a dataset along the first dimension, reading the content as
-- 'Int' values via 'idxIntContent'.
--
-- If the data set contains images, this splits it into a list of images in
-- which each 'Int' represents one pixel.
partitionedIntData :: IDXData -> [V.Vector Int]
partitionedIntData = partitionedData idxIntContent

-- | Read labels from a file, returning 'Nothing' if the file's content
-- cannot be decoded as 'IDXLabels'.
--
-- Decoding goes through 'decodeFileOrFail' rather than a lazy
-- 'BL.readFile', so the file is read and decoded while this action runs
-- instead of being deferred until the result is first inspected.
--
-- Errors raised while opening or reading the file are not caught here.
decodeIDXLabelsFile :: FilePath -> IO (Maybe IDXLabels)
decodeIDXLabelsFile path =
  -- The decoder's error details (offset and message) are dropped.
  either (const Nothing) Just <$> decodeFileOrFail path

-- | Decode 'IDXLabels' from a lazy 'BL.ByteString'.
--
-- Returns 'Nothing' if 'decodeOrFail' reports a failure; on success only
-- the decoded value is kept and the other components of the decoder's
-- result are ignored.
decodeIDXLabels :: BL.ByteString -> Maybe IDXLabels
decodeIDXLabels content =
  case decodeOrFail content of
    Right (_, _, result) -> Just result
    Left _               -> Nothing

-- | Write labels to the file at the given path, using the 'Binary'
-- encoding of 'IDXLabels' (via 'encodeFile').
--
-- Note the argument order: the labels come first, the path second.
encodeIDXLabelsFile :: IDXLabels -> FilePath -> IO ()
encodeIDXLabelsFile labs path = encodeFile path labs

-- | Serialize 'IDXLabels' to a lazy 'BL.ByteString' using their 'Binary'
-- instance.
encodeIDXLabels :: IDXLabels -> BL.ByteString
encodeIDXLabels = encode

-- | Read an 'IDXData' value from a file, returning 'Nothing' if the file's
-- content cannot be decoded.
--
-- Decoding goes through 'decodeFileOrFail' rather than a lazy
-- 'BL.readFile', so the file is read and decoded while this action runs
-- instead of being deferred until the result is first inspected.
--
-- Errors raised while opening or reading the file are not caught here.
decodeIDXFile :: FilePath -> IO (Maybe IDXData)
decodeIDXFile path =
  -- The decoder's error details (offset and message) are dropped.
  either (const Nothing) Just <$> decodeFileOrFail path

-- | Decode 'IDXData' from a lazy 'BL.ByteString'.
--
-- Returns 'Nothing' if 'decodeOrFail' reports a failure; on success only
-- the decoded value is kept and the other components of the decoder's
-- result are ignored.
decodeIDX :: BL.ByteString -> Maybe IDXData
decodeIDX content =
  case decodeOrFail content of
    Right (_, _, result) -> Just result
    Left _               -> Nothing

-- | Write an 'IDXData' value to the file at the given path, using its
-- 'Binary' encoding (via 'encodeFile').
--
-- Note the argument order: the data comes first, the path second.
encodeIDXFile :: IDXData -> FilePath -> IO ()
encodeIDXFile idx path = encodeFile path idx

-- | Serialize an 'IDXData' value to a lazy 'BL.ByteString' using its
-- 'Binary' instance.
encodeIDX :: IDXData -> BL.ByteString
encodeIDX = encode