-- ----------------------------------------------------------------------------

{- |
  Module     : Holumbus.Data.Crunch
  Copyright  : Copyright (C) 2008 Timo B. Huebel
  License    : MIT
  
  Maintainer : Timo B. Huebel (tbh@holumbus.org)
  Stability  : experimental
  Portability: portable
  Version    : 0.1
  
  This module provides compression for streams of 32-bit words. Because of 
  some internal restriction in GHC, which makes all fixed integer size equal
  in terms of bit-width, the algorithm tries to crunch as much numbers as
  possible into a single 64-bit word.

  Based on the Simple9 encoding scheme from this article:
  
    * Vo N. Anh, Alstair Moffat,
      \"/Inverted Index Compression Using Word-Aligned Binary Codes/\",
      Information Retrieval, 8 (1), 2005, pages 151-166

-}

-- ----------------------------------------------------------------------------

{-# OPTIONS -fno-warn-type-defaults #-}

module Holumbus.Data.Crunch 
  (
  -- * Compression
  crunch8
  , crunch16
  , crunch32
  , crunch64
  
  -- * Decompression
  , decrunch8
  , decrunch16
  , decrunch32
  , decrunch64
  )
where

import Data.Bits
import Data.Word

data Width = W01 | W02 | W03 | W04 | W05 | W06 | W07 | W08 | W10 | W12 | W15 | W20 | W30 | W60
             deriving (Show, Eq, Enum)

-- | Increase performance by avoiding the calculation of maximum values.
value :: Width -> Word64
value W01 = (2 ^ 1) - 1
value W02 = (2 ^ 2) - 1
value W03 = (2 ^ 3) - 1
value W04 = (2 ^ 4) - 1
value W05 = (2 ^ 5) - 1
value W06 = (2 ^ 6) - 1
value W07 = (2 ^ 7) - 1
value W08 = (2 ^ 8) - 1
value W10 = (2 ^ 10) - 1
value W12 = (2 ^ 12) - 1
value W15 = (2 ^ 15) - 1
value W20 = (2 ^ 20) - 1
value W30 = (2 ^ 30) - 1
value W60 = (2 ^ 60) - 1

-- | Increase performance by avoiding the calculation of counts.
count :: Width -> Int
count W01 = 60
count W02 = 30
count W03 = 20
count W04 = 15
count W05 = 12
count W06 = 10
count W07 = 8
count W08 = 7
count W10 = 6
count W12 = 5
count W15 = 4
count W20 = 3
count W30 = 2
count W60 = 1

width :: Width -> Int
width W01 = 1
width W02 = 2
width W03 = 3
width W04 = 4
width W05 = 5
width W06 = 6
width W07 = 7
width W08 = 8
width W10 = 10
width W12 = 12
width W15 = 15
width W20 = 20
width W30 = 30
width W60 = 60

-- | Crunch some values by encoding several values into one 'Word64'. The values may not exceed
-- the upper limit of @(2 ^ 60) - 1@. This precondition is not checked! The compression works
-- best on small values, therefore a difference encoding (like the one in 
-- "Holumbus.Data.DiffList") prior to compression pays off well.
crunch64 :: [Word64] -> [Word64]
crunch64 [] = []
crunch64 s = crunch' s (count W01) W01 []

-- | Crunch a non empty list. This precondition is not checked!
crunch' :: [Word64] -> Int -> Width -> [Word64] -> [Word64]
crunch' [] 0 m r = [encode m r]
crunch' [] _ m r = crunch' r (count (succ m)) (succ m) []
crunch' s 0 m r = (encode m r):(crunch' s (count W01) W01 [])
crunch' s@(x:xs) n m r = if x <= value m then crunch' xs (n - 1) m (r ++ [x])
                         else crunch' (r ++ s) (count (succ m)) (succ m) []

-- | Encode some numbers with a given width. The number of elements in the list may not exceed 
-- the amount of numbers allowed for this width and the numbers in the list may not exceed the
-- upper bound for this width. These preconditions are not checked!
encode :: Width -> [Word64] -> Word64                         
encode w [] = rotateR (fromIntegral (fromEnum w)) (64 - (width w * count w))
encode w (x:xs) = rotateR (encode w xs .|. fromIntegral x) (width w)

-- | Decrunch a list of crunched values. No checking for properly encoded values is done, weird
-- results have to be expected if calling this function on a list of arbitrary values.
decrunch64 :: [Word64] -> [Word64]
decrunch64 [] = []
decrunch64 (x:xs) = (decode (width w) (count w) (value w) (rotateL x (width w))) ++ (decrunch64 xs)
                   where
                     w = toEnum $ fromIntegral (x .&. 15)  -- Extract the 4 selector bits.

-- Decode some numbers with a given width.
decode :: Int -> Int -> Word64 -> Word64 -> [Word64]
decode _ 0 _ _ = []
decode w n m x = (x .&. m):(decode w (n - 1) m (rotateL x w))

-- | Crunching 'Word8' values, defined in terms of 'crunch64'.
crunch8 :: [Word8] -> [Word64]
crunch8 = crunch64 . (map fromIntegral)

-- | Crunching 'Word16' values, defined in terms of 'crunch64'.
crunch16 :: [Word16] -> [Word64]
crunch16 = crunch64 . (map fromIntegral)

-- | Crunching 'Word32' values, defined in terms of 'crunch64'.
crunch32 :: [Word32] -> [Word64]
crunch32 = crunch64 . (map fromIntegral)

-- | Decrunching to 'Word8' values, defined in terms of 'decrunch64'.
decrunch8 :: [Word64] -> [Word8]
decrunch8 = (map fromIntegral) . decrunch64

-- | Decrunching to 'Word16' values, defined in terms of 'decrunch64'.
decrunch16 :: [Word64] -> [Word16]
decrunch16 = (map fromIntegral) . decrunch64

-- | Decrunching to 'Word32' values, defined in terms of 'decrunch64'.
decrunch32 :: [Word64] -> [Word32]
decrunch32 = (map fromIntegral) . decrunch64