{- http://www.codeproject.com/KB/recipes/AprioriAlgorithm.aspx 
   http://fimi.ua.ac.be/data/
-}
{-# LANGUAGE FlexibleContexts #-}
module Data.HInduce.Associations.Apriori where

import Control.Parallel.Strategies
import Control.DeepSeq
import Data.Foldable (Foldable)
import qualified Data.Foldable as F
import Data.List
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V

type Transaction a = Set a
type Items a = Set a

-- Uncomment if you need containers < 0.4.2.0
--instance (NFData a) => NFData (Set a) where  
--  rnf = rnf . S.toList
  
class (Ord a, NFData a) => Item a where

instance Item Int

-- | Given transactions on items, derive rules for items
{-# SPECIALIZE rules :: [Transaction Int] -> Items Int -> (Map (Items Int) Int -> Map (Items Int) Int) -> Map (Items Int, Items Int) Double #-}
{-# SPECIALIZE rules :: Vector (Transaction Int) -> Items Int -> (Map (Items Int) Int -> Map (Items Int) Int) -> Map (Items Int, Items Int) Double #-}
rules :: (Item a, Foldable container) 
  => container (Transaction a)                -- ^ all transactions
  -> Items a                                  -- ^ the items that you are interested in
  -> (Map (Items a) Int -> Map (Items a) Int) -- ^ a function which can filter itemsets from depending on the support
  -> Map (Items a, Items a) Double            -- ^ the tuple represents a rule, the double represents the confidence in that rule [0,1]
rules ts is f = M.fromList $ map (\a -> (a, conf a)) $ possiblerules fs where 
  frequencyBy :: (NFData a, Foldable container) => 
    (a -> b -> Bool) -> [a] -> container b -> [(a,Int)]
  frequencyBy f as bs = 
    map (\a ->(a, F.foldr (\b -> if f a b then (+) 1 else id) 0 bs)) as `using` 
      parListChunk 100 rdeepseq

  fs = frequentsets ts is f
  conf (a,b) = (fromIntegral $ fs M.! (a `S.union` b)) / (fromIntegral $ fs M.! a)
  
  join :: Eq a => [[a]] -> [[a]]
  join ls = concat $ map join' $ groupBy (\a b->init a == init b) ls where
    join' [] = []
    join' (x:xs) = [ x ++ [y] | y <- map last xs ] ++ join' xs
      
  frequency :: (Item a, Foldable container) => 
    container (Transaction a) -> [Items a] -> Map (Items a) Int
  frequency ts iss = M.fromAscList $ (frequencyBy S.isSubsetOf iss ts )

  frequentsets :: (Item a, Foldable container) => 
    container (Transaction a) -> Items a -> (Map (Items a) Int -> Map (Items a) Int) -> Map (Items a) Int
  frequentsets ts is f = M.unions $ takeWhile (/= M.empty) $ map l [0..] where
    l = f . (frequency ts) . c
    c 0 = map S.singleton $ S.toAscList is
    c n = join' (M.keys $ l (n-1))
    join' = (map S.fromAscList) . join . (map S.toAscList)

  split :: Item a => Items a -> [(Items a, Items a)]
  split = (map (\(a,b)->(S.fromList a, S.fromList b))) . split' . S.toList where
    split' = init . tail . split'' where
      split'' [] = [([],[])]
      split'' (x:ys) = foldr (\(a,b) r -> (x:a,b):(a,x:b):r) [] (split'' ys)

  possiblerules :: Item a => Map (Items a) Int -> [(Items a, Items a)]
  possiblerules fs = filter (\(a,b)->a `M.member` fs && b `M.member` fs) $ concat $ map split $ (M.keys fs)

-- example

-- | Load a dataset from a file, where each line represents one transaction
loadDataSet :: String -> IO [Transaction Int]
loadDataSet filename = do
  filedata <- readFile filename
  return $ map (S.fromList . (map read) . words) (lines filedata)
  
top :: Item a => ([(Items a, Int)] -> [(Items a, Int)]) -> Map (Items a) Int -> Map (Items a) Int
top f m = M.fromList $ f $ sortBy (\(_,a) (_,b) -> compare b a) $ M.toList m 

test = do
  transactions <- loadDataSet "T10I4D100K.dat"
  let items = S.unions transactions
      -- select only the best 40 each round, as long as the support is at least 60
      res = rules (V.fromList $ transactions) items (top ((take 40) . (filter (\(_,a)->a>= 60))))
  -- only return the (sorted) rules with a confidence of at least 0.5
  return $ takeWhile (\(_,a)->a > 0.5) $ sortBy (\(_,a) (_,b) -> compare b a) $ M.toList res