{- http://www.codeproject.com/KB/recipes/AprioriAlgorithm.aspx
http://fimi.ua.ac.be/data/
-}
{-# LANGUAGE FlexibleContexts #-}
module Data.HInduce.Associations.Apriori where
import Control.Parallel.Strategies
import Control.DeepSeq
import Data.Foldable (Foldable)
import qualified Data.Foldable as F
import Data.List
import Data.Map (Map)
import qualified Data.Map as M
import Data.Set (Set)
import qualified Data.Set as S
import Data.Vector (Vector)
import qualified Data.Vector as V
type Transaction a = Set a
type Items a = Set a
-- Uncomment if you need containers < 0.4.2.0
--instance (NFData a) => NFData (Set a) where
-- rnf = rnf . S.toList
class (Ord a, NFData a) => Item a where
instance Item Int
-- | Given transactions on items, derive rules for items
{-# SPECIALIZE rules :: [Transaction Int] -> Items Int -> (Map (Items Int) Int -> Map (Items Int) Int) -> Map (Items Int, Items Int) Double #-}
{-# SPECIALIZE rules :: Vector (Transaction Int) -> Items Int -> (Map (Items Int) Int -> Map (Items Int) Int) -> Map (Items Int, Items Int) Double #-}
rules :: (Item a, Foldable container)
=> container (Transaction a) -- ^ all transactions
-> Items a -- ^ the items that you are interested in
-> (Map (Items a) Int -> Map (Items a) Int) -- ^ a function which can filter itemsets from depending on the support
-> Map (Items a, Items a) Double -- ^ the tuple represents a rule, the double represents the confidence in that rule [0,1]
rules ts is f = M.fromList $ map (\a -> (a, conf a)) $ possiblerules fs where
frequencyBy :: (NFData a, Foldable container) =>
(a -> b -> Bool) -> [a] -> container b -> [(a,Int)]
frequencyBy f as bs =
map (\a ->(a, F.foldr (\b -> if f a b then (+) 1 else id) 0 bs)) as `using`
parListChunk 100 rdeepseq
fs = frequentsets ts is f
conf (a,b) = (fromIntegral $ fs M.! (a `S.union` b)) / (fromIntegral $ fs M.! a)
join :: Eq a => [[a]] -> [[a]]
join ls = concat $ map join' $ groupBy (\a b->init a == init b) ls where
join' [] = []
join' (x:xs) = [ x ++ [y] | y <- map last xs ] ++ join' xs
frequency :: (Item a, Foldable container) =>
container (Transaction a) -> [Items a] -> Map (Items a) Int
frequency ts iss = M.fromAscList $ (frequencyBy S.isSubsetOf iss ts )
frequentsets :: (Item a, Foldable container) =>
container (Transaction a) -> Items a -> (Map (Items a) Int -> Map (Items a) Int) -> Map (Items a) Int
frequentsets ts is f = M.unions $ takeWhile (/= M.empty) $ map l [0..] where
l = f . (frequency ts) . c
c 0 = map S.singleton $ S.toAscList is
c n = join' (M.keys $ l (n-1))
join' = (map S.fromAscList) . join . (map S.toAscList)
split :: Item a => Items a -> [(Items a, Items a)]
split = (map (\(a,b)->(S.fromList a, S.fromList b))) . split' . S.toList where
split' = init . tail . split'' where
split'' [] = [([],[])]
split'' (x:ys) = foldr (\(a,b) r -> (x:a,b):(a,x:b):r) [] (split'' ys)
possiblerules :: Item a => Map (Items a) Int -> [(Items a, Items a)]
possiblerules fs = filter (\(a,b)->a `M.member` fs && b `M.member` fs) $ concat $ map split $ (M.keys fs)
-- example
-- | Load a dataset from a file, where each line represents one transaction
loadDataSet :: String -> IO [Transaction Int]
loadDataSet filename = do
filedata <- readFile filename
return $ map (S.fromList . (map read) . words) (lines filedata)
top :: Item a => ([(Items a, Int)] -> [(Items a, Int)]) -> Map (Items a) Int -> Map (Items a) Int
top f m = M.fromList $ f $ sortBy (\(_,a) (_,b) -> compare b a) $ M.toList m
test = do
transactions <- loadDataSet "T10I4D100K.dat"
let items = S.unions transactions
-- select only the best 40 each round, as long as the support is at least 60
res = rules (V.fromList $ transactions) items (top ((take 40) . (filter (\(_,a)->a>= 60))))
-- only return the (sorted) rules with a confidence of at least 0.5
return $ takeWhile (\(_,a)->a > 0.5) $ sortBy (\(_,a) (_,b) -> compare b a) $ M.toList res