{-# LANGUAGE BangPatterns, MagicHash, UnboxedTuples #-}
{-|
Library for using the LCM algorithm in order to compute closed frequent
patterns. Input must be a transaction database, either in text format
(as a ByteString) or in @[[Item]]@ format, where @Item = Int@.

Several benchmarking functions allowing to tune the parallel strategy
used and the depth cutoff are also provided.
-}

-- HLCM
-- (c) Alexandre Termier, 2009-2010
-- Original LCM algorithm from Takeaki Uno and Hiroki Arimura.
-- Many performance improvements thanks to Simon Marlow and Satnam Singh.
--
-- Module implementing the LCM algorithm.
--
-- See the README file for installation, usage, and details.
-- See the LICENSE file for licensing.

module HLCM
  ( Frequency, Item
  , runLCMstring
  , runLCMmatrix
  , benchLCM_parBuffer
  , benchLCM_parMap
  ) where

-----------------------------------------------------------------
-- Imports
-----------------------------------------------------------------

import Data.List
import Data.Array.Unboxed
import Control.Monad.ST
import Data.Array.ST
import Control.Parallel
import Control.Parallel.Strategies
import Control.Exception ( evaluate )
import Data.Array.Base
import Control.Monad
import GHC.Exts
import GHC.ST

import qualified Data.List as List
import qualified Data.ByteString.Char8 as L

-----------------------------------------------------------------
-- Type definitions
-----------------------------------------------------------------

type Item      = Int
type Frequency = Int
type Tid       = Int
type Weight    = Int

-----------------------------------------------------------------
-- LCM functions
-----------------------------------------------------------------

{-|
  Get the data as a long bytestring, parse it and execute LCM
  to discover closed frequent itemsets.
-}
runLCMstring :: L.ByteString -- ^ The transaction database as a big string. Transactions are separated by newlines, items are separated by spaces
             -> Frequency    -- ^ Minimum frequency threshold for the frequent itemsets
             -> [[Item]]     -- ^ Output: list of closed frequent itemsets
runLCMstring stringMatrix freq =
  let -- Load the file and convert it to the transaction database format (lexicographic tree)
      (transactionsLT, maxItem, antiPerm) = loadTransactionsString stringMatrix freq
      -- Compute occurrences for 1-itemsets
      occs = occurrenceDeliverLT transactionsLT maxItem
  in reversePerm antiPerm $
       concat (parBuffer 8 rdeepseq $
                 map (\x -> lcmIter 1 transactionsLT [] x (-1) occs maxItem freq)
                     [0..maxItem])

{-|
  Get the data as a matrix of Items, parse it and execute LCM
  to discover closed frequent itemsets.
-}
runLCMmatrix :: [[Item]]  -- ^ The transaction database as matrix of items (List of List)
             -> Frequency -- ^ Minimum frequency threshold for the frequent itemsets
             -> [[Item]]  -- ^ Output: list of closed frequent itemsets
runLCMmatrix transMatrix freq =
  let -- Convert the matrix to the transaction database format (lexicographic tree)
      (transactionsLT, maxItem, antiPerm) = loadTransactionsMatrix transMatrix freq
      -- Compute occurrences for 1-itemsets
      occs = occurrenceDeliverLT transactionsLT maxItem
  in reversePerm antiPerm $
       concat (parBuffer 8 rdeepseq $
                 map (\x -> lcmIter 1 transactionsLT [] x (-1) occs maxItem freq)
                     [0..maxItem])

{-|
  Use for benchmarking, parallel strategy = parBuffer by Simon Marlow.
  This strategy does not have a space leak.

  /Warning: outputs are unusable as is, because items are renamed internally,
  and in this function the reverse renaming is not performed.
  It is trivial to have it back by copying the code from 'runLCMstring'./
-}
benchLCM_parBuffer :: L.ByteString -- ^ The transaction database as a big string. Transactions are separated by newlines, items are separated by spaces
                   -> Frequency    -- ^ Minimum frequency threshold for the frequent itemsets
                   -> Int          -- ^ Value for parBuffer
                   -> Int          -- ^ Depth for cutting parallelism
                   -> [[Item]]     -- ^ Output: list of closed frequent itemsets
benchLCM_parBuffer stringMatrix freq n d =
  let -- Load the file and convert it to the transaction database format (lexicographic tree)
      (transactionsLT, maxItem, antiPerm) = loadTransactionsString stringMatrix freq
      -- Compute occurrences for 1-itemsets
      occs = occurrenceDeliverLT transactionsLT maxItem
  in concat (parBuffer n rdeepseq $
               map (\x -> lcmIterParBuffer 1 transactionsLT [] x (-1) occs maxItem freq n d)
                   [0..maxItem])

{-|
  Use for benchmarking, parallel strategy = parMap from Control.Parallel.Strategies.

  /Warning: outputs are unusable as is, because items are renamed internally,
  and in this function the reverse renaming is not performed.
  It is trivial to have it back by copying the code from 'runLCMstring'./
-}
benchLCM_parMap :: L.ByteString -- ^ The transaction database as a big string. Transactions are separated by newlines, items are separated by spaces
                -> Frequency    -- ^ Minimum frequency threshold for the frequent itemsets
                -> Int          -- ^ Depth for cutting parallelism
                -> [[Item]]     -- ^ Output: list of closed frequent itemsets
benchLCM_parMap stringMatrix freq d =
  let -- Load the file and convert it to the transaction database format (lexicographic tree)
      (transactionsLT, maxItem, antiPerm) = loadTransactionsString stringMatrix freq
      -- Compute occurrences for 1-itemsets
      occs = occurrenceDeliverLT transactionsLT maxItem
  in concat (parMap rdeepseq
               (\x -> lcmIterParMap 1 transactionsLT [] x (-1) occs maxItem freq d)
               [0..maxItem])

{-|
  Takes a list of itemsets and a permutation of items, and applies this
  permutation to the itemsets.
-}
reversePerm :: UArray Item Item -- ^ A permutation of items
            -> [[Item]]         -- ^ Input list of itemsets. An itemset is a list of items, starting with its frequency.
            -> [[Item]]         -- ^ Permuted list of itemsets. Frequency at the head of the itemset is untouched.
reversePerm antiPerm = map permOne
  where
    -- The head of each itemset is its frequency and is kept as-is;
    -- only the items proper are permuted back to their original names.
    permOne (frq:items) = frq : sort (map (unsafeAt antiPerm) items)
    permOne []          = error "HLCM.reversePerm: empty itemset (missing frequency head)"

{-|
  Loads the input dataset as a string into a database of transactions
  in lexicographic tree format. Also reorders/renames the items by their
  frequency. The permutation between old item values and new item values
  is also returned.
-}
loadTransactionsString :: L.ByteString -- ^ Data as a long bytestring
                       -> Frequency    -- ^ Minimum support threshold
                       -> (LexicoTreeItem, Item, UArray Item Item)
                       -- ^ Return value : (Database, maximum item value after reduction, permutation of items)
loadTransactionsString s thres =
  case stringToTransDB (L.lines s) of
    Nothing -> (Nil, -1, array (1,0) [])   -- empty tree, empty permutation
    Just intMatrix ->
      let itmFrq           = histogram (0, maxItem) intMatrix
          maxItem          = maximum (concat intMatrix)
          (perm, antiPerm) = permut itmFrq
          reorderedMat     = reorderMat intMatrix perm itmFrq thres
          maxItem'         = maximum (concat reorderedMat)
          lexicoTree       = foldr (\t lt -> insertLT t (-1) 1 lt) Nil reorderedMat
      in (lexicoTree, maxItem', antiPerm)

{-|
  Loads the input dataset as a matrix of Items into a database of transactions
  in lexicographic tree format. Also reorders/renames the items by their
  frequency. The permutation between old item values and new item values
  is also returned.
-}
loadTransactionsMatrix :: [[Item]]  -- ^ Data as a matrix of Items
                       -> Frequency -- ^ Minimum support threshold
                       -> (LexicoTreeItem, Item, UArray Item Item)
                       -- ^ Return value : (Database, maximum item value after reduction, permutation of items)
loadTransactionsMatrix intMatrix thres =
  let itmFrq           = histogram (0, maxItem) intMatrix
      maxItem          = maximum (concat intMatrix)
      (perm, antiPerm) = permut itmFrq
      reorderedMat     = reorderMat intMatrix perm itmFrq thres
      maxItem'         = maximum (concat reorderedMat)
      lexicoTree       = foldr (\t lt -> insertLT t (-1) 1 lt) Nil reorderedMat
  in (lexicoTree, maxItem', antiPerm)

{-|
  Converts the contents of a data string to a transaction database.
  NOTE(review): with the current 'convertOneTrans' this never returns
  'Nothing'; the 'Maybe' is kept for interface compatibility.
-}
stringToTransDB :: [L.ByteString]  -- ^ List of transactions, one per bytestring
                -> Maybe [[Item]]  -- ^ Return value : transactions as lists of items
stringToTransDB []     = return []
stringToTransDB (t:ts) = do
  ts' <- stringToTransDB ts
  return (convertOneTrans t : ts')

{-|
  Converts one transaction from string format to item format [Item].
  Unparsable prefixes yield an empty transaction.
-}
convertOneTrans :: L.ByteString -- ^ Transaction as extracted from file
                -> [Item]       -- ^ Result : list of items
convertOneTrans s =
  case L.readInt s of
    Nothing          -> []
    Just (itm, rest)
      | L.null rest -> [itm]
      | otherwise   -> itm : convertOneTrans (L.tail rest)

{-|
  For a transaction database of type [[Item]], computes the frequency
  of each item and returns an array (item, frequency).
-}
histogram :: (Item,Item)          -- ^ Bounds for resulting array, must be (0, maxItem)
          -> [[Item]]             -- ^ Transaction database
          -> UArray Item Frequency -- ^ Result : Array associating each item with its frequency
histogram bnds lst = accumArray (+) 0 bnds [(i, 1) | j <- lst, i <- j]

{-|
  Sorts a list of (Item, frequency of this item) in decreasing order
  of frequency.

  XXX PERF : seems inefficient (quicksort with list copies), but
  kept as-is: the tie-breaking order among equally frequent items
  determines the item permutation, which must stay stable.
-}
sortFrq :: [(Item,Frequency)] -- ^ Each item is associated with its frequency
        -> [(Item,Frequency)] -- ^ Same list as input, but sorted in decreasing frequency order
sortFrq [] = []
sortFrq ((x,y):rest) =
  (sortFrq [(x',y')   | (x',y')   <- rest, y' >= y])
  ++ [(x,y)] ++
  (sortFrq [(x'',y'') | (x'',y'') <- rest, y'' < y])
-- XXX PERF: lots of copies here...maybe not a pb for small lists ?

{-|
  From an array of (item, frequency of this item), computes an associative
  array X where X[i] is the ith most frequent item.

  XXX PERF : performance issue with conversions to and from list inside
  this function ; try a mutable array and a more "in place" spirit.
-}
permut :: UArray Item Frequency -- ^ Array associating to each item its frequency
       -> (UArray Item Item, UArray Item Item)
       -- ^ Result : new array where items are ranked by frequency, and the reverse
permut arr =
  let (lo,hi) = bounds arr
      lstVal  = [(i, arr!i) | i <- [lo..hi]]
      lst2    = sortFrq lstVal
  in ( array (lo,hi) [(k, head $ findIndices ((==k).fst) lst2) | k <- [lo..hi]]
     , array (lo,hi) [(k, fst (lst2 !! k))                     | k <- [lo..hi]] )

{-|
  Rewrites an input transaction database by removing infrequent items
  and using reordered items (i.e. each item is replaced by its rank in
  decreasing frequency order).
-}
reorderMat :: [[Item]]              -- ^ Original transaction database
           -> UArray Item Item      -- ^ Array of items sorted by decreasing frequency order (item, rank)
           -> UArray Item Frequency -- ^ Array associating each item with its frequency (item, frequency)
           -> Frequency             -- ^ Minimum frequency threshold
           -> [[Item]]              -- ^ Result : rewritten transaction database
reorderMat ts perm itmFrq thres =
  map (\t -> sort $ map (unsafeAt perm)                       -- permute items
                        (filter (\x -> unsafeAt itmFrq x >= thres) -- eliminate infrequent items
                                t))
      ts

{-|
  Computes for each item of the transaction database its frequency.
-}
occurrenceDeliverLT :: LexicoTreeItem        -- ^ Transaction database (in lexicographic tree format)
                    -> Item                  -- ^ Maximal item in transaction database
                    -> UArray Item Frequency -- ^ Result : array associating each item to its frequency.
occurrenceDeliverLT cdb maxItem =
  runST (do
    -- Creates an empty array : each item starts with frequency 0
    arr <- newArray_ (0, maxItem)
    -- workaround for http://hackage.haskell.org/trac/ghc/ticket/3586
    forM_ [0..maxItem] $ \i -> unsafeWrite arr i 0
    -- Compute frequencies for each item by efficient tree traversal
    _ <- traverseLT cdb arr
    unsafeFreeze arr)

{-|
  Efficient traversal of the transaction database as a lexicographic tree.
  Item frequencies are updated on the fly.

  (Named 'traverseLT' rather than @traverse@ to avoid clashing with
  'Prelude.traverse' on modern GHC; not exported, so callers are unaffected.)
-}
traverseLT :: LexicoTreeItem          -- ^ Transaction database
           -> STUArray s Item Frequency -- ^ Array associating each item with its frequency. UPDATED by this function !
           -> ST s ()
traverseLT tree arr = ST $ \s ->
  case traverseLT' tree arr s of
    (# s', _ #) -> (# s', () #)

-- | Worker for 'traverseLT', in explicit state-token style for speed.
-- Returns the total weight of the traversed subtree (unboxed).
traverseLT' :: LexicoTreeItem -> STUArray s Item Frequency -> State# s -> (# State# s, Int# #)
traverseLT' Nil !arr s = (# s, 0# #)
traverseLT' (Node item child alt w@(I# w#)) !arr s0 =
  case traverseLT' child arr s0 of { (# s1, childw #) ->
  case traverseLT' alt   arr s1 of { (# s2, altw #)   ->
  case unsafeRead arr item of { ST readItem ->
  case readItem s2 of { (# s3, I# itemw #) ->
  -- This node's item gains its own weight plus the weight of the subtree
  -- of transactions passing through it.
  case unsafeWrite arr item (I# itemw + I# childw + w) of { ST writeItem ->
  -- BUGFIX: the write must continue from the state produced by the read
  -- (s3), not re-use the stale token s2.
  case writeItem s3 of { (# s4, _ #) ->
  (# s4, childw +# w# +# altw #) }}}}}}

{-|
  For a transaction database, a closed frequent itemset, and a candidate item
  for extension of this closed frequent itemset, recursively computes all
  the successor closed frequent itemsets by PPC-extension.
-}
lcmIter :: Int                  -- ^ Current depth in the search tree (for parallel optimisation purposes)
        -> LexicoTreeItem       -- ^ Transaction database.
        -> [Item]               -- ^ Input closed frequent itemset.
        -> Item                 -- ^ Candidate to extend the closed frequent itemset above.
        -> Item                 -- ^ CoreI item relative to the closed frequent itemset
        -> UArray Item Frequency -- ^ Array associating each item with its frequency
        -> Item                 -- ^ Maximal item
        -> Frequency            -- ^ Minimum support threshold
        -> [[Item]]             -- ^ Result : list of closed frequent itemsets. Each result is a list of items, the head of the list being the frequency of the itemset.
lcmIter prof cdb itemset candidate coreI occs maxItem thres =
  let -- Reduce database
      rdb = projectAndReduce cdb candidate occs
      -- Compute item occurrences in reduced database
      newOccs = occurrenceDeliverLT rdb maxItem
      -- Check which items actually appear in reduced database
      presentItems = filter (\i -> newOccs!i > 0) [0..maxItem]
      candidateFreq = occs!candidate
      -- Compute 100% frequent items, infrequent items, and future candidates
      (closedFrqItems, candidates, unfrqItems) =
        computeCandidates thres candidateFreq presentItems newOccs
      -- Update item occurrence table by suppressing 100% frequent and infrequent items
      newOccs' = suppressItems newOccs closedFrqItems unfrqItems
      -- Result closed frequent itemset = input closed frequent itemset + 100% frequent items
      closedItemset = sort (itemset ++ closedFrqItems)
      -- Only candidates with value lower than input candidate can be used for further extension on this branch
      smallCandidates = takeWhile (< candidate) candidates
  in if closedFrqItems /= []                      -- if there is a result ...
       then if last closedFrqItems <= candidate   -- ...and if it is OK to extend it
              then if length smallCandidates > 0  -- ...and if we have at least 1 possible extension
                     then -- recursively extend the candidates
                          if prof < 3 -- create parallel sparks only for low search space depth
                            then (candidateFreq:closedItemset)
                                 : concat (parBuffer 2 rdeepseq $
                                     map (\x -> lcmIter (prof+1) rdb closedItemset x candidate newOccs' maxItem thres)
                                         smallCandidates)
                            else (candidateFreq:closedItemset)
                                 : concat (map (\x -> lcmIter (prof+1) rdb closedItemset x candidate newOccs' maxItem thres)
                                               smallCandidates)
                     else [candidateFreq:closedItemset]
              else []
       else []

------------------------------- BENCHMARKING ------------------------------

{-|
  Same as 'lcmIter', with tunable parBuffer size and parallelism depth cutoff.
-}
lcmIterParBuffer :: Int                  -- ^ Current depth in the search tree (for parallel optimisation purposes)
                 -> LexicoTreeItem       -- ^ Transaction database.
                 -> [Item]               -- ^ Input closed frequent itemset.
                 -> Item                 -- ^ Candidate to extend the closed frequent itemset above.
                 -> Item                 -- ^ CoreI item relative to the closed frequent itemset
                 -> UArray Item Frequency -- ^ Array associating each item with its frequency
                 -> Item                 -- ^ Maximal item
                 -> Frequency            -- ^ Minimum support threshold
                 -> Int                  -- ^ Value for parBuffer
                 -> Int                  -- ^ Depth for cutting parallelism
                 -> [[Item]]             -- ^ Result : list of closed frequent itemsets. Each result is a list of items, the head of the list being the frequency of the itemset.
lcmIterParBuffer prof cdb itemset candidate coreI occs maxItem thres n d =
  let -- Reduce database
      rdb = projectAndReduce cdb candidate occs
      -- Compute item occurrences in reduced database
      newOccs = occurrenceDeliverLT rdb maxItem
      -- Check which items actually appear in reduced database
      presentItems = filter (\i -> newOccs!i > 0) [0..maxItem]
      candidateFreq = occs!candidate
      -- Compute 100% frequent items, infrequent items, and future candidates
      (closedFrqItems, candidates, unfrqItems) =
        computeCandidates thres candidateFreq presentItems newOccs
      -- Update item occurrence table by suppressing 100% frequent and infrequent items
      newOccs' = suppressItems newOccs closedFrqItems unfrqItems
      -- Result closed frequent itemset = input closed frequent itemset + 100% frequent items
      closedItemset = sort (itemset ++ closedFrqItems)
      -- Only candidates with value lower than input candidate can be used for further extension on this branch
      smallCandidates = takeWhile (< candidate) candidates
  in if closedFrqItems /= []                      -- if there is a result ...
       then if last closedFrqItems <= candidate   -- ...and if it is OK to extend it
              then if length smallCandidates > 0  -- ...and if we have at least 1 possible extension
                     then -- recursively extend the candidates
                          if prof < d -- create parallel sparks only for low search space depth
                            then (candidateFreq:closedItemset)
                                 : concat (parBuffer n rdeepseq $
                                     map (\x -> lcmIterParBuffer (prof+1) rdb closedItemset x candidate newOccs' maxItem thres n d)
                                         smallCandidates)
                            else (candidateFreq:closedItemset)
                                 : concat (map (\x -> lcmIterParBuffer (prof+1) rdb closedItemset x candidate newOccs' maxItem thres n d)
                                               smallCandidates)
                     else [candidateFreq:closedItemset]
              else []
       else []

---

{-|
  Same as 'lcmIter', with parMap strategy and tunable parallelism depth cutoff.
-}
lcmIterParMap :: Int                  -- ^ Current depth in the search tree (for parallel optimisation purposes)
              -> LexicoTreeItem       -- ^ Transaction database.
              -> [Item]               -- ^ Input closed frequent itemset.
              -> Item                 -- ^ Candidate to extend the closed frequent itemset above.
              -> Item                 -- ^ CoreI item relative to the closed frequent itemset
              -> UArray Item Frequency -- ^ Array associating each item with its frequency
              -> Item                 -- ^ Maximal item
              -> Frequency            -- ^ Minimum support threshold
              -> Int                  -- ^ Depth for cutting parallelism
              -> [[Item]]             -- ^ Result : list of closed frequent itemsets. Each result is a list of items, the head of the list being the frequency of the itemset.
lcmIterParMap prof cdb itemset candidate coreI occs maxItem thres d =
  let -- Reduce database
      rdb = projectAndReduce cdb candidate occs
      -- Compute item occurrences in reduced database
      newOccs = occurrenceDeliverLT rdb maxItem
      -- Check which items actually appear in reduced database
      presentItems = filter (\i -> newOccs!i > 0) [0..maxItem]
      candidateFreq = occs!candidate
      -- Compute 100% frequent items, infrequent items, and future candidates
      (closedFrqItems, candidates, unfrqItems) =
        computeCandidates thres candidateFreq presentItems newOccs
      -- Update item occurrence table by suppressing 100% frequent and infrequent items
      newOccs' = suppressItems newOccs closedFrqItems unfrqItems
      -- Result closed frequent itemset = input closed frequent itemset + 100% frequent items
      closedItemset = sort (itemset ++ closedFrqItems)
      -- Only candidates with value lower than input candidate can be used for further extension on this branch
      smallCandidates = takeWhile (< candidate) candidates
  in if closedFrqItems /= []                      -- if there is a result ...
       then if last closedFrqItems <= candidate   -- ...and if it is OK to extend it
              then if length smallCandidates > 0  -- ...and if we have at least 1 possible extension
                     then -- recursively extend the candidates
                          if prof < d -- create parallel sparks only for low search space depth
                            then (candidateFreq:closedItemset)
                                 : concat (parMap rdeepseq
                                             (\x -> lcmIterParMap (prof+1) rdb closedItemset x candidate newOccs' maxItem thres d)
                                             smallCandidates)
                            else (candidateFreq:closedItemset)
                                 : concat (map (\x -> lcmIterParMap (prof+1) rdb closedItemset x candidate newOccs' maxItem thres d)
                                               smallCandidates)
                     else [candidateFreq:closedItemset]
              else []
       else []

------------------- END OF BENCHMARKING FUNCTIONS -------------------------------

{-|
  For a given itemset being extended by a given candidate, computes the
  closure of this itemset and the candidates for further extension.
-}
computeCandidates :: Frequency            -- ^ Minimum support threshold
                  -> Frequency            -- ^ Frequency of item selected for extension (@candidate@)
                  -> [Item]               -- ^ List of items in the transaction database
                  -> UArray Item Frequency -- ^ Array associating to each item its frequency
                  -> ([Item], [Item], [Item])
                  -- ^ Result : (100% frequent items = closure,
                  --             candidates for further extension, infrequent items)
computeCandidates thres candidateFreq presentItems occs =
  let (frequentItems, unfrqItems) = partition (\i -> occs!i >= thres) presentItems
      -- items appearing in every transaction containing the candidate
      closedFrqItems = filter (\i -> occs!i == candidateFreq) frequentItems
      candidates     = frequentItems \\ closedFrqItems
  in (closedFrqItems, candidates, unfrqItems)

{-|
  Modifies an array associating items with their frequency, in order to give
  a frequency of 0 to a given list of items.

  NB : for performance reasons, this is REALLY a modification, made with
  unsafe operations ('unsafeThaw' reuses the input array in place).
-}
suppressItems :: UArray Item Frequency -- ^ Array associating an item with its frequency
              -> [Item]                -- ^ List of 100% frequent items
              -> [Item]                -- ^ List of infrequent items
              -> UArray Item Frequency -- ^ Initial array, with frequencies of 100% frequent items
                                       --   and infrequent items set to 0.
suppressItems occs closedItems unfreqItems =
  runST $ do
    -- Can be used in multithread because no concurrent write
    occsST <- unsafeThaw occs :: ST s (STUArray s Item Frequency)
    mapM_ (\i -> writeArray occsST i 0) closedItems
    mapM_ (\i -> writeArray occsST i 0) unfreqItems
    -- Can be used in multithread because no concurrent write
    unsafeFreeze occsST

{-|
  Creates a new, reduced transaction database by eliminating all items
  greater than the @candidate@ item, and all infrequent items.
-}
projectAndReduce :: LexicoTreeItem       -- ^ Original transaction database
                 -> Item                 -- ^ Candidate item, on which the projection is made
                 -> UArray Item Frequency -- ^ Array associating each item with its frequency in original transaction database.
                 -> LexicoTreeItem       -- ^ Result : Reduced transaction database
projectAndReduce Nil _ _ = Nil
projectAndReduce (Node e suiv alt w) !candidate occs
  | e > candidate  = Nil
  | e == candidate =
      let !(TreeWeight suiv' addWeight) = filterInfrequent suiv occs
      in Node e suiv' Nil (w + addWeight)
  | otherwise =  -- e < candidate
      let !alt'  = projectAndReduce alt  candidate occs
          !suiv' = projectAndReduce suiv candidate occs
      in if occs!e > 0
           then if notNil suiv' && notNil alt'
                  then Node e suiv' alt' 0
                  else if notNil suiv'
                         then Node e suiv' Nil 0
                         else alt'
           else if notNil suiv' && notNil alt'
                  then mergeAlts suiv' alt'
                  else if notNil suiv'
                         then suiv'
                         else alt'

{-|
  Suppresses all infrequent items from a transaction database expressed
  as a lexicographic tree, and returns a new lexicographic tree.
-}
filterInfrequent :: LexicoTreeItem       -- ^ Original transaction database
                 -> UArray Item Frequency -- ^ Array associating each item with its frequency in original
                                          --   transaction database. In this setting, an infrequent item
                                          --   has a frequency of 0 (because of preprocessing by 'suppressItems').
                 -> TreeWeight            -- ^ Result : (transaction database without infrequent items,
                                          --             weight to report in parent nodes)
filterInfrequent Nil _ = TreeWeight Nil 0
filterInfrequent (Node e suiv alt w) occs
  | occs!e > 0                  = TreeWeight (Node e suiv' alt' (w+ws)) wa
  | notNil suiv' && notNil alt' = TreeWeight (mergeAlts suiv' alt') w'
  | notNil alt'                 = TreeWeight alt' w'
  | notNil suiv'                = TreeWeight suiv' w'
  | otherwise                   = TreeWeight Nil w'
  where
    w' = w + ws + wa
    !(TreeWeight suiv' ws) = filterInfrequent suiv occs
    !(TreeWeight alt' wa)  = filterInfrequent alt  occs

-- | True iff the tree is not 'Nil'.
{-# INLINE notNil #-}
notNil :: LexicoTreeItem -> Bool
notNil Nil = False
notNil _   = True

-- | A (sub)tree paired with the weight to report to its parent node.
data TreeWeight = TreeWeight !LexicoTreeItem {-# UNPACK #-} !Weight

-----------------------------------------------------------------
-- LEXICOGRAPHIC TREE MANIPULATION
-----------------------------------------------------------------

{-|
  Type for a lexicographic tree, implementing an n-ary tree
  over a binary tree.
-}
data LexicoTreeItem
  = Nil                                 -- ^ Void node
  | Node {-# UNPACK #-} !Item
         !LexicoTreeItem               -- NB. experimental strictness annotation
         !LexicoTreeItem               -- NB. experimental strictness annotation
         {-# UNPACK #-} !Int
    -- ^ A node : item, next node (next in transaction), alternative node (other branch), weight
  deriving (Eq, Show)

{-|
  Returns the maximal item of a lexicographic tree.
-}
treeMax :: LexicoTreeItem -> Item
treeMax Nil                = -1
treeMax (Node e Nil Nil _) = e
treeMax (Node _ suiv Nil _) = treeMax suiv
treeMax (Node _ Nil alt _)  = treeMax alt
treeMax (Node _ suiv alt _) = max (treeMax suiv) (treeMax alt)

{-|
  Inserts a transaction in list format into the lexicographic tree.
  Automatically merges identical transactions.
  Performs suffix intersection.
-}
insertLT :: [Item]        -- ^ Transaction to insert into lexicographic tree
         -> Item          -- ^ "coreI" item, for suffix intersection.
         -> Int           -- ^ Weight of the transaction to insert
         -> LexicoTreeItem -- ^ Input lexicographic tree
         -> LexicoTreeItem -- ^ Result : a new lexicographic tree with the transaction inserted
insertLT [] _ _ lt  = lt
insertLT lst _ w Nil = createPath lst w
insertLT [x] i w (Node e suiv alt weight)
  | x < e  = Node x Nil (Node e suiv alt weight) w
  | x == e = Node e suiv alt (weight + w)
  | x > e  = Node e suiv (insertLT [x] i w alt) weight
insertLT (x:xs) i w (Node e suiv alt weight)
  | x < e  = Node x (createPath xs w) (Node e suiv alt weight) 0
  | x == e = if e /= i
               then Node e (insertLT xs i w suiv) alt weight
               else suffixIntersectionLT xs w (Node e suiv alt weight)
  | x > e  = Node e suiv (insertLT (x:xs) i w alt) weight

{-|
  From a transaction and its weight, directly creates a path-shaped
  lexicographic tree.
-}
createPath :: [Item]        -- ^ Transaction
           -> Int           -- ^ Weight of the transaction
           -> LexicoTreeItem -- ^ Result : a path-shaped lexicographic tree encoding the transaction
createPath []     _ = Nil
createPath [x]    w = Node x Nil Nil w
createPath (x:xs) w = Node x (createPath xs w) Nil 0

{-|
  Performs the "suffix intersection" operation with the suffix of a transaction
  and the corresponding part of a lexicographic tree.

  For more details, see "prefixIntersection" in Takeaki Uno's papers about LCM.

  Precondition: the input tree is a 'Node' (never 'Nil') whose @next@ part
  is a simple path; this is what 'insertLT' guarantees at the call site.
-}
suffixIntersectionLT :: [Item]        -- ^ Suffix of the transaction to insert.
                     -> Int           -- ^ Weight of the transaction to insert
                     -> LexicoTreeItem -- ^ (Sub-)lexicographic tree where the transaction must be inserted.
                                       --   The @next@ part (see data type comments) should be a simple path,
                                       --   it will be the target of intersection with the suffix.
                     -> LexicoTreeItem -- ^ Result : lexicographic tree where the suffix has been added,
                                       --   with correct intersections performed.
suffixIntersectionLT _   w (Node e Nil  alt weight) = Node e Nil alt (weight + w)
suffixIntersectionLT lst w (Node e suiv alt weight) =
  let (newSuiv, addWeight) = suffInterSuiv lst w suiv
  in Node e newSuiv alt (weight + addWeight)

{-|
  Intersects a list-shaped transaction and a path-shaped lexicographic tree.
  The result is a path-shaped lexicographic tree with weights correctly updated.
-}
suffInterSuiv :: [Item]        -- ^ Transaction as list
              -> Int           -- ^ Weight of the above transaction
              -> LexicoTreeItem -- ^ Path-shaped lexicographic tree
              -> (LexicoTreeItem, Int)
              -- ^ Result : (path-shaped lexicographic tree representing the intersection
              --             of transaction and input path,
              --             0 if intersection not [] / sum of weights else)
suffInterSuiv lst w suiv =
  let (lstSuiv, weightSuiv) = getLstSuiv suiv
      inter                 = List.intersect lstSuiv lst
  in if inter /= []
       then (createPath inter (weightSuiv + w), 0)
       else (Nil, weightSuiv + w)

{-|
  Collects all the nodes of a lexicographic tree in a list of elements.

  Precondition: the tree must be path-shaped (every @alt@ is 'Nil'),
  as produced by 'createPath'; other shapes are not handled.
-}
getLstSuiv :: LexicoTreeItem -- ^ Path-shaped lexicographic tree.
           -> ([Item], Int)  -- ^ Result : (list of elements in the path, sum of weights of nodes in the path)
getLstSuiv Nil = ([], 0)
getLstSuiv (Node e suiv Nil weight) =
  let (lst, w) = getLstSuiv suiv
  in (e:lst, w + weight)

{-|
  Merges two lexicographic trees.
-}
mergeAlts :: LexicoTreeItem -- ^ Tree 1
          -> LexicoTreeItem -- ^ Tree 2
          -> LexicoTreeItem -- ^ Return : Tree 1 merged with Tree 2
mergeAlts Nil lt = lt
mergeAlts lt Nil = lt
mergeAlts (Node e1 suiv1 alt1 w1) (Node e2 suiv2 alt2 w2)
  | e1 < e2  = Node e1 suiv1 (mergeAlts alt1 (Node e2 suiv2 alt2 w2)) w1
  | e1 > e2  = Node e2 suiv2 (mergeAlts (Node e1 suiv1 alt1 w1) alt2) w2
  | otherwise = Node e1 (mergeAlts suiv1 suiv2) (mergeAlts alt1 alt2) (w1 + w2)

------------------------------------------------------------------