{- | Emping 0.4, 0.5 (provisional) Module Codec transforms a list of lists of String attributes and values into a list of lists of Int tuples. The first list of Strings must be the attribute names (may contain blankId), the following lists must contain attribute values or blankId. The module also provides an Array to display the names of coded attributes and values, and functions to deal with table rows that are equal (duplicates). Fri 04 Apr 2008 07:17:45 PM CEST -} module Codec (tableCode, tableToArray, partDups, checkNoDups, factsDups, factsUniques ) where import Data.List (transpose, elemIndex) import Control.Monad.State import Data.Array import Aux ( AVp, partitionBy ) import CSVParse ( blankId ) -- | check whether a value string is in a list, add it if not, and return its index. A blankId returns an index of -1 and leaves the value list unchanged. repl1 :: String -> State [String] Int repl1 x | x == blankId = State (\col -> (-1, col)) | otherwise = State (\col -> case elemIndex x col of Nothing -> ((length col), col ++ [x]) Just i -> (i, col)) -- | replace value strings with indices to a unique list of value strings. Note: the blankId is represented by -1 getInds :: [String] -> [Int] getInds strcol = evalState (mapM repl1 strcol) [] -- | get the list of values itself. Note: the blankId is not represented in this list. getVals :: [String] -> [String] getVals strcol = execState (mapM repl1 strcol) [] -- | get all unique value lists from the original table avLs :: [[String]] -> [[String]] avLs [] = error "Codec avLs: empty String list" avLs (_:[]) = error "Codec avLs : only one element in String list" avLs tb = ((map getVals) . transpose . tail) tb {- | tableCode produces a list of lists of attribute-value tuples, coded as Int. A list of tuples represents a conjunction of attribute values, also called a fact list. first of each tuple indexes into the attribute-value array (and the first element of that tuple, the attribute name) second of each tuble indexes into the second element of an array tuple, the list of value names tableCode represents blankId as -1, which is not in the string array. -} tableCode :: [[String]] -> [[AVp]] tableCode [] = error "Codec tableCode: empty String list" tableCode (_:[]) = error "Codec tableCode : only one element in String list" tableCode tb = map (zip [0..]) (values (tail tb)) where values = transpose . (map getInds) . transpose {- | tableToArray returns an array of attribute value-lists. The first element of a tuple is an attribute name, the second the list of its unique values. An attribute coding must be an index to the array (and the String) A value coding must be an index to the list of Strings. Note: blankId is not represented in the Array. -} tableToArray :: [[String]] -> Array Int (String, [String]) tableToArray tb = listArray (0, (length (head tb)) -1) atplusvals where atplusvals = zip (head tb) (avLs tb) -- | partDups partitions a fact list into a list of duplicates and their counts. This is a data check, duplicates do not effect the reduction, but should be removed initially. partDups :: [[AVp]] -> [([[AVp]], Int)] partDups origs = [(x, length x) | x <- parts ] where parts = partitionBy (\x y -> x == y ) origs -- | check the partition of possible Dups for any Dups (True if no Dups) checkNoDups :: [([[AVp]], Int)] -> Bool checkNoDups parts | (filter (\x -> (snd x) > 1) parts) == [] = True | otherwise = False -- | get a representative of each double in the facts with its occurrence count factsDups :: [([[AVp]],Int)] -> [([AVp],Int)] factsDups dupfacs = [ ((head . fst) x, snd x) | x <- dbls ] where dbls = [ x | x <- dupfacs, (snd x) > 1 ] -- | factsUniques returns the head of each partition of a Dups (including singles)list. This function removes any double data entries, if they exist. factsUniques :: [([[AVp]], Int)] -> [[AVp]] factsUniques dupfacs = map (head . fst) dupfacs