{-
    Copyright (C) 2013-2015 Dr. Alistair Ward

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
-}
{- |
 [@AUTHOR@] Dr. Alistair Ward

 [@DESCRIPTION@] Profiles lists of file-sizes.
-}

module FishFood.Profiler(
-- * Types
-- ** Type-synonyms
--  Probability,
--  Result,
--  FileSizeDistribution,
-- * Functions
    calculateFileSizeDistribution,
    formatFileSizeDistribution,
-- ** Accessors
    getFileSize,
    getValue
) where

import Control.Arrow((&&&),(***))
import qualified Control.Monad.Writer
import qualified Data.Default
import qualified Data.List
import qualified Data.Map
import qualified Data.Maybe
import qualified FishFood.Data.CommandOptions as Data.CommandOptions
import qualified FishFood.Data.File as Data.File
import FishFood.Data.Verbosity()
import qualified Text.Printf

-- | A fraction within the closed unit-interval.
type Probability = Double

-- | Pairs a file-size with either the number of files of that size, or the probability that a file has that size.
type Result = (Data.File.FileSize, Either Int {-file-count-} Probability)

-- | Accessor: the file-size component of a 'Result'.
getFileSize :: Result -> Data.File.FileSize
getFileSize (fileSize, _) = fileSize

-- | Accessor: the file-count or probability component of a 'Result'.
getValue :: Result -> Either Int {-file-count-} Probability
getValue (_, value) = value

-- | Represents either a /Probability Mass Function/ or a /Frequency-distribution/.
type FileSizeDistribution = [Result]

-- | Calculates either the /Probability Mass Function/ or /Frequency-distribution/ for the specified files.
calculateFileSizeDistribution :: (Floating ratio, RealFrac ratio)
  => Data.CommandOptions.CommandOptions ratio -- ^ Supplies the bin-size delta, whether a probability mass function is required, and the number of decimal digits used in the logged summary.
  -> [Data.File.FileSize] -- ^ The sizes of the files to profile; an empty list results in an empty distribution.
  -> Control.Monad.Writer.Writer [String] FileSizeDistribution -- ^ The distribution ordered by increasing file-size, after logging a one-line summary of the file-count, mean & standard-deviation.
calculateFileSizeDistribution commandOptions fileSizes = do
  Control.Monad.Writer.tell [Text.Printf.printf "Files=%d, mean=%.*f, standard-deviation=%.*f" nFiles nDecimalDigits mean nDecimalDigits standardDeviation]

  return {-to Writer-monad-} $ case fileSizes of
    [] -> [] -- Nothing to profile; also guards the partial functions 'minimum' & 'maximum' used below.
    anyFileSize : _
      | standardDeviation == 0 -> [
          (
            anyFileSize, -- All the files have the same size, so any one of them is representative.
            if deriveProbabilityMassFunction
              then Right 1 -- i.e. certainty.
              else Left nFiles -- i.e. all.
          )
        ]
      | otherwise -> Data.Map.toList $ Data.Map.map toValue fileCountsByMinimumFileSize
  where
    binSizeDelta = Data.CommandOptions.getBinSizeDelta commandOptions
    deriveProbabilityMassFunction = Data.CommandOptions.getDeriveProbabilityMassFunction commandOptions
    nDecimalDigits = Data.CommandOptions.getNDecimalDigits commandOptions

    mean, standardDeviation :: Double
    (nFiles, mean, standardDeviation) = Data.File.getFileSizeStatistics fileSizes

    -- The increment between arithmetically spaced bins, defaulting to the standard-deviation.
    getDefaultedBinSizeIncrement :: Maybe Data.File.FileSize -> Data.File.FileSize
    getDefaultedBinSizeIncrement = Data.Maybe.fromMaybe $ round standardDeviation `max` 1 {-minimum increment-} -- CAVEAT: guard against subsequent division by zero.

    -- Maps a file-size to the index of its bin; either the quotient by the bin-size increment, or the exponent of the bin-ratio.
    -- Each bin spans a semi-closed interval, so fractional values are rounded down to match the lower bin.
    toBinIndex :: Data.File.FileSize -> Data.File.FileSize
    toBinIndex fileSize = either
      (div {-round down-} fileSize . getDefaultedBinSizeIncrement {-non-zero-})
      (floor {-round down-} . (`logBase` fromIntegral fileSize)) -- CAVEAT: converts file-size 0, to bin-index -infinity.
      binSizeDelta

    -- The inverse of 'toBinIndex'; represents each bin by the minimum file-size it can accept.
    binIndexToFileSize :: Data.File.FileSize -> Data.File.FileSize
    binIndexToFileSize binIndex = either
      ((* binIndex) . getDefaultedBinSizeIncrement)
      (ceiling {-round up-} . (^^ binIndex)) -- Converts bin-index -infinity, back to file-size 0.
      binSizeDelta

    calculatedBinIndices :: [Data.File.FileSize]
    calculatedBinIndices = map toBinIndex fileSizes

    -- A zero file-count for every bin-index spanned by the data, so that empty bins are reported too.
    -- The keys are enumerated as bin-indices, matching those in 'calculatedBinIndices'; previously candidate keys were generated on the file-size scale, so most empty bins were omitted unless the increment was one.
    -- The enumeration starts no lower than zero, since the only lower index is the one for file-size 0 under a bin-ratio, which is inserted on demand when the files are counted.
    initialFrequencyDistribution :: Data.Map.Map Data.File.FileSize Int
    initialFrequencyDistribution = Data.Map.fromAscList $ zip [max 0 (minimum calculatedBinIndices) .. maximum calculatedBinIndices] (repeat 0 {-the initial file-count-})

    -- Counts the files allocated to each bin, then re-keys the bins by their minimum file-size.
    -- Counts are summed should distinct bin-indices map to the same file-size (possible for bin-ratios close to one), rather than all but one being discarded.
    fileCountsByMinimumFileSize :: Data.Map.Map Data.File.FileSize Int
    fileCountsByMinimumFileSize = Data.Map.mapKeysWith (+) binIndexToFileSize $ Data.List.foldl'
      (\fileCounts binIndex -> Data.Map.insertWith (+) binIndex 1 fileCounts)
      initialFrequencyDistribution
      calculatedBinIndices

    -- Presents a file-count either as-is, or as a fraction of the total number of files.
    toValue :: Int -> Either Int {-file-count-} Probability
    toValue
      | deriveProbabilityMassFunction = Right . (/ fromIntegral nFiles {-non-zero-}) . fromIntegral
      | otherwise = Left

-- | Formats a file-size distribution as text.
--
-- Each 'Result' becomes one row containing the right-aligned file-size & value, separated by a space; rows are separated by new-lines.
-- When the verbosity exceeds the default, a row of column-headers & a separator-bar are prepended.
-- Probabilities are shown to the configured number of decimal digits.
formatFileSizeDistribution :: Data.CommandOptions.CommandOptions ratio -> FileSizeDistribution -> String
formatFileSizeDistribution commandOptions = Data.List.intercalate "\n" . map joinColumns . prependHeadings . map formatRow
  where
    joinColumns :: (String, String) -> String
    joinColumns (fileSizeColumn, valueColumn) = fileSizeColumn ++ " " ++ valueColumn

    -- Both kinds of value are padded to the same width, so that the column aligns with its header & separator-bar.
    formatRow :: Result -> (String, String)
    formatRow = Text.Printf.printf "%*d" fileSizeWidth *** either
      (Text.Printf.printf "%*d" valueWidth)
      (Text.Printf.printf "%*.*f" valueWidth $ Data.CommandOptions.getNDecimalDigits commandOptions)

    prependHeadings :: [(String, String)] -> [(String, String)]
    prependHeadings
      | Data.CommandOptions.getVerbosity commandOptions > Data.Default.def = ([headerRow, separatorRow] ++)
      | otherwise = id

    padTo :: Int -> String -> String
    padTo = Text.Printf.printf "%*s"

    headerRow, separatorRow :: (String, String)
    headerRow = (padTo fileSizeWidth . fst &&& padTo valueWidth . snd) headers -- Column-headers.
    separatorRow = (`replicate` '=') *** (`replicate` '=') $ columnWidths -- Separator-bar.

    headers :: (String, String)
    headers = (
      "Bin-size",
      if Data.CommandOptions.getDeriveProbabilityMassFunction commandOptions
        then "Probability"
        else "Frequency"
     )

    fileSizeWidth, valueWidth :: Int
    columnWidths@(fileSizeWidth, valueWidth) = (`max` 10) . length *** length $ headers -- CAVEAT: the file-size may be longer than its header, so define a minimum width for that column.