{- |

This package easily lets you create high quality histogram plots from your data in Haskell.  It automatically bins your data using whichever binning strategy you'd like, then plots the data.  It uses the gnuplot package to do all the actual graphing, so any options that work for making gnuplot pretty will also work here.

Here's a brief example that should get you going:

>import qualified Graphics.Gnuplot.Frame.OptionSet as Opts
>
>input = [1,0.2,0.23,0.15,0.1,0.88,0.89,0.33,0.05,0.33,0.45,0.99,0.01,0.01,0.5]
>
>simple = do
>    let hist = histogram binSturges input
>    plot "simple.png" hist
>
>advanced = do
>    let hist = histogram binSqrt input
>    let opts = Opts.title "I'm a histogram!" $ 
>               Opts.yLabel "Why?" $ 
>               Opts.xLabel "Because!" $ 
>               defOpts hist
>    plotAdv "advanced.eps" opts hist

-}

module Graphics.Histogram 
    ( 
    
    -- * Creating Histograms
    histogram, histogramBinSize, histogramNumBins,
    
    -- * Binning Strategies
    binSturges, binDoane, binSqrt, binScott, binFreedmanDiaconis,
    
    -- * Graphing Histograms
    PlotOptions, plot, plotAdv, defOpts
    )
-- module Main
    where

import Data.List
import Data.Monoid
import Debug.Trace
import qualified Data.Map as Map

import Numeric

import GHC.IO.Exception

import qualified Graphics.Gnuplot.Advanced as Plot
import qualified Graphics.Gnuplot.Terminal.X11 as X11
import qualified Graphics.Gnuplot.Terminal.PNG as PNG
import qualified Graphics.Gnuplot.Terminal.SVG as SVG
import qualified Graphics.Gnuplot.Terminal.PostScript as EPS

import qualified Graphics.Gnuplot.Frame.Option as Option
import qualified Graphics.Gnuplot.Frame as Frame
import qualified Graphics.Gnuplot.Frame.OptionSet as Opts
import qualified Graphics.Gnuplot.Frame.OptionSet.Style as OptsStyle
import qualified Graphics.Gnuplot.Frame.OptionSet.Histogram as Histogram

import qualified Graphics.Gnuplot.Plot.TwoDimensional as Plot2D
import qualified Graphics.Gnuplot.Graph.TwoDimensional as Graph2D

import qualified Graphics.Gnuplot.LineSpecification as LineSpec
import qualified Graphics.Gnuplot.ColorSpecification as Color

-------------------------------------------------------------------------------

-- | Holds all the information needed to plot your histogram.  You shouldn't need to worry about this at all.
data Histogram = Histogram Double Double [(Double,Int)]
--     deriving Show

-------------------------------------------------------------------------------
-- Bin counters; check out http://en.wikipedia.org/wiki/Histogram#Number_of_bins_and_width

type BinningStrat = [Double] -> Int

-- | This is only a helper function to convert strategies that specify bin width into strategies that specify the number of bins
stratFromBinWidth :: [Double] -> Double -> Int
stratFromBinWidth xs h = ceiling $ ((maximum xs) - (minimum xs))/h

-- | Sturges' binning strategy is the least computational work, but recommended for only normal data
binSturges :: BinningStrat
binSturges xs = ceiling $ (log n)/(log 2) + 1
    where n = fromIntegral $ length xs
          
-- | Doane's binning strategy extends Sturges' for non-normal data.  It takes a little more time because it must calculate the kurtosis (peakkiness) of the distribution
binDoane :: BinningStrat
binDoane xs = ceiling $ 1 + (log n) + (log $ 1 + a*((n/6)**(1/2)))
    where 
        n = fromIntegral $ length xs -- :: Double
        a = kurtosis xs -- :: Double
        
-- | Using the sqrt of the number of samples is not supported by any theory, but is commonly used by excell and other histogram making software
binSqrt :: BinningStrat
binSqrt xs = round $ sqrt n
    where 
        n = fromIntegral $ length xs

-- | Scott's rule is the optimal solution for normal data, but requires more computation than Spurges'
binScott :: BinningStrat
binScott xs = stratFromBinWidth xs $ 3.5*(stddev xs) / (n**(1/3))
    where 
        n = fromIntegral $ length xs
        
-- | The Freedman-Diaconis rule is less susceptible to outliers than Scott's and is also used on \"normalish\" data
binFreedmanDiaconis :: BinningStrat
binFreedmanDiaconis xs = stratFromBinWidth xs $ 3.5*(stddev xs) / (n**(1/3))
    where 
        n = fromIntegral $ length xs

-------------------------------------------------------------------------------
-- create the histogram data

-- | Creates a histogram that's ready for plotting.  Call it with one of the binning strategies that is appropriate to the type of data you have.  If you don't know, then try using binSturges.
histogram :: BinningStrat -> [Double] -> Histogram
histogram strat xs = histogramNumBins (strat xs) xs

-- | Create a histogram by specifying the exact bin size. 
-- You probably don't want to use this function, and should use histogram with an appropriate binning strategy.
histogramBinSize :: Double -> [Double] -> Histogram
histogramBinSize size xs = Histogram m s $ fillhist size $ histbin size $ bin size xs
    where
        m = mean xs
        s = stddev xs

-- | Create a histogram by specifying the exact number of bins
-- You probably don't want to use this function, and should use histogram with an appropriate binning strategy.
histogramNumBins :: Int -> [Double] -> Histogram
histogramNumBins n xs =
    histogramBinSize size xs
    where
        size = (fromIntegral $ firstdigit diff) *((10) ** (fromIntegral $ exponent10 diff))
        diff = if diff_test==0
                  then 1
                  else diff_test
        diff_test = ((maximum xs)-(minimum xs))/(fromIntegral n)

        firstdigit dbl = floor $ dbl/((10) ** (fromIntegral $ exponent10 dbl))
        exponent10 dbl = floor $ log10 dbl
        log10 x = (log x) / (log 10)

-- helpers

   -- histbin does all the binning for the histogram
histbin :: Double -> [Double] -> [(Double,Int)]
histbin size xs = Map.toList $ Map.fromList [ (head l, length l) | l <- group (sort xs) ]

   -- histbin bins all the numbers in the histogram, but it ignores any columns with zero elements.
   -- fillhist adds those zero element columns
fillhist :: Double -> [(Double,Int)] -> [(Double,Int)]
fillhist size ((a,b):[]) = [(roundFloat a,b)]
fillhist size ((a,b):xs) = 
    if abs (next-a')<0.0001
       then (roundFloat a,b):(fillhist size xs)
       else (roundFloat a,b):(fillhist size $ (next,0):xs)
    where
        a' = fst $ head xs
        b' = snd $ head xs
        next = roundFloat (a+size)

-- | bin "rounds" every number into the closest number below it that is divisible by size
-- bin :: (Num a, RealFrac a) => a -> [a] -> [a]
bin :: Double -> [Double] -> [Double]
bin size xs = map (\x -> size*(fromIntegral $ floor (x/size))) xs

roundFloat :: Double -> Double
roundFloat num = read $ showFFloat (Just 3) num ""

-------------------------------------------------------------------------------
-- IO

-- | Options for a plot, as specified in the gnuplot library
type PlotOptions = Opts.T (Graph2D.T Int Double)
-- type PlotOptions = Graphics.Gnuplot.Frame.OptionSet.T (Graphics.Gnuplot.Graph.TwoDimensional.T Int Double)

-- | Plots your histogram using gnuplot.  If the filename is empty, then it will open a window and display the histogram on screen.  Otherwise, the filetype is automatically determined by the extension.  Supported file types are .png, .svg (vector graphics), and .eps (PostScript).
plot :: 
    String -> -- Filename @a@
    Histogram -> -- Data to be plotted
    IO GHC.IO.Exception.ExitCode
plot file histdata = plotAdv file (defOpts histdata) histdata

-- | Just like "plot", except you may specify additional options from the gnuplot library.  For example, you could add labels and a title.
plotAdv :: String -> -- Filename
           PlotOptions -> -- gnuplot options from Graphics.Gnuplot.Frame.OptionSet
           Histogram -> -- Data to be plotted
           IO GHC.IO.Exception.ExitCode
-- plotAdv file opts histdata = Plot.plot (SVG.cons $ file) $ histgen histdata opts
plotAdv file opts histdata = cmd
    where
        n = length file
        cmd = 
            if n==0
                then Plot.plot X11.cons $ histgen histdata opts
                else if (file !! (n-3))=='s' && (file !! (n-2))=='v' && (file !! (n-1))=='g' 
                    then Plot.plot (SVG.cons $ file) $ histgen histdata opts
                    else if (file !! (n-2))=='p' && (file !! (n-1))=='s' 
                        then Plot.plot (EPS.cons $ file) $ histgen histdata opts
                        else Plot.plot (PNG.cons $ file) $ histgen histdata opts
        
-- | Default plot display parameters
defOpts :: Histogram -> PlotOptions
defOpts (Histogram m s xs) = 
--     Opts.add (Option.custom "xtics" "") ["autojustify"] $
    Histogram.clusteredGap 0 $
    OptsStyle.fillBorderLineType (-1) $
    OptsStyle.fillSolid $
    Opts.xTicks2d xlabels $
    Opts.deflt
    where
        xlabels = zip (map (show . fst) xs) [0..]
        
histgen :: Histogram -> Opts.T (Graph2D.T Int Double) -> Frame.T (Graph2D.T Int Double)
histgen (Histogram m s xs) frameOpts =
    Frame.cons frameOpts $
    mconcat $ concat 
        -- this is the bar chart
        [ map (\(title,dat) ->
            fmap (Graph2D.lineSpec (LineSpec.lineColor Color.red $ LineSpec.title title LineSpec.deflt)) $
            Plot2D.list Graph2D.histograms dat) $
                ("", map (fromIntegral . snd) xs) :
                []
                
        -- this is the gaussian
{-        , map (\(title,dat) ->
            fmap (Graph2D.lineSpec (
--                 OptsStyle.custom "smooth" "bezier" $
                LineSpec.title "" $ 
                LineSpec.lineWidth 3 $
                LineSpec.deflt
                )) $
            Plot2D.list Graph2D.listLines dat) $ 
--                 ("", map snd xs):
                ("", map (\(x,y) -> normalscale*(normalpdf m s x)) xs):
--                 ("", replicate 20 20):
                []-}
        ]
    where
        normalscale = (fromIntegral $ maximum $ map snd xs)/(normalpdf m s m)

-- normalcoord m s (x,y) = normalpdf m s x

normalpdf :: Double -> Double -> Double -> Double
normalpdf m s x = (1/(s*(sqrt $ 2*pi)))*(exp $ -(x-m)^2/(2*s^2))

------------------------------------------------------------------------------
-- simple math functions
-- taken from package hstats, which wouldn't fully compile, so I just copied these here

-- |Numerically stable mean
mean :: Floating a => [a] -> a
mean x = fst $ foldl' (\(m, n) x -> (m+(x-m)/(n+1),n+1)) (0,0) x

-- |Standard deviation of sample
stddev :: (Floating a) => [a] -> a
stddev xs = sqrt $ var xs

-- |Sample variance
var :: (Floating a) => [a] -> a
var xs = (var' 0 0 0 xs) / (fromIntegral $ length xs - 1)
    where
      var' _ _ s [] = s
      var' m n s (x:xs) = var' nm (n + 1) (s + delta * (x - nm)) xs
         where
           delta = x - m
           nm = m + delta/(fromIntegral $ n + 1)


-- | kurtosis is taken from wikipedia's definition
kurtosis :: (Floating a) => [a] -> a
kurtosis xs = ((1/n) * (sum [(x-x_bar)^4 | x <- xs]))
            / ((1/n) * (sum [(x-x_bar)^2 | x <- xs]))^2 -3
    where 
        n = fromIntegral $ length xs
        x_bar = mean xs