{-# LANGUAGE FlexibleContexts #-}

module Data.Text.WordCount
    ( topN
    , displayWords
    , filterTop
    -- * Filters
    , small
    , common
    -- * For making graphs
    , makeFile
    , makeDistribution
    -- * File processing with pandoc
    , processFile
    , globFile
    -- * Low level-ish
    , buildFreq
    -- * Keyed maps
    , indexed
    ) where

import           Control.Arrow                             ((&&&))
import           Control.Composition                       (on)
import           Control.Lens                              hiding (argument,
                                                            indexed)
import           Data.Char
import qualified Data.IntMap                               as IM
import           Data.List
import qualified Data.Map.Lazy                             as M
import           Data.Map.Lens
import           Data.Monoid
import           Data.Ord
import qualified Data.Text                                 as T
import qualified Data.Text.Lazy                            as TL
import           Data.Text.WordCount.FileRead
import           Data.Tuple
import           Graphics.Rendering.Chart.Backend.Diagrams
import           Graphics.Rendering.Chart.Easy             hiding (argument,
                                                            indexed)

-- | Return an `IntMap` containing words indexed by their frequencies.
indexed :: TL.Text -> IM.IntMap [TL.Text]
indexed = orderM . buildFreq

-- | Return top n words and their frequencies
--
-- @
-- >>> topN 2 "hello hello goodbye it is time is it why why why it it"
-- [(4,"it"),(3,"why")]
-- @
topN :: Int -> TL.Text -> [(Int,TL.Text)]
topN n = take n . order . buildFreq

-- | Return the top n words, with some filter applied.
filterTop :: Int -> (TL.Text -> Bool) -> TL.Text -> [(Int, TL.Text)]
filterTop n p = take n . filter (p . snd) . order . buildFreq

small :: TL.Text -> Bool
small = (> 5) . TL.length . TL.filter (/= '\'')

common :: TL.Text -> Bool
common = flip elem ["the","and","a","an","or","not","but","on","so","if","in","that","this","for"]

displayWords :: [(Int,TL.Text)] -> T.Text
displayWords = mconcat . fmap (\(i, str) -> (T.pack . show) i <> ": " <> TL.toStrict str <> "\n")

buildFreq :: TL.Text -> M.Map TL.Text Int
buildFreq = count . filter (/=" ") . TL.split (`elem` (" \n;:,./" :: String)) . TL.map toLower

orderM :: M.Map TL.Text Int -> IM.IntMap [TL.Text]
orderM = IM.fromList . fmap ((fst . head) &&& fmap snd) . groupBy go . sortBy (flip (comparing fst)) . fmap swap . M.toList
    where go :: (Int, TL.Text) -> (Int, TL.Text) -> Bool
          go = on (==) fst

order :: M.Map TL.Text Int -> [(Int, TL.Text)]
order = sortBy (flip (comparing fst)) . fmap swap . M.toList

count :: [TL.Text] -> M.Map TL.Text Int
count words = foldr ((.) . go) id words M.empty
    where go word m = case m^.at word of
            Nothing -> at word ?~ 1 $ m
            _       -> ix word %~ (+1) $ m

count' :: [TL.Text] -> M.Map TL.Text Int
count' = M.fromListWith (+) . (`zip` repeat 1)

count'' :: [TL.Text] -> [(TL.Text, Int)]
count'' = fmap (head &&& length) . group . sort

-- | Make a bar graph from the word frequencies
--
-- @
-- makeFile :: IO ()
-- makeFile [(4,"it"),(3,"why")] "out.html"
-- @
makeFile :: [(Int,TL.Text)] -> FilePath -> IO ()
makeFile points out = toFile def out (makeDistribution points)

makeDistribution points = do
    let values = addIndexes (fmap fst points)
    let alabels = fmap (TL.unpack . snd) points
    let fillStyle = solidFillStyle (opaque lightblue)
    layout_title .= "Word Frequencies"
    layout_x_axis . laxis_generate .= autoIndexAxis alabels
    layout_y_axis . laxis_override .= axisGridHide
    layout_left_axis_visibility . axis_show_ticks .= False
    plot $ fmap plotBars $ liftEC $ do
        plot_bars_values .= fmap (over _2 pure) values
        plot_bars_item_styles .= pure (fillStyle, Nothing)