module Data.Text.WordCount
( topN
, displayWords
, filterTop
, small
, common
, makeFile
, makeDistribution
, processFile
, globFile
, buildFreq
, indexed
) where
import Control.Arrow ((&&&))
import Control.Composition (on)
import Control.Lens hiding (argument,
indexed)
import Data.Char
import qualified Data.IntMap as IM
import Data.List
import qualified Data.Map.Lazy as M
import Data.Map.Lens
import Data.Monoid
import Data.Ord
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import Data.Text.WordCount.FileRead
import Data.Tuple
import Graphics.Rendering.Chart.Backend.Diagrams
import Graphics.Rendering.Chart.Easy hiding (argument,
indexed)
indexed :: TL.Text -> IM.IntMap [TL.Text]
indexed = orderM . buildFreq
topN :: Int -> TL.Text -> [(Int,TL.Text)]
topN n = take n . order . buildFreq
filterTop :: Int -> (TL.Text -> Bool) -> TL.Text -> [(Int, TL.Text)]
filterTop n p = take n . filter (p . snd) . order . buildFreq
small :: TL.Text -> Bool
small = (> 5) . TL.length . TL.filter (/= '\'')
common :: TL.Text -> Bool
common = flip elem ["the","and","a","an","or","not","but","on","so","if","in","that","this","for"]
displayWords :: [(Int,TL.Text)] -> T.Text
displayWords [] = ""
displayWords (pair:pairs) = display pair <> "\n" <> displayWords pairs
where display (n,str) = (T.pack . show) n <> ": " <> TL.toStrict str
buildFreq :: TL.Text -> M.Map TL.Text Int
buildFreq = count . filter (/=" ") . TL.split (`elem` (" \n;:,./" :: String)) . TL.map toLower
orderM :: M.Map TL.Text Int -> IM.IntMap [TL.Text]
orderM = IM.fromList . fmap ((fst . head) &&& fmap snd) . groupBy go . sortBy (flip (comparing fst)) . fmap swap . M.toList
where go :: (Int, TL.Text) -> (Int, TL.Text) -> Bool
go = on (==) fst
order :: M.Map TL.Text Int -> [(Int, TL.Text)]
order = sortBy (flip (comparing fst)) . fmap swap . M.toList
count :: [TL.Text] -> M.Map TL.Text Int
count words = foldr ((.) . go) id words M.empty
where go word m = case m^.at word of
Nothing -> at word ?~ 1 $ m
_ -> ix word %~ (+1) $ m
count' :: [TL.Text] -> M.Map TL.Text Int
count' = M.fromListWith (+) . (`zip` repeat 1)
count'' :: [TL.Text] -> [(TL.Text, Int)]
count'' = fmap (head &&& length) . group . sort
makeFile :: [(Int,TL.Text)] -> FilePath -> IO ()
makeFile points out = toFile def out (makeDistribution points)
makeDistribution points = do
let values = addIndexes (fmap fst points)
let alabels = fmap (TL.unpack . snd) points
let fillStyle = solidFillStyle (opaque lightblue)
layout_title .= "Word Frequencies"
layout_x_axis . laxis_generate .= autoIndexAxis alabels
layout_y_axis . laxis_override .= axisGridHide
layout_left_axis_visibility . axis_show_ticks .= False
plot $ fmap plotBars $ liftEC $ do
plot_bars_values .= fmap (over _2 pure) values
plot_bars_item_styles .= pure (fillStyle, Nothing)