{-# LANGUAGE FlexibleContexts #-} module Data.Text.WordCount ( topN , displayWords , filterTop -- * Filters , small , common -- * For making graphs , makeFile , makeDistribution -- * File processing with pandoc , processFile , globFile -- * Keyed maps , indexed ) where import Control.Arrow ((&&&)) import Control.Composition (on) import Control.Lens hiding (argument, indexed) import Data.Char import qualified Data.IntMap as IM import Data.List import qualified Data.Map.Lazy as M import Data.Map.Lens import Data.Monoid import Data.Ord import qualified Data.Text as T import qualified Data.Text.Lazy as TL import Data.Text.WordCount.FileRead import Data.Tuple import Graphics.Rendering.Chart.Backend.Diagrams import Graphics.Rendering.Chart.Easy hiding (argument, indexed) -- | Return an `IntMap` containing words indexed by their frequencies. indexed :: TL.Text -> IM.IntMap [TL.Text] indexed = orderM . buildFreq -- | Return top n words and their frequencies -- -- @ -- >>> topN 2 "hello hello goodbye it is time is it why why why it it" -- [(4,"it"),(3,"why")] -- @ topN :: Int -> TL.Text -> [(Int,TL.Text)] topN n = take n . order . buildFreq -- | Return the top n words, with some filter applied. filterTop :: Int -> (TL.Text -> Bool) -> TL.Text -> [(Int, TL.Text)] filterTop n p = take n . filter (p . snd) . order . buildFreq small :: TL.Text -> Bool small = (> 5) . TL.length . TL.filter (/= '\'') common :: TL.Text -> Bool common = flip elem ["the","and","a","an","or","not","but","on","so","if","in","that","this","for"] displayWords :: [(Int,TL.Text)] -> T.Text displayWords = mconcat . fmap (\(i, str) -> (T.pack . show) i <> ": " <> TL.toStrict str <> "\n") buildFreq :: TL.Text -> M.Map TL.Text Int buildFreq = count . filter (/=" ") . TL.split (`elem` (" \n;:,./" :: String)) . TL.map toLower orderM :: M.Map TL.Text Int -> IM.IntMap [TL.Text] orderM = IM.fromList . fmap ((fst . head) &&& fmap snd) . groupBy go . sortBy (flip (comparing fst)) . fmap swap . M.toList where go :: (Int, TL.Text) -> (Int, TL.Text) -> Bool go = on (==) fst order :: M.Map TL.Text Int -> [(Int, TL.Text)] order = sortBy (flip (comparing fst)) . fmap swap . M.toList count :: [TL.Text] -> M.Map TL.Text Int count words = foldr ((.) . go) id words M.empty where go word m = case m^.at word of Nothing -> at word ?~ 1 $ m _ -> ix word %~ (+1) $ m count' :: [TL.Text] -> M.Map TL.Text Int count' = M.fromListWith (+) . (`zip` repeat 1) count'' :: [TL.Text] -> [(TL.Text, Int)] count'' = fmap (head &&& length) . group . sort -- | Make a bar graph from the word frequencies -- -- @ -- makeFile :: IO () -- makeFile [(4,"it"),(3,"why")] "out.html" -- @ makeFile :: [(Int,TL.Text)] -> FilePath -> IO () makeFile points out = toFile def out (makeDistribution points) makeDistribution points = do let values = addIndexes (fmap fst points) let alabels = fmap (TL.unpack . snd) points let fillStyle = solidFillStyle (opaque lightblue) layout_title .= "Word Frequencies" layout_x_axis . laxis_generate .= autoIndexAxis alabels layout_y_axis . laxis_override .= axisGridHide layout_left_axis_visibility . axis_show_ticks .= False plot $ fmap plotBars $ liftEC $ do plot_bars_values .= fmap (over _2 pure) values plot_bars_item_styles .= pure (fillStyle, Nothing)