{-# LANGUAGE TypeSynonymInstances, TemplateHaskell, MultiParamTypeClasses, FlexibleInstances #-} module SmartGroup (Splittable(wordsOf), groupRoot, group1, smartGroup, notStopWord, Result(..)) where import qualified Data.Map as Map import Data.Map (Map) import qualified Data.Set as Set import Data.Set (Set) import Data.Char import Data.List import qualified Data.ByteString.Char8 as S import qualified Data.ByteString.Lazy.Char8 as L import Data.Ord import Language.Haskell.TH.Syntax data Doc a b = Doc {val :: b, wds :: [a], assoc :: (Map a Int), maxFreq :: Int} data Result a b = Points (Map a [b]) | Subpoints (Map a (Result a b)) deriving Show class (Ord b) => Splittable a b where wordsOf :: a -> [b] instance Splittable String String where wordsOf = filter notStopWord . words instance Splittable L.ByteString L.ByteString where wordsOf = filter (notStopWord . L.unpack) . L.splitWith isSpace instance Splittable S.ByteString S.ByteString where wordsOf = filter (notStopWord . S.unpack) . S.splitWith isSpace -- | Test that the word is important to the document's meaning (not 'the', 'a', 'an', etc) notStopWord :: String -> Bool notStopWord = not . flip Set.member (Set.fromList $(runIO (readFile "stopWords") >>= (lift . words))) -- | Groups a list of splittable items to a given breadth and depth smartGroup :: (Splittable a b) => Int -> Int -> [a] -> Result b a smartGroup _ _ [] = Points Map.empty smartGroup maxSize maxDepth l = f maxDepth docList where docList = flip map l $ \x-> let w = wordsOf x ass = foldl (flip $ Map.alter (return . maybe 1 (+1))) Map.empty w in Doc x w ass (Map.fold (\x y-> if x>y then x else y) 0 ass) tf w (Doc{assoc = m, maxFreq=mf}) = fromIntegral (Map.findWithDefault 0 w m) / fromIntegral mf f mx a = if mx == 1 then Points (Map.map (map val) m) else Subpoints $ Map.map (f (mx-1)) m where m = minGroups a Map.empty docFreqs w = foldl (\x y->x + if Map.member w (assoc y) then 1 else 0) 0 a idf w = let d = docFreqs w len = genericLength a in if d > 0 then logBase 2 $ len / (1 + (abs $ fromIntegral maxSize - (len / d))) else 0 getRank i j = tf j i * idf j minGroups [] r = r minGroups (x@(Doc{wds=ws}):ns) r = minGroups ns (Map.alter (return . maybe [x] (x:)) (maximumBy (comparing (getRank x)) ws) r) -- | Divide list into groups such that the amount of groups -- resembles the square root of the number of elements groupRoot :: (Splittable a b) => Int -> [a] -> Result b a groupRoot d s = smartGroup (truncate $ sqrt $ fromIntegral (length s)) d s -- | Groups to the root of the number of elements using a depth of 1. -- Elements are automatically extracted from the `Result` group1 :: (Splittable a b) => [a] -> (Map b [a]) group1 l = let (Points m) = groupRoot 1 l in m