module Wordlint.Words where import Data.Char (isPunctuation, toLower) import Data.List -- -- This module contains types and functions for working with words and their -- positions used in processing of file. A "Word" is a data structure -- containing a word string; its position by word count, line, or percentage of -- file used to calculate intervening distances; and its line and and column -- positions used when returning data. -- -- To create each word, four sets of functions each provide one element and -- these are zipped together for each word in the file. A Word's "lemma" is -- first returned by the basic prelude function `words` and therefore includes -- punctuation marks. While this is required in order to identify the correct -- coordinates for each Word, this default is also helpful to catch, for -- example, the repetition of transition words such as "Furthermore,". -- -- Punctuation and capitalization filters, as well as a user-defined word -- blacklist, are available to filter the list of Words prior to matching -- duplicates. data Word a = Word { lemma :: String ,position :: a ,line :: Int ,column :: Int } type Words a = [Word a] --Words are tested for equality and ordered by their lemma instance Eq (Word a) where x == y = lemma x == lemma y instance Ord (Word a) where compare x y = compare (lemma x) (lemma y) ----------------------------------------------------------------- -- --Word creation -- ----------------------------------------------------------------- -- -- Position -- Create list of tuples containing lemma and word position -- "Position" depends on type-of-check -- For word and line checks, Position is an Int representing -- the word count and line positions, respectively. -- A percentage check returns a Position of type Double, -- representing the Word's position as a percentage of -- total words in the file. -- class NumOps a where createPos :: String -> String -> [(String, a)] instance NumOps Double where createPos text _ = createPercentPos text instance NumOps Int where createPos = wordOrLine wordOrLine :: String -> String -> [(String, Int)] wordOrLine s t = case t of "word" -> createWordPos s "line" -> createLinePos s _ -> createWordPos s createWordPos :: String -> [(String, Int)] createWordPos s = zip (words s) [1..] createLinePos :: String -> [(String, Int)] createLinePos = createWordLinePos createPercentPos :: String -> [(String, Double)] createPercentPos s = getWordPercentPos wrdlst wrdln where wrdln = length $ words s wrdlst = createWordPos s getWordPercentPos :: [(String, Int)] -> Int -> [(String, Double)] getWordPercentPos [] _ = [] getWordPercentPos (x:xs) y = divWordPercentPos x y : getWordPercentPos xs y divWordPercentPos :: (String, Int) -> Int -> (String, Double) divWordPercentPos (s,x) y = (s,p) where xi = fromIntegral x yi = fromIntegral y p = (xi/yi)*100 -- -- Line coordinate -- Create list of tuples containing lemma and line position createWordLinePos :: String -> [(String, Int)] createWordLinePos xs = setWordLines $ zip (getWordLines xs) [1..] -- Makes lists of words by line, fed to the functions below getWordLines :: String -> [[String]] getWordLines xs = fmap words (lines xs) setWordLine :: ([String], Int) -> [(String, Int)] setWordLine (([],_)) = [] setWordLine ((x:xs,i)) = (x,i) : setWordLine (xs,i) setWordLines :: [([String], Int)] -> [(String, Int)] setWordLines [] = [] setWordLines (x:xs) = setWordLine x ++ setWordLines xs -- Column coordinate -- 1) lines on file and zip infinite lists for each char. [[(Char,Int)]] -- 2) compare first char of words from each inner [[String]] -- and add Int from [[(Char,Int)]] when Char == first Char of string createWordColPos :: String -> [(String, Int)] createWordColPos xs = setWordCols lin $ numWordCols lin where lin = lines xs -- number columns in each line of file numWordCols :: [String] -> [[(Char,Int)]] numWordCols [[]] = [[]] numWordCols [] = [[]] numWordCols (x:xs) = zip x [1..] : numWordCols xs -- call filter on lines setWordCols :: [String] -> [[(Char,Int)]] -> [(String,Int)] setWordCols [] [] = [] setWordCols (_:_) [] = [] setWordCols [] (_:_) = [] setWordCols (x:xs) (y:ys) = filtWordCols (words x) y ++ setWordCols xs ys filtWordCols :: [String] -> [(Char,Int)] -> [(String,Int)] filtWordCols [] _ = [] filtWordCols _ [] = [] filtWordCols w@(x:xs) c@(y:ys) = if (fst y == ' ') || (fst y /= head x) then filtWordCols w ys else (x,snd y) : filtWordCols xs (drop (length x) c ) -- -- Master functoion to create a list of words from a file. zipWords :: (NumOps a) => String -> String -> Words a zipWords s t = zipWith4 Word (words s) (wordpos s t) linepos colpos where linepos = snd . unzip $ createWordLinePos s colpos = snd . unzip $ createWordColPos s wordpos u v = snd . unzip $ createPos u v ----------------------------------------------------------------- -- -- Functions that operate on Word(s) type -- ----------------------------------------------------------------- -- Check a word against the minimum word length (matchlength Arguments in Wordlint.Args) isCheckWordLong :: (NumOps a) => Word a -> Int -> Bool isCheckWordLong (Word w _ _ _) x = length w > x -- Filter Words based on minimum word length checkWordList :: (NumOps a) => Words a -> Int -> Words a checkWordList [] _ = [] checkWordList (x:xs) i = if isCheckWordLong x i then x : checkWordList xs i else checkWordList xs i -- Equality function checking for string match in different coordinate positions -- Converts coordinates to tuples to ensure proper equality checking checkWordEquality :: (NumOps a) => Word a -> Word a -> Bool checkWordEquality (Word a _ b c) (Word x _ y z) = coord1 /= coord2 && a==x where coord1 = (b,c) coord2 = (y,z) -- Function to determine distance between two words. Uses Position of Word -- rather than coordinates so that it is type-of-search independent -- checkWordDistance :: (Num a, NumOps a) => Word a -> Word a -> a checkWordDistance (Word _ x _ _) (Word _ y _ _) = x - y -- Filter all but matching pairs of words filterMatchingWords :: (NumOps a) => Words a -> Words a filterMatchingWords [] = [] filterMatchingWords xs = sort $ intersectBy checkWordEquality xs xs -- The following filter functions are denoted by the --nopunct, --nocaps and -- --blacklist flags. These functions are used prior to creating Wordpair -- types from Words and should not affect the above checkWordEquality function -- used in building the former. -- Filter punctuation with --nopunct flag filterWordPunctuation :: (NumOps a) => Words a -> Words a filterWordPunctuation xs = [Word [a | a <- lemma x, not $ isPunctuation a] (position x) (line x) (column x) | x <- xs] -- Filter capitals with --nocaps flag filterWordCapitalization :: (NumOps a) => Words a -> Words a filterWordCapitalization xs = [Word [toLower a | a <- lemma x] (position x) (line x) (column x) | x <- xs] --Filter blacklist given in file denoted by -b flag filterWordBlacklist :: (NumOps a) => Words a -> [String] -> Words a filterWordBlacklist xs blacklist = [x | x <- xs, let w = lemma x, w `notElem` blacklist]