{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE TupleSections #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE DeriveGeneric #-} {-# LANGUAGE OverloadedLists #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE ViewPatterns #-} {-| Module : FuzzyFind Description : Provides fuzzy matching on text Copyright : Unison Computing, 2021 License : MIT Maintainer : runar.bjarnason@unison.cloud Stability : experimental A package that provides an API for fuzzy text search in Haskell, using a modified version of the Smith-Waterman algorithm. The search is intended to behave similarly to the excellent fzf tool by Junegunn Choi. -} module Text.FuzzyFind where import Control.Monad (join) import Data.Massiv.Array ( Array, (!), Ix2(..), (...), forM, forM_ ) import qualified Data.Massiv.Array as A import qualified Data.Massiv.Array.Unsafe as A import qualified Data.Massiv.Array.Mutable as M import Data.Char (isAlphaNum, isLower, isUpper, toLower) import Data.Foldable (maximumBy, toList, foldl') import Data.Function (on) import Data.Maybe (fromMaybe) import Data.Sequence ( Seq (..), ViewL (..), ViewR (..), viewl, viewr, (<|) ) import qualified Data.Sequence as Seq import GHC.Generics (Generic) import Data.Text (Text) import qualified Data.Text as Text import Control.Monad.ST (runST) -- | @bestMatch query string@ will return 'Nothing' if @query@ is not a -- subsequence of @string@. Otherwise, it will return the "best" way to line up -- the characters in @query@ with the characters in @string@. Lower-case -- characters in the @query@ are assumed to be case-insensitive, and upper-case -- characters are assumed to be case-sensitive. -- -- For example: -- -- @ -- > bestMatch "ff" \"FuzzyFind\" -- Just (Alignment {score = 25, result = Result {[Match \"F\", Gap "uzzy", Match \"F\", Gap "ind"]}}) -- @ -- -- The score indicates how "good" the match is. Better matches have higher -- scores. There's no maximum score (except for the upper limit of the 'Int' -- datatype), but the lowest score is @0@. -- -- A substring from the query will generate a 'Match', and any characters from -- the input that don't result in a 'Match' will generate a 'Gap'. -- Concatenating all the 'Match' and 'Gap' results should yield the original -- input string. -- -- Note that the matched characters in the input always occur in the same order -- as they do in the query pattern. -- -- The algorithm prefers (and will generate higher scores for) the following -- kinds of matches: -- -- 1. Contiguous characters from the query string. For example, @bestMatch "pp"@ -- will find the last two ps in "pickled pepper". -- 2. Characters at the beginnings of words. For example, @bestMatch "pp"@ -- will find the first two Ps in \"Peter Piper\". -- 3. Characters at CamelCase humps. For example, @bestMatch "bm" \"BatMan\"@ -- will score higher than @bestMatch "bm" \"Batman\".@ -- 4. The algorithm strongly prefers the first character of the query pattern -- to be at the beginning of a word or CamelHump. For example, -- @bestMatch "mn" \"Bat Man\"@ will score higher than @bestMatch "atn" \"Batman\"@. -- -- All else being equal, matches that occur later in the input string are preferred. bestMatch :: String -- ^ The query pattern. -> String -- ^ The input string. -> Maybe Alignment bestMatch = bestMatch' defaultMatchScore defaultMismatchScore defaultGapPenalty defaultBoundaryBonus defaultCamelCaseBonus defaultFirstCharBonusMultiplier defaultConsecutiveBonus -- | Finds input strings that match all the given input patterns. For each input -- that matches, it returns one 'Alignment'. The output is not sorted. -- ascending. -- -- For example: -- -- @ -- > import Data.Foldable -- > traverse_ (putStrLn . ("\\n" ++) . highlight) $ fuzzyFind ["dad", "mac", "dam"] ["red macadamia", "Madam Card"] -- -- Madam Card -- * *** ** * -- -- red macadamia -- * ******* -- @ fuzzyFind :: [String] -- ^ The query patterns. -> [String] -- ^ The input strings. -> [Alignment] fuzzyFind = (fmap fst .) . fuzzyFindOn id -- | A version of 'fuzzyFind' that searches on the given text field of the data. fuzzyFindOn :: (a -> String) -> [String] -> [a] -> [(Alignment, a)] fuzzyFindOn f query d = d >>= (\s -> toList $ (, s) <$> foldl' (\a q -> (<>) <$> a <*> bestMatch q (f s)) (Just mempty) query ) instance Semigroup Alignment where Alignment n r <> Alignment m s = Alignment (n + m) (mergeResults r s) instance Monoid Alignment where mempty = Alignment 0 mempty type Score = Int -- | An 'Alignment' is a 'Score' together with a 'Result'. Better results have -- higher scores. data Alignment = Alignment { score :: !Score, result :: !Result } deriving (Eq, Ord, Show, Generic) -- | The base score given to a matching character defaultMatchScore :: Int defaultMatchScore = 16 -- | The base score given to a mismatched character defaultMismatchScore :: Int defaultMismatchScore = 0 -- | Bonus points given to characters matching at the beginning of words defaultBoundaryBonus :: Int defaultBoundaryBonus = defaultMatchScore `div` 2 -- | Bonus points given to characters matching a hump of a CamelCase word. -- We subtract a point from the word boundary score, since a word boundary will -- incur a gap penalty. defaultCamelCaseBonus :: Int defaultCamelCaseBonus = defaultBoundaryBonus - 1 -- | Double any bonus points for matching the first pattern of the character. -- This way we strongly prefer starting the match at the beginning of a word. defaultFirstCharBonusMultiplier :: Int defaultFirstCharBonusMultiplier = 2 -- | We prefer consecutive runs of matched characters in the pattern, so we -- impose a penalty for any gaps, which is added to the size of the gap. defaultGapPenalty :: Int defaultGapPenalty = 3 -- | We give a bonus to consecutive matching characters. -- A number about the same as the boundary bonus will prefer -- runs of consecutive characters vs finding acronyms. defaultConsecutiveBonus :: Int defaultConsecutiveBonus = 11 -- | Renders an 'Alignment' as a pair of lines with "*" on the lower line -- indicating the location of pattern matches. -- highlight' :: Alignment -> Text -- highlight' (Alignment s (Result segments)) = -- foldMap prettySegment segments <> "\n" <> foldMap showGaps segments -- where -- prettySegment (Gap xs) = xs -- prettySegment (Match xs) = xs -- showGaps (Gap xs) = Text.pack $ replicate (Text.length xs) ' ' -- showGaps (Match xs) = Text.pack $ replicate (Text.length xs) '*' -- highlight :: Alignment -> String -- highlight = Text.unpack . highlight' -- | A highly configurable version of 'bestMatch'. bestMatch' :: Int -- ^ Base score for a matching character. See 'defaultMatchScore'. -> Int -- ^ Base score for a mismatched character. See 'defaultMismatchScore'. -> Int -- ^ Additional penalty for a gap. See 'defaultGapPenalty'. -> Int -- ^ Bonus score for a match at the beginning of a word. See 'defaultBoundaryBonus'. -> Int -- ^ Bonus score for a match on a CamelCase hump. See 'defaultCamelCaseBonus'. -> Int -- ^ Bonus multiplier for matching the first character of the pattern. -- See 'defaultFirstCharBonusMultiplier'. -> Int -- ^ Bonus score for each consecutive character matched. -- See 'defaultFirstCharBonusMultiplier'. -> String -- ^ The query pattern. -> String -- ^ The input string. -> Maybe Alignment bestMatch' matchScore mismatchScore gapPenalty boundaryBonus camelCaseBonus firstCharBonusMultiplier consecutiveBonus query str = Alignment (totalScore m nx) . Result <$> traceback where totalScore i j = if i > m then 0 else (A.index' hs (i :. j)) + (A.index' bonuses (i :. j)) -- table = unlines -- [ unwords -- $ (if y > 0 then show $ str' ! y else " ") -- : [ show (totalScore x y) | x <- [0 .. m] ] -- | y <- [0 .. n] -- ] similarity a b = if a == b || a == toLower b then matchScore else mismatchScore traceback :: Maybe (Seq ResultSegment) traceback = (<> gaps (drop nx str)) <$> go [] [] (-1) m nx go r m currOp 0 j = (gaps (take j str) <>) <$> case m of [] -> Just r _ -> case currOp of 1 -> Just (r :|> Match (reverse m)) 0 -> Just (r :|> Gap (reverse m)) -1 -> Nothing go _ _ _ _ 0 = Nothing go r m currOp i j = if similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) > 0 then case currOp of 0 -> go (r :|> Gap (reverse m)) [A.index' str' (j - 1)] 1 (i - 1) (j - 1) _ -> go r (A.index' str' (j - 1) : m) 1 (i - 1) (j - 1) else case currOp of 1 -> go (r :|> Match (reverse m)) [A.index' str' (j - 1)] 0 i (j - 1) _ -> go r (A.index' str' (j - 1) : m) 0 i (j - 1) nx = localMax m n 1 0 0 localMax m n j r s = if j > n then r else let s' = totalScore m j in localMax m n (j + 1) (if s' > s then j else r) s' query' = A.fromList A.Seq query :: Array A.U A.Ix1 Char str' = A.fromList A.Seq str :: Array A.U A.Ix1 Char m = A.unSz $ A.size query' n = A.unSz $ A.size str' hs :: Array A.U Ix2 Int hs = M.createArrayST_ (A.Sz (m + 1 :. n + 1)) $ \marr -> do A.forM_ ((0 :. 0) ... (m :. n)) $ \(i :. j) -> if (i == 0 || j == 0) then M.writeM marr (i :. j) 0 else do scoreMatch <- do hprev <- M.readM marr ((i - 1) :. (j - 1)) pure $ hprev + similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) + A.index' bonuses (i :. j) scoreGap <- do (arr :: Array A.U A.Ix1 Int) <- forM (1 ... j) $ \l -> (\x -> x - (l + gapPenalty)) <$> M.readM marr (i :. (j - l)) pure . fromMaybe 0 $ A.maximumM arr M.writeM marr (i :. j) (scoreMatch `max` scoreGap `max` 0) bonuses = A.makeArray A.Seq (A.Sz (m + 1 :. n + 1)) f :: Array A.U Ix2 Int where f (i :. j) = bonus i j bonus :: Int -> Int -> Int bonus 0 j = 0 bonus i 0 = 0 bonus i j = if similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) > 0 then multiplier * (boundary + camel + consecutive) else 0 where boundary = if j < 2 || isAlphaNum (A.index' str' (j - 1)) && not (isAlphaNum (A.index' str' (j - 2))) then boundaryBonus else 0 camel = if j > 1 && isLower (A.index' str' (j - 2)) && isUpper (A.index' str' (j - 1)) then camelCaseBonus else 0 multiplier = if i == 1 then firstCharBonusMultiplier else 1 consecutive = let similar = i > 0 && j > 0 && similarity (A.index' query' (i - 1)) (A.index' str' (j - 1)) > 0 afterMatch = i > 1 && j > 1 && similarity (A.index' query' (i - 2)) (A.index' str' (j - 2)) > 0 beforeMatch = i < m && j < n && similarity (A.index' query' i) (A.index' str' j) > 0 in if similar && (afterMatch || beforeMatch) then consecutiveBonus else 0 gaps :: String -> Seq ResultSegment gaps s = [Gap s] data ResultSegment = Gap !String | Match !String deriving (Eq, Ord, Show, Generic) -- | Concatenating all the 'ResultSegment's should yield the original input string. newtype Result = Result { segments :: Seq ResultSegment } deriving (Eq, Ord, Show, Generic) instance Monoid Result where mempty = Result [] instance Semigroup Result where Result Empty <> as = as as <> Result Empty = as Result (viewr -> h :> Gap []) <> as = Result h <> as as <> Result (viewl -> Gap [] :< t) = as <> Result t Result (viewr -> h :> Match []) <> as = Result h <> as as <> Result (viewl -> Match [] :< t) = as <> Result t Result (viewr -> i :> Gap l) <> Result (viewl -> Gap h :< t) = Result (i <> [Gap (l <> h)] <> t) Result (viewr -> i :> Match l) <> Result (viewl -> Match h :< t) = Result (i <> [Match (l <> h)] <> t) Result a <> Result b = Result (a <> b) mergeResults :: Result -> Result -> Result mergeResults as bs = merge as bs where drop' :: Int -> Result -> Result drop' n m | n < 1 = m drop' n (Result (viewl -> Gap g :< t)) = Result [Gap (drop n g)] <> drop' (n - length g) (Result t) drop' n (Result (viewl -> Match g :< t)) = Result [Match (drop n g)] <> drop' (n - length g) (Result t) merge :: Result -> Result -> Result merge (Result Seq.Empty) ys = ys merge xs (Result Seq.Empty) = xs merge (Result xs) (Result ys ) = case (viewl xs, viewl ys) of (Gap g :< t, Gap g' :< t') | length g <= length g' -> Result [Gap g] <> merge (Result t) (drop' (length g) (Result ys)) | otherwise -> Result [Gap g'] <> merge (drop' (length g') (Result xs)) (Result t') (Match m :< t, Match m' :< t') | length m >= length m' -> Result [Match m] <> merge (Result t) (drop' (length m) (Result ys)) | otherwise -> Result [Match m'] <> merge (drop' (length m') (Result xs)) (Result t') (Gap g :< t, Match m' :< t') -> Result [Match m'] <> merge (drop' (length m') (Result xs)) (Result t') (Match m :< t, Gap g' :< t') -> Result [Match m] <> merge (Result t) (drop' (length m) (Result ys))