{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE OverloadedStrings #-}

-- The code in this module is responsible for querying a set for possible
-- matches and determining how similar the string is to each candidate match.
--

module Data.FuzzySet.Internal
    ( (|>)
    , matches
    , getMatches
    , gramVector
    , grams
    ) where

import Data.Function ((&))
import Data.FuzzySet.Types
import Data.FuzzySet.Util (distance)
import Data.FuzzySet.Util (norm)
import Data.FuzzySet.Util (normalized, substr, enclosedIn)
import Data.HashMap.Strict (HashMap, elems, foldrWithKey, lookup, lookupDefault, alter)
import Data.List (sortBy)
import Data.Maybe (fromMaybe)
import Data.Ord (Down(..), comparing)
import Data.Text (Text)
import Data.Vector ((!?))
import qualified Data.FuzzySet.Util as Util
import qualified Data.HashMap.Strict as HashMap
import qualified Data.Text as Text


-- | Alternative syntax for the reverse function application operator @(&)@,
-- known also as the /pipe/ operator.
--
(|>) :: a -> (a -> b) -> b
(|>) = (&)
infixl 1 |>


-- | Dot products used to compute the cosine similarity, which is the similarity
-- score assigned to entries that match the search string in the fuzzy set.
--
matches
    :: FuzzySet
    -- ^ The string set
    -> HashMap Text Int
    -- ^ A sparse vector representation of the search string (generated by 'gramVector')
    -> HashMap Int Int
    -- ^ A mapping from item index to the dot product between the corresponding
    -- entry of the set and the search string
matches set@FuzzySet{..} =
    foldrWithKey fun mempty
  where
    fun gram count map =
        let
            insScore otherCount entry =
                Just (fromMaybe 0 entry + otherCount * count)
        in
        gram `HashMap.lookup` matchDict
            |> maybe map (foldr (\GramInfo{..} -> alter (insScore gramCount) itemIndex) map)


-- | This function performs the actual task of querying a set for matches,
-- supported by the other functions in this module.
-- See [Implementation](Data-FuzzySet.html#g:8) for an explanation.
--
getMatches
    :: FuzzySet
    -- ^ The string set
    -> Text
    -- ^ A string to search for
    -> Double
    -- ^ Minimum score
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> [( Double, Text )]
    -- ^ A list of results (score and matched value)
getMatches set@FuzzySet{..} query minScore gramSize =
    results
        |> filter (\pair -> fst pair >= minScore)
        |> fmap (\( score, entry ) -> ( score, exactSet |> lookupDefault "" entry ))
  where
    results =
        let sorted =
                matches set queryVector
                    |> HashMap.foldrWithKey fun []
                    |> sortBy (comparing (Down . fst))
        in
        if useLevenshtein then
            sorted
                |> take 50
                |> fmap (\( _, entry ) -> ( distance query entry, entry ))
                |> sortBy (comparing (Down . fst))
        else
            sorted

    queryMagnitude = norm (elems queryVector)
    queryVector = gramVector query gramSize
    itemsVector = fromMaybe mempty (gramSize `HashMap.lookup` items)

    fun index score list =
        case itemsVector !? index of
            Nothing ->
                list

            Just FuzzySetItem{..} ->
                ( fromIntegral score / (queryMagnitude * vectorMagnitude)
                , normalizedEntry
                ) : list


-- | Generate a list of /n/-grams (character substrings) from the normalized
-- input and then translate this into a dictionary with the /n/-grams as keys
-- mapping to the number of occurences of the substring in the list.
--
-- >>> gramVector "xxxx" 2
-- fromList [("-x",1), ("xx",3), ("x-",1)]
--
-- The substring @"xx"@ appears three times in the normalized string:
--
-- >>> grams "xxxx" 2
-- ["-x","xx","xx","xx","x-"]
--
-- >>> Data.HashMap.Strict.lookup "nts" (gramVector "intrent'srestaurantsomeoftrent'saunt'santswantsamtorentsomepants" 3)
-- Just 8
--
gramVector
    :: Text
    -- ^ An input string
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> HashMap Text Int
    -- ^ A sparse vector with the number of times a substring occurs in the
    -- normalized input string
gramVector value size =
    foldr fun HashMap.empty (grams value size)
  where
    fun = HashMap.alter (pure . succ . fromMaybe 0)


-- | Break apart the input string into a list of /n/-grams. The string is
-- first 'Data.FuzzySet.Util.normalized' and enclosed in hyphens.  We then take
-- all substrings of length /n/, letting the offset range from
-- \(0 \text{ to } s + 2 − n\), where /s/ is the length of the normalized input.
--
-- /Example:/
-- The string @"Destroido Corp."@ is first normalized to @"destroido corp"@,
-- and then enclosed in hyphens, so that it becomes @"-destroido corp-"@. The
-- trigrams generated from this normalized string are:
--
-- > [ "-de"
-- > , "des"
-- > , "est"
-- > , "str"
-- > , "tro"
-- > , "roi"
-- > , "oid"
-- > , "ido"
-- > , "do "
-- > , "o c"
-- > , " co"
-- > , "cor"
-- > , "orp"
-- > , "rp-"
-- > ]
--
grams
    :: Text
    -- ^ An input string
    -> Int
    -- ^ The gram size /n/, which must be at least /2/
    -> [Text]
    -- ^ A list of /n/-grams
grams value size
    | size < 2 = error "gram size must be at least 2"
    | otherwise =
        (\offs -> substr size offs str) <$> offsets
  where
    str = normalized value `enclosedIn` '-'
    offsets = [0 .. Text.length str - size]