-- | Token n-gram construction and similarity scoring.
--
-- Two strings are tokenized, expanded into n-grams for a caller-selected
-- range of sizes, and the resulting n-gram lists are handed to either the
-- @jaccard@ or the @tanimoto@ scoring function.
module Synt.Similar
  ( sim
  , ngramRange
  , ngram
  , ngrams
  ) where

import Data.Ix
import Data.List
import Data.Maybe
import Text.Read (readMaybe)

import Data.List.Split
import Text.RegexPR

import Synt.Parser
import Synt.Similar.Jaccard
import Synt.Similar.Tanimoto

-- | Parse a decimal n-gram size.
--
-- Calls 'error' with a message naming the offending text when the string is
-- not a valid 'Int' (the original used 'read', which failed with only
-- "Prelude.read: no parse").
int :: String -> Int
int x =
  fromMaybe
    (error ("Synt.Similar: expected an integer n-gram size, got " ++ show x))
    (readMaybe x)

-- | Turn the two halves of a @"lo..hi"@ specification into a pair of bounds.
--
-- Any other number of parts (for example from @"1..2..3"@) is rejected with
-- an explicit 'error' instead of an incomplete-pattern failure.
tupint :: [String] -> (Int, Int)
tupint [x, y] = (int x, int y)
tupint parts =
  error
    ( "Synt.Similar: expected a range of the form \"lo..hi\", got parts "
        ++ show parts
    )

-- | Flatten every n-gram into a single string by concatenating its tokens.
--
-- No separator is inserted, so different token sequences can map to the
-- same string (e.g. @["ab","c"]@ and @["a","bc"]@ both become @"abc"@).
join :: [[String]] -> [String]
join = map concat

-- | Interpret an n-gram size specification against a token list.
--
-- * @""@       -> @(1, 1)@
-- * @"all"@    -> @(1, length l)@
-- * @"lo..hi"@ -> @(lo, hi)@
-- * @"k"@      -> @(k, k)@
--
-- Malformed specifications terminate with 'error' (see 'int' and 'tupint').
ngramRange :: String -> [a] -> (Int, Int)
ngramRange n l
  | null n = (1, 1)
  | n == "all" = (1, length l)
  | isJust (matchRegexPR "\\.\\." n) = tupint (splitOn ".." n)
  | otherwise = (size, size)
  where
    size = int n

-- | All contiguous windows of exactly @n@ tokens, in order of appearance.
--
-- The result is empty when @n < 1@ or when the list holds fewer than @n@
-- tokens. Unlike the previous implementation this never yields a window
-- shorter than @n@ (or an empty window for empty input), a single-token
-- list has one unigram, and no partial list function is used.
ngram :: Int -> [String] -> [[String]]
ngram n list
  | n < 1 = []
  | otherwise = go list
  where
    -- Slide one token at a time; stop at the first window that comes up
    -- short, since every later window would be shorter still.
    go xs
      | length window == n = window : go (drop 1 xs)
      | otherwise = []
      where
        window = take n xs

-- | Concatenated n-grams for every size in the inclusive range @r@,
-- smallest size first. An inverted range (lower bound above upper bound)
-- produces no n-grams.
ngrams :: (Int, Int) -> [String] -> [String]
ngrams r list = concatMap (\size -> join (ngram size list)) (range r)

-- | Score two token lists.
--
-- The n-gram range is resolved from @n@ against @cmp@ and then applied to
-- both lists. @algo == "tanimoto"@ selects @tanimoto@; every other value
-- selects @jaccard@.
sim' :: [String] -> [String] -> String -> String -> Float
sim' cmp to algo n
  | algo == "tanimoto" = tanimoto a b
  | otherwise = jaccard a b
  where
    r = ngramRange n cmp
    a = ngrams r cmp
    b = ngrams r to

-- | Similarity of two raw strings.
--
-- Both strings are run through @tokenize@ and the remaining two arguments
-- (algorithm name, then n-gram size specification) are passed on to 'sim''.
sim :: String -> String -> String -> String -> Float
sim from to = sim' (tokenize from) (tokenize to)