--- Soundex code calculator --- Copyright © 2008 Bart Massey --- ALL RIGHTS RESERVED --- This software is licensed under the "3-clause ('new') --- BSD License". Please see the file COPYING provided with --- this distribution for license terms. -- |Soundex is a phonetic coding algorithm. -- It transforms word into a similarity hash based on an -- approximation of its sounds. Thus, similar-sounding -- words tend to have the same hash. -- -- This implementation is based on a number of sources, -- including a description of soundex at -- http://wikipedia.org/wiki/Soundex -- and in Knuth's "The Art of Computer Programming" 2nd ed -- v1 pp394-395. A very helpful reference on the details -- and differences among soundex algorithms is "Soundex: -- The True Story", -- http://west-penwith.org.uk/misc/soundex.htm -- accessed 11 September 2008. -- -- This code was originally written for the "thimk" spelling suggestion -- application in Nickle (http://nickle.org) in July 2002 -- based on a description from -- http://www.geocities.com/Heartland/Hills/3916/soundex.html -- which is now -- http://www.searchforancestors.com/soundex.html -- The code was ported September 2008; the Soundex variants were also -- added at this time. module Text.PhoneticCode.Soundex (soundex, soundexSimple, soundexNARA, soundexCodes) where import Data.List import Data.Char import Data.Array.IArray import Data.Maybe -- |Array of soundex codes for single characters. The -- array maps uppercase letters (only) to a character -- representing a code in the range ['1'..'7'] or '?'. Code -- '7' is returned as a coding convenience for -- American/Miracode/NARA/Knuth soundex. soundexCodes :: Array Char Char soundexCodes = accumArray updater '?' ('A', 'Z') codes where updater '?' c = c updater _ c = error ("updater called twice on " ++ [c]) groups = [('1', "BFPV"), ('2', "CGJKQSXZ"), ('3', "DT"), ('4', "L"), ('5', "MN"), ('6', "R"), ('7', "HW")] codes = concatMap make_codes groups make_codes (i, s) = zip s (repeat i) -- | Utility function: id except for point substitution. subst :: Eq a => a -> a -> a -> a subst from to source | from == source = to | otherwise = source -- | Compute a "full" soundex code; i.e., do not drop any -- encodable characters from the result. The leading -- character of the code will be folded to uppercase. -- Non-alphabetics are not encoded. If no alphabetics are -- present, the soundex code will be "0". -- -- The two commonly encountered forms of soundex are Simplified -- and another known as American, Miracode, NARA or Knuth. This -- code will calculate either---passing True gets NARA, and False -- gets Simplified. soundex :: Bool -> String -> String soundex nara = filter (/= '?') . encode . map toUpper . dropWhile (not . isAlpha) where narify | nara = filter (/= '7') | otherwise = map (subst '7' '?') filter_multiples = map head . group --- The second clause of encode originally had a bug --- correctly predicted by STTS (ref above)! encode "" = "0" encode as@(a : _) = (a :) . drop 1 . filter_multiples . narify . map unsound $ as unsound c | c >= 'A' && c <= 'Z' = soundexCodes ! c unsound _ = '?' soundex_truncated nara = take 4 . (++ repeat '0') . soundex nara --- | This is the simple variant of `soundex`. It gives the --- first four characters of the full soundex code, zero-padded --- as needed. soundexSimple :: String -> String soundexSimple = soundex_truncated False --- | This is the most common US census variant of `soundex`, --- compatible with most existing calculators. It gives the --- first four characters of the full soundex code, zero-padded --- as needed. soundexNARA :: String -> String soundexNARA = soundex_truncated True --- Some tests from the web and from Knuth that this --- software passes. --- -- soundexTest = and [ -- soundexSimple "Lloyd" == "L300", -- soundexSimple "Woolcock" == "W422", -- soundexSimple "Donnell" == "D540", -- soundexSimple "Baragwanath" == "B625", -- soundexSimple "Williams" == "W452", -- soundexSimple "Ashcroft" == "A226", -- soundexNARA "Ashcroft" == "A261", -- soundexSimple "Euler" == "E460", -- soundexSimple "Ellery" == "E460", -- soundexSimple "Gauss" == "G200", -- soundexSimple "Ghosh" == "G200", -- soundexSimple "Hilbert" == "H416", -- soundexSimple "Heilbronn" == "H416", -- soundexSimple "Knuth" == "K530", -- soundexSimple "Kant" == "K530", -- soundexSimple "Ladd" == "L300", -- soundexSimple "Lukasiewicz" == "L222", -- soundexSimple "Lissajous" == "L222"]