--- Phonix code calculator --- Copyright © 2008 Bart Massey --- ALL RIGHTS RESERVED --- This software is licensed under the "3-clause ('new') --- BSD License". Please see the file COPYING provided with --- this distribution for license terms. -- | Phonix codes (Gadd 1990) augment slightly improved -- Soundex codes with a preprocessing step for cleaning up -- certain n-grams. Since the preprocessing step contains -- around 90 rules processed by a slow custom-written -- scanner, this implementation is not too fast. -- -- This code was based on a number of sources, including the -- CPAN Phonix code calculator `Text::Phonetic::Phonix.pm`. -- Because the paper describing the codes is not freely -- available and I'm lazy, I did not use it as a reference. -- Also because Phonix involves around 90 substitution rules, -- I transformed the Perl ones, which was easier than -- generating them from scratch. module Text.PhoneticCode.Phonix ( phonix, phonixCodes, phonixRules, phonixRulesPatSubsts, applyPhonixRules ) where import Data.List import Data.Char import Data.Array.IArray import qualified Data.Set as Set import Text.Regex -- | Compute a "full" phonix code; i.e., do not drop any -- encodable characters from the result. The leading -- character of the code will be folded to uppercase. -- Non-alphabetics are not encoded. If no alphabetics are -- present, the phonix code will be "0". -- -- There appear to be many, many variants of phonix -- implemented on the web, and I'm too cheap and lazy to go -- find the original paper by Gadd (1990) that actually -- describes the original algorithm. Thus, I am taking some -- big guesses on intent here as I implement. Corrections, -- especially those involving getting me a copy of the -- article, are welcome. -- -- Dropping the "trailing sound" seems to be an integral -- part of Gadd's technique, but I'm not sure how it is -- supposed to be done. I am currently compressing runs of -- vowels, and then dropping the trailing digit or vowel -- from the code. -- -- Another area of confusion is whether to compress strings -- of the same code, as in Soundex, or merely strings of the -- same consonant. I have chosen the former. phonix :: String -> String phonix = filter (/= '?') . drop_trailing_sound . encode . applyPhonixRules where drop_trailing_sound = init . concatMap question_squash . group where question_squash ('?' : _) = ['?'] question_squash l = l filter_multiples = map head . group encode "" = "0" encode as@(a : _) = (devowel a :) . drop 1 . filter_multiples . map unsound $ as unsound c | c >= 'A' && c <= 'Z' = phonixCodes ! c unsound _ = '?' devowel c | isVowely c = 'v' devowel c = c isVowely :: Char -> Bool isVowely c = c `Set.member` (Set.fromList "AEIOUY") -- | Array of phonix codes for single characters. The -- array maps uppercase letters (only) to a character -- representing a code in the range ['1'..'8'] or '?'. phonixCodes :: Array Char Char phonixCodes = accumArray updater '?' ('A', 'Z') codes where updater '?' c = c updater _ c = error ("updater called twice on " ++ [c]) groups = [('1', "BP"), ('2', "CGJKQ"), ('3', "DT"), ('4', "L"), ('5', "MN"), ('6', "R"), ('7', "FV"), ('8', "SXZ")] codes = concatMap make_codes groups make_codes (i, s) = zip s (repeat i) -- | Substitution rules for Phonix canonicalization. "^" ("$") -- is used to anchor a pattern to the beginning (end) of the word. -- "c" ("v", ".") at the beginning or end of a pattern match -- a consonant (vowel, arbitrary character). A character matched -- in this fashion is automatically tacked onto the beginning (end) -- of the pattern. phonixRules :: [(String, String)] phonixRules = [ ("DG","G"), ("CO","KO"), ("CA","KA"), ("CU","KU"), ("CY","SI"), ("CI","SI"), ("CE","SE"), ("^CLv","KL"), ("CK","K"), ("GC$","K"), ("JC$","K"), ("^CRv","KR"), ("^CHRv","KR"), ("^WR","R"), ("NC","NK"), ("CT","KT"), ("PH","F"), ("AA","AR"), --- neu ("SCH","SH"), ("BTL","TL"), ("GHT","T"), ("AUGH","ARF"), (".LJv","LD"), ("LOUGH","LOW"), ("^Q","KW"), ("^KN","N"), ("GN$","N"), ("GHN","N"), ("GNE$","N"), ("GHNE","NE"), ("GNES$","NS"), ("^GN","N"), (".GNc","N"), ("^PS","S"), ("^PT","T"), ("^CZ","C"), ("vWZ.","Z"), (".CZ.","CH"), ("LZ","LSH"), ("RZ","RSH"), (".Zv","S"), ("ZZ","TS"), ("cZ.","TS"), ("HROUGH","REW"), ("OUGH","OF"), ("vQv","KW"), ("vJv","Y"), ("^YJv","Y"), ("^GH","G"), ("vGH$","E"), ("^CY","S"), ("NX","NKS"), ("^PF","F"), ("DT$","T"), ("TL$","TIL"), ("DL$","DIL"), ("YTH","ITH"), ("^TJv","CH"), ("^TSJv","CH"), ("^TSv","T"), ("TCH","CH"), --- old che ("^vWSK","VSIKE"), ("^PNv","N"), ("^MNv","N"), ("vSTL","SL"), ("TNT$","ENT"), ("EAUX$","OH"), ("EXCI","ECS"), ("X","ECS"), ("NED$","ND"), ("JR","DR"), ("EE$","EA"), ("ZS","S"), ("vRc","AH"), ("vHRc","AH"), ("vHR$","AH"), ("RE$","AR"), ("vR$","AH"), ("LLE","LE"), ("cLE$","ILE"), ("cLES$","ILES"), ("E$",""), ("ES$","S"), ("vSS","AS"), ("vMB$","M"), ("MPTS","MPS"), ("MPS","MS"), ("MPT","MT") ] -- | Apply each of the Phonix preprocessing rules in turn to -- the target word returning the resulting accumulated -- substitution. applyPhonixRules :: String -> String applyPhonixRules = flip (foldl' res) phonixRulesREs . map toUpper . filter isAlpha where res target (pat, subst) = subRegex pat target subst -- Compile a regex in single-line mode. mre :: String -> Regex mre s = mkRegexWithOpts s False True -- Compile the patterns in the patSubsts. phonixRulesREs :: [(Regex, String)] phonixRulesREs = map (\(pat, subst) -> (mre pat, subst)) phonixRulesPatSubsts -- | List of pattern/substitution pairs built from the -- 'phonixRules'. phonixRulesPatSubsts :: [(String, String)] phonixRulesPatSubsts = map reFormat phonixRules where reFormat (src, dst) = let vowelSubst = mre "v" consSubst = mre "c" dotSubst = mre "\\." in let src' = flip (subRegex dotSubst) "([A-Z])" $ flip (subRegex consSubst) "([BCDFGHJKLMNPQRSTVWXYZ])" $ flip (subRegex vowelSubst) "([AEIOU])" $ src in let front = case matchRegex (mre "^\\^?[^^A-Z]") src' of Just _ -> "\\1" Nothing -> "" in let back = case matchRegex (mre "[^A-Z$]\\$?$") src' of Just _ -> case front of "" -> "\\1" _ -> "\\2" Nothing -> "" in (src', front ++ dst ++ back)