{-|
Module      : Sonnex
Description : Sonnex is an alternative to Soundex for french language
Copyright   : © 2014 Frédéric BISSON
License     : GPL-3
Maintainer  : zigazou@free.fr
Stability   : alpha
Portability : POSIX

This package computes Sonnex codes for french words or phrases. It is an
alternative to the Soundex algorithm for french language.

= Characters of the Sonnex code

The Sonnex code contains the following characters:

- 1 ← un, ein, in, ain
- 2 ← en, an
- 3 ← on
- a ← a, à, â
- b ← b, bb
- C ← ch
- d ← d, dd
- e ← e, eu
- é ← ê, é, è, ai, ei
- f ← f, ff, ph
- g ← gu
- i ← î, i, ille
- j ← j, ge
- k ← k, c, qu, ck
- l ← l, ll
- m ← m, mm
- n ← n, nn
- o ← o, ô
- p ← p, pp
- r ← r, rr
- s ← s, ss
- t ← t, tt
- u ← u, ù, û
- v ← v, w
- z ← z, s
- U ← ou

The apostroph is ignored, every other character not understood by the Sonnex
algorthim is copied without changes.

= Examples

Here are a few examples of sonnex results:

>>> sonnex "champ"
C2

>>> sonnex "chant"
C2

>>> sonnex "boulot"
bUlo

>>> sonnex "bouleau"
bUlo

>>> sonnex "compte"
k3t

>>> sonnex "comte"
k3t

>>> sonnex "conte"
k3t

-}

module Text.Sonnex (sonnex, sonnexPhrase) where

import Data.Char

-- | Test is a character is to be considered a vowel
isVowel :: Char -> Bool
isVowel = (`elem` "aâàäeéèêëiîïoôöuùûüyœ")

-- | Test is a character is to be considered a conson
isConson :: Char -> Bool
isConson = (`elem` "bcçdfghjklmnpqrstvwxyz")

-- | Internal function computing a Sonnex code for a french word.
--
-- It requires its entry to be converted to lower case before being called.
sonx :: String -> String -> String
-- End of recursive calls
sonx "" s = s

-- Apostroph is ignored/silent
sonx ('\'':cs) s = sonx cs s

-- Starting with 'a'
sonx "a" s = s ++ "a"
sonx "aient" s = s ++ "é"
sonx "ain" s = s ++ "1"
sonx ('a':'i':'n':v:cs) s
    | isVowel v = sonx (v:cs) (s ++ "é")
    | otherwise = sonx (v:cs) (s ++ "1")
sonx "ais" s = s ++ "é"
sonx ('a':'i':'s':v:cs) s
    | isVowel v = sonx (v:cs) (s ++ "éz")
    | otherwise = sonx (v:cs) (s ++ "és")
sonx "ail" s = s ++ "ai"
sonx ('a':'i':'l':'l':cs) s = sonx cs (s ++ "ai")
sonx ('a':'i':cs) s = sonx cs (s ++ "é")
sonx ('a':'m':'m':cs) s = sonx cs (s ++ "am")
sonx ('a':'m':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "am")
    | otherwise = sonx (c:cs) (s ++ "2")
sonx "an" s = s ++ "2"
sonx ('a':'n':'n':cs) s = sonx cs (s ++ "an")
sonx ('a':'n':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "an")
    | otherwise = sonx (c:cs) (s ++ "2")
sonx ('a':'s':'s':cs) s = sonx cs (s ++ "as")
sonx ('a':'s':c:cs) s
    | isConson c = sonx (c:cs) (s ++ "as")
    | otherwise  = sonx (c:cs) (s ++ "az")
sonx ('a':'u':cs) s = sonx cs (s ++ "o")
sonx "ay" s = s ++ "é"
sonx "ays" s = s ++ "é"

sonx ('à':cs) s = sonx cs (s ++ "a")
sonx ('â':cs) s = sonx cs (s ++ "a")

-- Starting with 'b'
sonx "b" s = s
sonx ('b':'b':cs) s = sonx cs (s ++ "b")

-- Starting with 'c'
sonx "c" s = s
sonx ('c':'a':cs) s = sonx ('a':cs) (s ++ "k")
sonx ('c':'c':cs) s = sonx cs (s ++ "ks")
sonx ('c':'e':cs) s = sonx ('e':cs) (s ++ "s")
sonx ('c':'\'':cs) s = sonx cs (s ++ "s")
sonx ('c':'h':'a':'o':cs) s = sonx ('a':'o':cs) (s ++ "k")
sonx ('c':'h':'l':cs) s = sonx cs (s ++ "kl")
sonx ('c':'h':'o':'e':cs) s = sonx ('o':'e':cs) (s ++ "k")
sonx ('c':'h':'œ':cs) s = sonx ('o':'e':cs) (s ++ "k")
sonx ('c':'h':'r':cs) s = sonx cs (s ++ "kr")
sonx ('c':'h':cs) s = sonx cs (s ++ "C")
sonx ('c':'i':cs) s = sonx ('i':cs) (s ++ "s")
sonx ('c':'k':cs) s = sonx cs (s ++ "k")
sonx ('c':'o':'e':'u':cs) s = sonx ('o':'e':'u':cs) (s ++ "k")
sonx ('c':'o':'m':'p':'t':cs) s = sonx cs (s ++ "k3t")
sonx ('c':'œ':'u':cs) s = sonx ('œ':'u':cs) (s ++ "k")
sonx ('c':'o':cs) s = sonx ('o':cs) (s ++ "k")
sonx ('c':'u':'e':'i':cs) s = sonx ('i':cs) (s ++ "ke")
sonx ('c':'u':cs) s = sonx ('u':cs) (s ++ "k")
sonx ('c':'y':cs) s = sonx ('y':cs) (s ++ "s")
sonx ('c':cs) s = sonx cs (s ++ "k")

sonx ('ç':cs) s = sonx cs (s ++ "s")

-- Starting with 'd'
sonx "d" s = s
sonx "ds" s = s
sonx ('d':'d':cs) s = sonx cs (s ++ "d")

-- Starting with 'e'
sonx "e" s = s
sonx "ec" s = s ++ "éc"
sonx "ef" s = s ++ "éf"
sonx "eaux" s = s ++ "o"
sonx ('e':'a':'n':'n':cs) s = sonx cs (s ++ "an")
sonx ('e':'a':'n':cs) s = sonx cs (s ++ "2")
sonx ('e':'a':'u':cs) s = sonx cs (s ++ "o")
sonx ('e':'f':'f':cs) s = sonx cs (s ++ "éf")
sonx "ein" s = s ++ "1"
sonx ('e':'i':'n':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "én")
    | otherwise = sonx (c:cs) (s ++ "1")
sonx ('e':'i':cs) s = sonx cs (s ++ "é")
sonx ('e':'l':'l':cs) s = sonx cs (s ++ "él")
sonx ('e':'l':c:cs) s
    | isConson c = sonx ('l':c:cs) (s ++ "é")
    | otherwise  = sonx ('l':c:cs) (s ++ "e")
sonx ('e':'m':'m':cs) s = sonx cs (s ++ "ém")
sonx ('e':'m':'p':cs) s = sonx cs (s ++ "2")
sonx ('e':'n':'n':cs) s = sonx cs (s ++ "én")
sonx "en" s = s ++ "2"
sonx ('e':'n':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "en")
    | otherwise = sonx (c:cs) (s ++ "2")
sonx "er" s = s ++ "é"
sonx "ert" s = s ++ "ér"
sonx ('e':'r':'r':cs) s = sonx cs (s ++ "ér")
sonx ('e':'r':'f':cs) s = sonx ('f':cs) (s ++ "ér")
sonx "es" s = s
sonx ('e':'s':'c':'h':cs) s = sonx cs (s ++ "éC")
sonx ('e':'s':'h':cs) s = sonx ('h':cs) (s ++ "é")
sonx ('e':'s':'n':cs) s = sonx ('n':cs) (s ++ "é")
sonx ('e':'s':'s':cs) s = sonx cs (s ++ "és")
sonx ('e':'s':c:cs) s
    | isConson c = sonx (c:cs) (s ++ "és")
    | otherwise  = sonx (c:cs) (s ++ "ez")
sonx ('é':'s':c:cs) s
    | isConson c = sonx cs (s ++ "és")
    | otherwise  = sonx (c:cs) (s ++ "éz")
sonx ('e':'t':'t':cs) s = sonx cs (s ++ "ét")
sonx "et" s = s ++ "é"
sonx ('e':'t':cs) s = sonx cs (s ++ "et")
sonx ('e':'u':'n':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "en")
    | otherwise = sonx (c:cs) (s ++ "1")
sonx "eux" s = s ++ "e"
sonx ('e':'u':'x':'i':cs) s = sonx ('i':cs) (s ++ "ez")
sonx ('e':'u':cs) s = sonx cs (s ++ "e")
sonx "ex" s = s ++ "éks"
sonx ('e':'y':c:cs) s
    | isConson c = sonx (c:cs) (s ++ "é")
    | otherwise  = sonx ('y':c:cs) (s ++ "é")
sonx "ez" s = s ++ "é"

sonx ('è':cs) s = sonx cs (s ++ "é")
sonx ('ê':cs) s = sonx cs (s ++ "é")
sonx ('ë':'l':cs) s = sonx ('l':cs) (s ++ "é")

-- Starting with 'f'
sonx ('f':'f':cs) s = sonx cs (s ++ "f")

-- Starting with 'g'
sonx "g" s = s
sonx ('g':'e':cs) s = sonx ('e':cs) (s ++ "j")
sonx ('g':'é':cs) s = sonx ('é':cs) (s ++ "j")
sonx ('g':'i':cs) s = sonx ('i':cs) (s ++ "j")
sonx ('g':'n':cs) s = sonx cs (s ++ "n")
sonx ('g':'y':cs) s = sonx ('y':cs) (s ++ "j")
sonx ('g':'u':'ë':cs) s = sonx cs (s ++ "gu")
sonx ('g':'u':cs) s = sonx cs (s ++ "g")
sonx ('g':'g':cs) s = sonx cs (s ++ "g")

-- Starting with 'h'
sonx ('h':cs) s = sonx cs s

-- Starting with 'i'
sonx "ic" s = s ++ "ik"
sonx "ics" s = s ++ "ik"
sonx ('i':'e':'n':'n':cs) s = sonx cs (s ++ "ién")
sonx ('i':'e':'n':cs) s = sonx cs (s ++ "i1")
sonx ('i':'n':'n':cs) s = sonx cs (s ++ "in")
sonx ('i':'n':'e':cs) s = sonx cs (s ++ "in")
sonx "in" s = s ++ "1"
sonx ('i':'n':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "in")
    | otherwise = sonx (c:cs) (s ++ "1")
sonx ('i':'s':c:cs) s
    | isConson c = sonx (c:cs) (s ++ "is")
    | otherwise  = sonx (c:cs) (s ++ "iz")
sonx ('i':'x':'i':cs) s = sonx ('i':cs) (s ++ "iz")
sonx ('i':'l':'l':cs) s = sonx cs (s ++ "i")
sonx ('i':cs) s = sonx cs (s ++ "i")

sonx ('ï':cs) s = sonx cs (s ++ "i")

-- Starting with 'l'
sonx ('l':'l':cs) s = sonx cs (s ++ "l")

-- Starting with 'm'
sonx ('m':'m':cs) s = sonx cs (s ++ "m")

-- Starting with 'n'
sonx ('n':'n':cs) s = sonx cs (s ++ "n")

-- Starting with 'o'
sonx ('o':'c':'c':cs) s = sonx cs (s ++ "ok")
sonx ('o':'e':'u':cs) s = sonx cs (s ++ "e")
sonx ('œ':'u':cs) s = sonx cs (s ++ "e")
sonx ('œ':cs) s = sonx cs (s ++ "e")
sonx "oient" s = s ++ "Ua"
sonx ('o':'i':'n':cs) s = sonx cs (s ++ "U1")
sonx ('o':'i':cs) s = sonx cs (s ++ "Ua")
sonx ('o':'m':'m':cs) s = sonx cs (s ++ "om")
sonx ('o':'m':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "om")
    | otherwise = sonx (c:cs) (s ++ "3")
sonx ('o':'n':'n':cs) s = sonx cs (s ++ "on")
sonx ('o':'n':cs) s = sonx cs (s ++ "3")
sonx ('o':'s':c:cs) s
    | isConson c = sonx cs (s ++ "os")
    | otherwise  = sonx (c:cs) (s ++ "oz")
sonx ('o':'u':cs) s = sonx cs (s ++ "U")
sonx ('o':'ù':cs) s = sonx cs (s ++ "U")
sonx ('o':'û':cs) s = sonx cs (s ++ "U")

sonx ('ô':cs) s = sonx cs (s ++ "o")
sonx ('ö':cs) s = sonx cs (s ++ "o")

-- Starting with 'p'
sonx "p" s = s
sonx ('p':'h':cs) s = sonx cs (s ++ "f")
sonx ('p':'p':cs) s = sonx cs (s ++ "p")
sonx ('p':'a':'y':'s':cs) s = sonx ('i':'s':cs) (s ++ "pé")

-- Starting with 'q'
sonx ('q':'u':'r':cs) s = sonx ('r':cs) (s ++ "ku")
sonx ('q':'u':cs) s = sonx cs (s ++ "k")
sonx ('q':cs) s = sonx cs (s ++ "k")

-- Starting with 'r'
sonx ('r':'r':cs) s = sonx cs (s ++ "r")

-- Starting with 's'
sonx "s" s = s
sonx ('s':'s':cs) s = sonx cs (s ++ "s")
sonx ('s':'c':'i':cs) s = sonx ('i':cs) (s ++ "s")

-- Starting with 't'
sonx "t" s = s
sonx ('t':'t':cs) s = sonx cs (s ++ "t")

-- Starting with 'u'
sonx "un" s = s ++ "1"
sonx ('û':cs) s = sonx cs (s ++ "u")

-- Starting with 'v'
-- 'v' can be handled by the generic case since there is no special thing
-- about this letter, it’s always pronounced 'v' and never doubled.

-- Starting with 'w'
sonx ('w':cs) s = sonx cs (s ++ "v")

-- Starting with 'x'
sonx "x" s = s
sonx ('x':'c':cs) s = sonx cs (s ++ "ks")
sonx ('x':c:cs) s
    | isVowel c = sonx (c:cs) (s ++ "kz")
    | otherwise = sonx (c:cs) (s ++ "ks")

-- Starting with 'y'
sonx ('y':cs) s = sonx cs (s ++ "i")

-- Starting with 'z'
sonx ('z':'z':cs) s = sonx cs (s ++ "z")

-- Copy every other character as is
sonx (c:cs) s = sonx cs (s ++ [c])

-- | Compute a Sonnex code for a french word.
--
-- The string must contain only one word.
-- Each character should be considered as being vocal, not silent
--
-- prop> length (sonnex w) <= length w
sonnex :: String -> String
sonnex word = sonx (map toLower word) ""

-- | Compute a Sonnex code for a french phrase.
--
-- It applies the sonnex function to each word in the phrase.
-- Since it uses the words/unwords couple, superfluous space character
-- are removed.
sonnexPhrase :: String -> String
sonnexPhrase phrase = unwords $ map sonnex (words phrase)