{-|
Module      : Sonnex
Description : Sonnex is an alternative to Soundex for french language
Copyright   : © 2014 Frédéric BISSON
License     : GPL-3
Maintainer  : zigazou@free.fr
Stability   : alpha
Portability : POSIX

This package computes Sonnex codes for french words or phrases. It is an
alternative to the Soundex algorithm for french language.

= Characters of the Sonnex code

The Sonnex code contains the following characters:

- 1 ← un, ein, in, ain
- 2 ← en, an
- 3 ← on
- a ← a, à, â
- b ← b, bb
- C ← ch
- d ← d, dd
- e ← e, eu
- E ← ê, é, è, ai, ei
- f ← f, ff, ph
- g ← gu
- i ← î, i, ille
- j ← j, ge
- k ← k, c, qu, ck
- l ← l, ll
- m ← m, mm
- n ← n, nn
- o ← o, ô
- p ← p, pp
- r ← r, rr
- s ← s, ss
- t ← t, tt
- u ← u, ù, û
- v ← v, w
- z ← z, s
- U ← ou

The apostroph is ignored, every other character not understood by the Sonnex
algorthim is copied without changes.

= Examples

Here are a few examples of sonnex results:

>>> sonnex "champ"
C2

>>> sonnex "chant"
C2

>>> sonnex "boulot"
bUlo

>>> sonnex "bouleau"
bUlo

>>> sonnex "compte"
k3t

>>> sonnex "comte"
k3t

>>> sonnex "conte"
k3t

-}

module Text.Sonnex (sonnex, sonnexPhrase) where

import Data.Char

-- | Test is a character is to be considered a vowel
isVowel :: Char -> Bool
isVowel = (`elem` "aâàäeéèêëiîïoôöuùûüyœ")

-- | Test is a character is to be considered a conson
isConson :: Char -> Bool
isConson = (`elem` "bcçdfghjklmnpqrstvwxyz")

-- | Internal function computing a Sonnex code for a french word.
--
-- It requires its entry to be converted to lower case before being called.
sonx :: String -> String
-- End of recursive calls
sonx "" = ""

-- Apostroph is ignored/silent
sonx ('\'':cs) = sonx cs
sonx ('’':cs) = sonx cs

-- Starting with 'a'
sonx "a" = "a"
sonx "aient" = "E"
sonx "ain" = "1"
sonx ('a':'i':'n':v:cs)
    | isVowel v = 'E':sonx (v:cs)
    | otherwise = '1':sonx (v:cs)
sonx "ais" = "E"
sonx ('a':'i':'s':v:cs)
    | v == 's'  = 'E':'s':sonx cs
    | isVowel v = 'E':'z':sonx (v:cs)
    | otherwise = 'E':'s':sonx (v:cs)
sonx "ail" = "ai"
sonx ('a':'i':'l':'l':cs) = 'a':'i':sonx cs
sonx ('a':'i':cs) = 'E':sonx cs
sonx ('a':'m':'m':cs) = 'a':'m':sonx cs
sonx ('a':'m':c:cs)
    | c == 'm'  = 'a':'m':sonx cs
    | isVowel c = 'a':'m':sonx (c:cs)
    | otherwise = '2':sonx (c:cs)
sonx "an" = "2"
sonx ('a':'n':c:cs)
    | c == 'n'  = 'a':'n':sonx cs
    | isVowel c = 'a':'n':sonx (c:cs)
    | otherwise = '2':sonx (c:cs)
sonx "assent" = "as"
sonx ('a':'s':c:cs)
    | c == 's'   = 'a':'s':sonx cs
    | isConson c = 'a':'s':sonx (c:cs)
    | otherwise  = 'a':'z':sonx (c:cs)
sonx ('a':'u':cs) = 'o':sonx cs
sonx "ay" = "E"
sonx "ays" = "E"

sonx ('à':cs) = 'a':sonx cs
sonx ('â':c:cs)
    | c == 't'  = 'a':sonx ('t':cs)
    | otherwise = 'a':sonx(c:cs)

-- Starting with 'b'
sonx "b" = ""
sonx ('b':'b':cs) = 'b':sonx cs

-- Starting with 'c'
sonx "c" = ""
sonx ('c':'a':cs) = 'k':sonx ('a':cs)
sonx ('c':'c':v:cs)
    | v == 'o'  = 'k':sonx ('o':cs)
    | v == 'u'  = 'k':sonx ('u':cs)
    | otherwise = 'k':'s':sonx cs
sonx ('c':'e':cs) = 's':sonx ('e':cs)
sonx ('c':'\'':cs) = 's':sonx cs
sonx ('c':'h':'a':'o':cs) = 'k':sonx ('a':'o':cs)
sonx ('c':'h':'l':cs) = 'k':'l':sonx cs
sonx ('c':'h':'o':'e':cs) = 'k':sonx ('o':'e':cs)
sonx ('c':'h':'œ':cs) = 'k':sonx ('o':'e':cs)
sonx ('c':'h':'r':cs) = 'k':'r':sonx cs
sonx ('c':'h':cs) = 'C':sonx cs
sonx ('c':'i':cs) = 's':sonx ('i':cs)
sonx ('c':'k':cs) = 'k':sonx cs
sonx ('c':'o':'e':'u':cs) = 'k':sonx ('o':'e':'u':cs)
sonx ('c':'o':'m':'p':'t':cs) = 'k':'3':'t':sonx cs
sonx ('c':'œ':'u':cs) = 'k':sonx ('œ':'u':cs)
sonx ('c':'o':cs) = 'k':sonx ('o':cs)
sonx ('c':'u':'e':'i':cs) = 'k':'e':sonx ('i':cs)
sonx ('c':'u':cs) = 'k':sonx ('u':cs)
sonx ('c':'y':cs) = 's':sonx ('y':cs)
sonx ('c':cs) = 'k':sonx cs

sonx ('ç':cs) = 's':sonx cs

-- Starting with 'd'
sonx "d" = ""
sonx "ds" = ""
sonx ('d':'d':cs) = 'd':sonx cs

-- Starting with 'e'
sonx "e" = ""
sonx "ec" = "Ec"
sonx "ef" = "Ef"
sonx "eaux" = "o"
sonx ('e':'a':'n':'n':cs) = 'a':'n':sonx cs
sonx ('e':'a':'n':cs) = '2':sonx cs
sonx ('e':'a':'u':cs) = 'o':sonx cs
sonx ('e':'f':'f':cs) = 'E':'f':sonx cs
sonx ('e':'g':'m':cs) = 'E':sonx ('g':'m':cs)
sonx "ein" = "1"
sonx ('e':'i':'n':c:cs)
    | c == 'n'  = 'E':'n':sonx cs
    | isVowel c = 'E':'n':sonx (c:cs)
    | otherwise = '1':sonx (c:cs)
sonx ('e':'i':cs) = 'E':sonx cs
sonx ('e':'l':'l':cs) = 'E':'l':sonx cs
sonx ('e':'l':c:cs)
    | isConson c = 'E':sonx ('l':c:cs)
    | otherwise  = 'e':sonx ('l':c:cs)
sonx ('e':'m':'m':cs) = 'E':'m':sonx cs
sonx ('e':'m':'p':cs) = '2':sonx cs
sonx ('e':'n':'n':cs) = 'E':'n':sonx cs
sonx "en" = "2"
sonx ('e':'n':c:cs)
    | isVowel c = 'e':'n':sonx (c:cs)
    | otherwise = '2':sonx (c:cs)
sonx "er" = "E"
sonx "ert" = "Er"
sonx ('e':'r':'r':cs) = 'E':'r':sonx cs
sonx ('e':'r':'f':cs) = 'E':'r':sonx ('f':cs)
sonx "es" = ""
sonx ('e':'s':'c':'h':cs) = 'E':'C':sonx cs
sonx "essent" = "Es"
sonx ('e':'s':c:cs)
    | c == 'h'   = 'E':sonx ('h':cs)
    | c == 'n'   = 'E':sonx ('n':cs)
    | c == 's'   = 'E':'s':sonx cs
    | isConson c = 'E':'s':sonx (c:cs)
    | otherwise  = 'e':'z':sonx (c:cs)
sonx ('é':'s':c:cs)
    | c == 's'   = 'E':'s':sonx cs
    | isConson c = 'E':'s':sonx (c:cs)
    | otherwise  = 'E':'z':sonx (c:cs)
sonx ('e':'t':'t':cs) = 'E':'t':sonx cs
sonx "et" = "E"
sonx ('e':'t':cs) = 'e':'t':sonx cs
sonx ('e':'u':'n':c:cs)
    | isVowel c = 'e':'n':sonx (c:cs)
    | otherwise = '1':sonx (c:cs)
sonx "eux" = "e"
sonx ('e':'u':'x':'i':cs) = 'e':'z':sonx ('i':cs)
sonx ('e':'u':cs) = 'e':sonx cs
sonx "ex" = "Eks"
sonx ('e':'y':c:cs)
    | isConson c = 'E':sonx (c:cs)
    | otherwise  = 'E':sonx ('y':c:cs)
sonx "ez" = "E"

sonx ('è':cs) = 'E':sonx cs
sonx ('ê':c:cs)
    | c == 't'  = 'E':sonx ('t':cs)
    | otherwise = 'E':sonx(c:cs)
sonx ('ë':'l':cs) = 'E':sonx ('l':cs)
sonx "é" = "E"
sonx ('é':c:cs)
    | c == 't'  = 'E':'t':sonx cs
    | otherwise = 'E':sonx(c:cs)


-- Starting with 'f'
sonx ('f':'f':cs) = 'f':sonx cs

-- Starting with 'g'
sonx "g" = ""
sonx ('g':'e':cs) = 'j':sonx ('e':cs)
sonx ('g':'é':cs) = 'j':sonx ('E':cs)
sonx ('g':'i':cs) = 'j':sonx ('i':cs)
sonx ('g':'n':cs) = 'n':sonx cs
sonx ('g':'y':cs) = 'j':sonx ('y':cs)
sonx ('g':'u':'ë':cs) = 'g':'u':sonx cs
sonx ('g':'u':cs) = 'g':sonx cs
sonx ('g':'g':cs) = 'g':sonx cs

-- Starting with 'h'
sonx ('h':cs) = sonx cs

-- Starting with 'i'
sonx "ic" = "ik"
sonx "ics" = "ik"
sonx ('i':'e':'n':'n':cs) = 'i':'E':'n':sonx cs
sonx ('i':'e':'n':cs) = 'i':'1':sonx cs
sonx "in" = "1"
sonx ('i':'n':c:cs)
    | c == 'n'  = 'i':'n':sonx cs
    | isVowel c = 'i':'n':sonx (c:cs)
    | otherwise = '1':sonx (c:cs)
sonx "issent" = "is"
sonx ('i':'s':c:cs)
    | c == 's'   = 'i':'s':sonx cs
    | isConson c = 'i':'s':sonx (c:cs)
    | otherwise  = 'i':'z':sonx (c:cs)
sonx ('i':'x':'i':cs) = 'i':'z':sonx ('i':cs)
sonx ('i':'l':'l':cs) = 'i':sonx cs
sonx ('i':cs) = 'i':sonx cs

sonx ('ï':cs) = 'i':sonx cs

-- Starting with 'l'
sonx ('l':'l':cs) = 'l':sonx cs

-- Starting with 'm'
sonx ('m':'m':cs) = 'm':sonx cs

-- Starting with 'n'
sonx ('n':'n':cs) = 'n':sonx cs

-- Starting with 'o'
sonx ('o':'c':'c':cs) = 'o':'k':sonx cs
sonx ('o':'e':'u':cs) = 'e':sonx cs
sonx ('œ':'u':cs) = 'e':sonx cs
sonx ('œ':cs) = 'e':sonx cs
sonx "oient" = "Ua"
sonx ('o':'i':'n':cs) = 'U':'1':sonx cs
sonx ('o':'i':cs) = 'U':'a':sonx cs
sonx ('o':'m':'m':cs) = 'o':'m':sonx cs
sonx ('o':'m':c:cs)
    | isVowel c = 'o':'m':sonx (c:cs)
    | otherwise = '3':sonx (c:cs)
sonx ('o':'n':'n':cs) = 'o':'n':sonx cs
sonx ('o':'n':cs) = '3':sonx cs
sonx "ossent" = "os"
sonx ('o':'s':c:cs)
    | c == 's'   = 'o':'s':sonx cs
    | isConson c = 'o':'s':sonx (c:cs)
    | otherwise  = 'o':'z':sonx (c:cs)
sonx ('o':'u':cs) = 'U':sonx cs
sonx ('o':'ù':cs) = 'U':sonx cs
sonx ('o':'û':cs) = 'U':sonx cs

sonx ('ô':cs) = 'o':sonx cs
sonx ('ö':cs) = 'o':sonx cs

-- Starting with 'p'
sonx "p" = ""
sonx ('p':'h':cs) = 'f':sonx cs
sonx ('p':'p':cs) = 'p':sonx cs
sonx ('p':'a':'y':'s':cs) = 'p':'E':sonx ('i':'s':cs)

-- Starting with 'q'
sonx ('q':'u':'r':cs) = 'k':'u':sonx ('r':cs)
sonx ('q':'u':cs) = 'k':sonx cs
sonx ('q':cs) = 'k':sonx cs

-- Starting with 'r'
sonx ('r':'r':cs) = 'r':sonx cs

-- Starting with 's'
sonx "s" = ""
sonx ('s':'s':cs) = 's':sonx cs
sonx ('s':'t':cs) = 's':'t':sonx cs
sonx ('s':'c':'i':cs) = 's':sonx ('i':cs)

-- Starting with 't'
sonx "t" = ""
sonx ('t':'i':'e':'r':cs) = 't':sonx ('i':'e':'r':cs)
sonx ('t':'i':cs) = 's':sonx ('i':cs)
sonx ('t':'t':cs) = 't':sonx cs

-- Starting with 'u'
sonx "un" = "1"
sonx ('û':cs) = 'u':sonx cs
sonx "ussent" = "us"
sonx ('u':'s':c:cs)
    | c == 's'   = 'u':'s':sonx cs
    | isConson c = 'u':'s':sonx (c:cs)
    | otherwise  = 'u':'z':sonx (c:cs)

-- Starting with 'v'
-- 'v' can be handled by the generic case since there is no special thing
-- about this letter, it’s always pronounced 'v' and never doubled.

-- Starting with 'w'
sonx ('w':cs) = 'v':sonx cs

-- Starting with 'x'
sonx "x" = ""
sonx ('x':c:cs)
    | c == 'c'  = 'k':'s':sonx cs
    | isVowel c = 'k':'z':sonx (c:cs)
    | otherwise = 'k':'s':sonx (c:cs)

-- Starting with 'y'
sonx ('y':cs) = 'i':sonx cs

-- Starting with 'z'
sonx ('z':'z':cs) = 'z':sonx cs

-- Copy every other character as is
sonx (c:cs) = c:sonx cs

-- | Compute a Sonnex code for a french word.
--
-- The string must contain only one word.
-- Each character should be considered as being vocal, not silent
--
-- prop> length (sonnex w) <= length w
sonnex :: String -> String
sonnex = sonnex' . map toLower
    where sonnex' "cerf" = "sEr"
          sonnex' "cerfs" = "sEr"
          sonnex' "de"  = "de"
          sonnex' "est" = "E"
          sonnex' "es"  = "E"
          sonnex' "huit" = "uit"
          sonnex' "les" = "lE"
          sonnex' "mer" = "mEr"
          sonnex' "mes" = "mE"
          sonnex' "ressent" = "res2"
          sonnex' "serf" = "sEr"
          sonnex' "serfs" = "sEr"
          sonnex' ('s':'e':'p':'t':cs) = 's':'E':'t':sonx cs
          sonnex' "ses" = "sE"
          sonnex' "tes" = "tE"
          sonnex' ('t':'i':'e':'n':cs) = 't':sonx ('i':'e':'n':cs)
          sonnex' word = sonx word

-- | Compute a Sonnex code for a french phrase.
--
-- It applies the sonnex function to each word in the phrase.
-- Since it uses the words/unwords couple, superfluous space character
-- are removed.
sonnexPhrase :: String -> String
sonnexPhrase = unwords . map sonnex . words