{-# LANGUAGE LambdaCase #-}

{-| Deburr 
    A small package exposing the deburr function, which converts Unicode
    characters with burrs (umlauts, accents, etc.) to their ASCII counterparts.
    The function intelligently handles capitals and some other edge cases.
-}

module Text.Deburr (deburr) where

import Data.Char (isAlpha, isUpper)

-- | Deburr a string, removing umlauts, accents, etc.
--
-- Each character is expanded independently by 'deburrLetter' and the results
-- are concatenated in order. Output is produced lazily, so long or lazily
-- generated input is processed incrementally instead of being forced up front.
--
-- Ligatures whose expansion depends on capitalisation (Æ, Þ, Œ) take their
-- case from the following character when that is a letter, and otherwise from
-- the preceding character, so a ligature that ends an all-caps word is still
-- fully capitalised.
--
-- >>> deburr "Jeg spiser brød."
-- "Jeg spiser brod."
--
-- >>> deburr "VITÆ"
-- "VITAE"
--
deburr :: String -> String
deburr input = concat (zipWith3 expand before input after)
    where
        -- Neighbours of every position; 'Nothing' marks the string boundaries.
        before = Nothing : map Just input
        after  = map Just (drop 1 input) ++ [Nothing]

        expand prev c next = deburrLetter c (caseHint prev next)

        -- A following letter decides the case of an expansion. When nothing
        -- or a non-letter (space, punctuation, digit, ...) follows, the
        -- preceding character is used instead.
        caseHint prev next
            | maybe False isAlpha next = next
            | otherwise                = prev

-- | Expand a single character to its ASCII replacement.
--
-- Accented letters from the Latin-1 Supplement and Latin Extended-A ranges map
-- to their base letter; ligatures and digraphs map to two characters. For the
-- capital ligatures Æ, Þ and Œ the expansion is fully capitalised (@\"AE\"@)
-- when the hint character is uppercase and only initial-capitalised
-- (@\"Ae\"@) otherwise. Any other character is returned unchanged as a
-- one-element string.
deburrLetter
    :: Char        -- ^ character to convert
    -> Maybe Char  -- ^ adjacent character used as a capitalisation hint, if any
    -> String
deburrLetter n nxt =
    case n of
        -- Characters that expand to more than one ASCII character.
        '\xc6'   -> byHint "AE" "Ae"
        '\xe6'   -> "ae"
        '\xde'   -> byHint "TH" "Th"
        '\xfe'   -> "th"
        '\xdf'   -> "ss"
        '\x0132' -> "IJ"
        '\x0133' -> "ij"
        '\x0152' -> byHint "OE" "Oe"
        '\x0153' -> "oe"
        '\x0149' -> "'n"
        -- Everything else: first table row containing the character wins;
        -- characters found in no row pass through untouched.
        _        ->
            case [ascii | (ascii, burrs) <- singles, n `elem` burrs] of
                (ascii : _) -> [ascii]
                []          -> [n]
    where
        -- Pick the all-caps form only when the hint character is uppercase.
        byHint allCaps initialCap
            | maybe False isUpper nxt = allCaps
            | otherwise               = initialCap

        -- (ASCII replacement, every burred character that collapses to it).
        singles =
            [ ('A', "\xc0\xc1\xc2\xc3\xc4\xc5\x0100\x0102\x0104")
            , ('a', "\xe0\xe1\xe2\xe3\xe4\xe5\x0101\x0103\x0105")
            , ('C', "\xc7\x0106\x0108\x010a\x010c")
            , ('c', "\xe7\x0107\x0109\x010b\x010d")
            , ('D', "\xd0\x010e\x0110")
            , ('d', "\xf0\x010f\x0111")
            , ('E', "\xc8\xc9\xca\xcb\x0112\x0114\x0116\x0118\x011a")
            , ('e', "\xe8\xe9\xea\xeb\x0113\x0115\x0117\x0119\x011b")
            , ('G', "\x011c\x011e\x0120\x0122")
            , ('g', "\x011d\x011f\x0121\x0123")
            , ('H', "\x0124\x0126")
            , ('h', "\x0125\x0127")
            , ('I', "\xcc\xcd\xce\xcf\x0128\x012a\x012c\x012e\x0130")
            , ('i', "\xec\xed\xee\xef\x0129\x012b\x012d\x012f\x0131")
            , ('J', "\x0134")
            , ('j', "\x0135")
            , ('K', "\x0136")
            , ('k', "\x0137\x0138")
            , ('L', "\x0139\x013b\x013d\x013f\x0141")
            , ('l', "\x013a\x013c\x013e\x0140\x0142")
            , ('N', "\xd1\x0143\x0145\x0147\x014a")
            , ('n', "\xf1\x0144\x0146\x0148\x014b")
            , ('O', "\xd2\xd3\xd4\xd5\xd6\xd8\x014c\x014e\x0150")
            , ('o', "\xf2\xf3\xf4\xf5\xf6\xf8\x014d\x014f\x0151")
            , ('R', "\x0154\x0156\x0158")
            , ('r', "\x0155\x0157\x0159")
            , ('S', "\x015a\x015c\x015e\x0160")
            , ('s', "\x015b\x015d\x015f\x0161\x017f")
            , ('T', "\x0162\x0164\x0166")
            , ('t', "\x0163\x0165\x0167")
            , ('U', "\xd9\xda\xdb\xdc\x0168\x016a\x016c\x016e\x0170\x0172")
            , ('u', "\xf9\xfa\xfb\xfc\x0169\x016b\x016d\x016f\x0171\x0173")
            , ('W', "\x0174")
            , ('w', "\x0175")
            , ('Y', "\xdd\x0176\x0178")
            , ('y', "\xfd\xff\x0177")
            , ('Z', "\x0179\x017b\x017d")
            , ('z', "\x017a\x017c\x017e")
            ]