-- | A set of transliteration rules prepared for documents from the
-- IMPACT project.
--
-- >>> import NLP.HistPL.Transliter.Impact
-- >>> putStrLn $ transliter impactRules "angol"
-- "anjoł"


module NLP.HistPL.Transliter.Impact
( impactRules
, module NLP.HistPL.Transliter 
) where

import Control.Applicative ((<$>), (<*))
import Text.Parsec hiding (Line)

import NLP.HistPL.Transliter

-- | A set of transliteration rules prepared for documents
-- from the IMPACT project. 
impactRules :: TrRules
impactRules = TrRules impactWordRules impactCharRules

-- Word-level transliteration rules 
impactWordRules :: [Parser String]
impactWordRules = 
    [ "y" #> "i"
    , "ztąd" #> "stąd"
    , "i" #> "j" >+> vowel >+> many1 anyChar
    , "ktor" #> "któr" >+> many1 anyChar
    , "gor" #> "gór" >+> many1 anyChar
    , "wtor" #> "wtór" >+> many1 anyChar
    , "rożn" #> "różn" >+> many1 anyChar
    , ("anyol" .| "angol" .|. "angyol" .|. "angiol" >#> "anjoł")
        >+> many anyChar ]

-- Character-level transliteration rules 
impactCharRules :: [Parser String]
impactCharRules =
    [ "à" .|  "á" .|. "á" .|. "â" .|. "ã" .|. "ä" .|. "ȧ" >#> "a"
    , "è" .|  "é" .|. "ê" .|. "ë" .|. "ē" .|. "ĕ"
          .|. "ė" .|. "ẽ" .|. "ə" >#> "e"
    , "ì" .|  "í" .|. "î" .|. "ï" >#> "i"
    , "ñ" #> "ń"
    , "z̄" #> "ż"
    , "ċ" #> "ć"
    , "ṡ" #> "ś"
    , "ow" #> "ów" <* eof
    , "æ" .| "œ" >#> "e"
    , "ⱥ" #> "ą"
    , "ɇ" .| "ẹ" >#> "ę"
    , "ø" #> "ą" -- lub ę
    , sChar >#> "s"   >+> (vowel <|> hardConsonant)
    , sChar >#> "ś"   >+> softConsonant
    , sChar >#> "s"   >+> str "k"
    -- , sChar >#> "s"   >+> str "ki"
    , "ʃʃ" .| "ſſ" .|. "ſs" .|. "ſz" .|. "β" .|. "ß" >#> "sz"
    , "\60122" #> "st"
    , "\60322" #> "si"
    , "st" #> "st"   -- czy to dobra reguła? 
    , "źi" .| "żi" >#> "zi"
    , "ći" .| "ċi" >#> "ci"
    , "ńi" .| "ṅi" >#> "ni"
    , "śi" #> "si"
    , "dźi" #> "dzi"
    , "cż" #> "cz"
    , "rż" #> "rz"
    , "sż" #> "sz"
    , str "e" >+> ("x" #> "gz")
    , "x" #> "ksz" >+> str "t"
    , "x" #> "ks"
    , (str "t" <|> str "p" <|> str "r") >+> ("h" #> "")
    , "&" #> "et"
    -- , vowel >+> ("y" #> "j") >+> consonant
    , vowel >+> ("y" #> "j")
    , vowel >+> ("i" #> "j") >+> vowel
    , hardConsonant >+> ("y" #> "yj") >+> vowel
    , softConsonant >+> ("y" #> "ij") >+> vowel
    , "ss" #> "s"
    , "szsz" #> "sz"
    , "mm" #> "m"
    , "ff" #> "f"
    , "pp" #> "p"
    , "ll" #> "l"
    , "tt" #> "t"
    , "v" #> "w" >+> vowel
    , "v" #> "u"
    , "ts" #> "c"
    , "ds" #> "dz"
    , "łi" #> "li"
    , "srz" #> "śrz"
    , "zrz" #> "źrz"
    , singleton <$> anyChar ] -- default
  where
    str = ciString
    sChar = "ʃ" .| "ſ" .|. "\60326" 

-- | FIXME: Samogłoski powinny obejmować także znaki historyczne?
vowel :: Parser String
vowel = singleton <$> oneOf "aąeęioóuy"

softConsonant :: Parser String
softConsonant = ciString "dź"
            <|> (singleton <$> oneOf "bmflśźńć") 

hardConsonant :: Parser String
hardConsonant = (singleton <$> oneOf "pwżcsztnrł")
            <|> (ciString "d" <* notFollowedBy (ciChar 'ź'))

-- consonant :: Parser String
-- consonant = singleton <$> oneOf "bcćdfghjklłmnńprsśtwzźż"

singleton :: a -> [a]
singleton = (:[])