-- | The module provides a simple embedded domain specific language for
-- defining transliteration rules.  All parsers are case-insensitive
-- by default.


module NLP.HistPL.Transliter
( 
-- * Transliteration
  TrRules (..)
, transliter

-- * Parsers
, Parser
, ciString
, ciChar

-- * Operators
, (#>)
, (>#>)
, (>+>)
, (.|)
, (.|.)
) where

import Control.Applicative ((<$>), (<*>), (<*))
import Control.Monad (msum)
import Data.Char (isUpper, toUpper, toLower)
import Text.Parsec hiding (Line)

-- | A parser data type.
type Parser = Parsec String ()

-- | Case insensitive character parser.
ciChar :: Char -> Parser Char
ciChar c = char (toLower c) <|> char (toUpper c)

-- | Case insensitive string parser.
ciString :: String -> Parser String
ciString = sequence . map ciChar

data Shape = Lower
           | Capitalized 
           | Upper

getShape :: String -> Shape
getShape [] = Lower
getShape xs@(x:_)
    | all isUpper xs = Upper
    | isUpper x      = Capitalized
    | otherwise      = Lower 

applyShape :: Shape -> String -> String
applyShape _ [] = []
applyShape Lower xs = map toLower xs
applyShape Upper xs = map toUpper xs
applyShape Capitalized (x:xs) = toUpper x : map toLower xs

-- | A transliteration rule, e.g. (\"abc\" #> \"bcd\") will
-- substitute all \"abc\" (sub)string instances with \"bcd\".
(#>) :: String -> String -> Parser String
(#>) p x = ciString p >#> x

-- | Similar to `#>`, but this function allows to define a custom
-- parser for the string which should be substituted with another
-- string.
(>#>) :: Parser String -> String -> Parser String
(>#>) p x = p >>= \y -> return (applyShape (getShape y) x)

-- | Concatentation of parsers.
(>+>) :: Parser String -> Parser String -> Parser String
(>+>) p p' = (++) <$> p <*> p'

-- | OR parser, i.e. a parser which tries to match the first string argument,
-- and only tries the second one if the first match failed.
(.|) :: String -> String -> Parser String
(.|) x y = try (ciString x) <|> try (ciString y)
-- FIXME: Gdy długość napisu jest <= 1, nie jest potrzebna funkcja try.

-- | Similar to `.|`, but accepts a parser as the first argument.
(.|.) :: Parser String -> String -> Parser String
(.|.) p y = p <|> try (ciString y)
-- FIXME: Gdy długość napisu jest <= 1, nie jest potrzebna funkcja try.

-- | A set of transliteration rules.
data TrRules = TrRules {
    -- | Word-level rule is applied only when it matches the entire word.
      wordRules :: [Parser String]
    -- | Character-level rule is always applied when a match is found.
    , charRules :: [Parser String]
    }

wordParser :: TrRules -> Parser String
wordParser rules = 
    perWord <|> perChar
  where
    perWord = msum $ map (\p -> try $ p <* eof) (wordRules rules)
    perChar = (concat <$> many1 (msum $ map try (charRules rules))) <* eof

parseWord :: TrRules -> String -> String
parseWord rules x =
  case parse (wordParser rules) x x of
    Left err -> error $ "parseWord: " ++ show err
    Right y  -> y

-- | Transliterate the word with the given set of transliteration rules.
transliter :: TrRules -> String -> String
transliter rules x =
  case takeWhile (not . eq) ps of
    [] -> x
    xs -> snd $ last $ xs
  where
    ys = iterate (parseWord rules) x
    ps = zip ys $ tail ys
    eq = uncurry (==)