{-# LANGUAGE OverloadedStrings #-}
{-# OPTIONS_HADDOCK hide #-}
-- |
-- Module:      Language.Gender.Dictionary
-- Copyright:   (c) 2013 Kranium Gikos Mendoza
--              (c) 2007-2008, Jörg Michael
-- License:     LGPL
-- Maintainer:  Kranium Gikos Mendoza <kranium@gikos.net>
-- Stability:   experimental
-- Portability: portable
--

module Language.Gender.Dictionary
  ( Gender(..)
  , parseDictionary
  ) where
import Language.Gender.Types
import Control.Applicative
import Data.Attoparsec.Text
import Data.Char                    (chr)
import Data.Text             hiding (filter,take)
import Prelude               hiding (concat,take)

data DictLine = Comment  Text
              | Mapping (Char,[Text])
              | Name     Text Gender (Maybe Column30)
              | EquivName Text Text
              deriving (Show)

data Column30 = CompressedUmlaut
              | ExpandedUmlaut
              | ArabicPseudoVowel
              deriving (Show)

-- | Parses Text with the nam_dict format into a lookup table.
parseDictionary text =
  case parseOnly dict text of
    Left   _ -> []
    Right xs -> do
      let mappings = mapValToKeys $ filter isMapping xs
          rawnames = filter isName xs
      Prelude.map (nameToTuple mappings) rawnames

mapValToKeys = Prelude.concatMap valToKeys -- [(k,vs)] -> [(v,k)]

valToKeys (Mapping (k,vs)) = Prelude.map (\v -> (v,k)) vs
valToKeys  _               = []

isName Name{} = True
isName _      = False

nameToTuple m (Name n g c30) = (expandUmlaut m n,g)
nameToTuple _ _              = error "Language.Gender.Dictionary: nameToTuple only for Name type."

expandUmlaut m text = do
  let result = parseOnly (many umlautName) text
      umlautName = umlautPart <|> namePart
      umlautPart = do char '<'
                      umlaut <- takeWhile1 (not . isBracket)
                      char '>'
                      case lookup umlaut m of
                        Just x  -> return $ singleton x
                        Nothing -> return umlaut
      namePart = takeWhile1 (not . isBracket)
      isBracket = flip elem "<>"
  case result of
    Left   _ -> text
    Right xs -> concat xs

isMapping (Mapping _) = True
isMapping  _          = False

dict = many dictLine <* endOfInput

dictLine = dictMapping
       <|> dictComment
       <|> dictName
       <|> dictEquivName

dictMapping = do
  xs <- (,) <$> key <*> some value <* takeTill isEndOfLine <* endOfLine
  return $ Mapping xs

key = do char '#'
         skipWhile isHorizontalSpace
         c <- decimal
         skipSpace
         char '='
         return $ chr c

value = valOr <|> val
  where val   = skipSpace *> char '<' *> takeTill (== '>') <* char '>'
        valOr = val <* string " or"

dictComment = Comment <$> (char '#' *> skipWhile isHorizontalSpace *> takeTill isEndOfLine <* endOfLine)

dictName = do
  gender <- genderParser
  skipSpace
  name <- fmap stripEnd $ take 26
  c30 <- c30Parser -- TODO: implement this for umlaut expansion/compression
  skipWhile $ not . isEndOfLine
  endOfLine
  return $ Name name gender c30

dictEquivName = EquivName <$> (char '=' *> skipWhile isHorizontalSpace *> takeTill isHorizontalSpace)
                          <*> (skipWhile isHorizontalSpace *> takeTill isEndOfLine <* endOfLine)

c30Parser = char '-' *> pure (Just CompressedUmlaut)
        <|> char '+' *> pure (Just ExpandedUmlaut)
        <|> char '_' *> pure (Just ArabicPseudoVowel)
        <|> char ' ' *> pure  Nothing

genderParser = string "1F" *> pure Female
           <|> string "F " *> pure Female
           <|> string "1M" *> pure Male
           <|> string "M " *> pure Male
           <|> string "? " *> pure Unisex
           <|> string "?F" *> pure MostlyFemale
           <|> string "?M" *> pure MostlyMale