module Language.Gender.Dictionary
( Gender(..)
, parseDictionary
) where
import Language.Gender.Types
import Control.Applicative
import Data.Attoparsec.Text
import Data.Char (chr)
import Data.Text hiding (filter,take)
import Prelude hiding (concat,take)
data DictLine = Comment Text
| Mapping (Char,[Text])
| Name Text Gender (Maybe Column30)
| EquivName Text Text
deriving (Show)
data Column30 = CompressedUmlaut
| ExpandedUmlaut
| ArabicPseudoVowel
deriving (Show)
parseDictionary text =
case parseOnly dict text of
Left _ -> []
Right xs -> do
let mappings = mapValToKeys $ filter isMapping xs
rawnames = filter isName xs
Prelude.map (nameToTuple mappings) rawnames
mapValToKeys = Prelude.concatMap valToKeys
valToKeys (Mapping (k,vs)) = Prelude.map (\v -> (v,k)) vs
valToKeys _ = []
isName Name{} = True
isName _ = False
nameToTuple m (Name n g c30) = (expandUmlaut m n,g)
nameToTuple _ _ = error "Language.Gender.Dictionary: nameToTuple only for Name type."
expandUmlaut m text = do
let result = parseOnly (many umlautName) text
umlautName = umlautPart <|> namePart
umlautPart = do char '<'
umlaut <- takeWhile1 (not . isBracket)
char '>'
case lookup umlaut m of
Just x -> return $ singleton x
Nothing -> return umlaut
namePart = takeWhile1 (not . isBracket)
isBracket = flip elem "<>"
case result of
Left _ -> text
Right xs -> concat xs
isMapping (Mapping _) = True
isMapping _ = False
dict = many dictLine <* endOfInput
dictLine = dictMapping
<|> dictComment
<|> dictName
<|> dictEquivName
dictMapping = do
xs <- (,) <$> key <*> some value <* takeTill isEndOfLine <* endOfLine
return $ Mapping xs
key = do char '#'
skipWhile isHorizontalSpace
c <- decimal
skipSpace
char '='
return $ chr c
value = valOr <|> val
where val = skipSpace *> char '<' *> takeTill (== '>') <* char '>'
valOr = val <* string " or"
dictComment = Comment <$> (char '#' *> skipWhile isHorizontalSpace *> takeTill isEndOfLine <* endOfLine)
dictName = do
gender <- genderParser
skipSpace
name <- fmap stripEnd $ take 26
c30 <- c30Parser
skipWhile $ not . isEndOfLine
endOfLine
return $ Name name gender c30
dictEquivName = EquivName <$> (char '=' *> skipWhile isHorizontalSpace *> takeTill isHorizontalSpace)
<*> (skipWhile isHorizontalSpace *> takeTill isEndOfLine <* endOfLine)
c30Parser = char '-' *> pure (Just CompressedUmlaut)
<|> char '+' *> pure (Just ExpandedUmlaut)
<|> char '_' *> pure (Just ArabicPseudoVowel)
<|> char ' ' *> pure Nothing
genderParser = string "1F" *> pure Female
<|> string "F " *> pure Female
<|> string "1M" *> pure Male
<|> string "M " *> pure Male
<|> string "? " *> pure Unisex
<|> string "?F" *> pure MostlyFemale
<|> string "?M" *> pure MostlyMale