{-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_HADDOCK hide #-} -- | -- Module: Language.Gender.Dictionary -- Copyright: (c) 2013 Kranium Gikos Mendoza -- (c) 2007-2008, Jörg Michael -- License: LGPL -- Maintainer: Kranium Gikos Mendoza -- Stability: experimental -- Portability: portable -- module Language.Gender.Dictionary ( Gender(..) , parseDictionary ) where import Language.Gender.Types import Control.Applicative import Data.Attoparsec.Text import Data.Char (chr) import Data.Text hiding (filter,take) import Prelude hiding (concat,take) data DictLine = Comment Text | Mapping (Char,[Text]) | Name Text Gender (Maybe Column30) | EquivName Text Text deriving (Show) data Column30 = CompressedUmlaut | ExpandedUmlaut | ArabicPseudoVowel deriving (Show) -- | Parses Text with the nam_dict format into a lookup table. parseDictionary text = case parseOnly dict text of Left _ -> [] Right xs -> do let mappings = mapValToKeys $ filter isMapping xs rawnames = filter isName xs Prelude.map (nameToTuple mappings) rawnames mapValToKeys = Prelude.concatMap valToKeys -- [(k,vs)] -> [(v,k)] valToKeys (Mapping (k,vs)) = Prelude.map (\v -> (v,k)) vs valToKeys _ = [] isName Name{} = True isName _ = False nameToTuple m (Name n g c30) = (expandUmlaut m n,g) nameToTuple _ _ = error "Language.Gender.Dictionary: nameToTuple only for Name type." expandUmlaut m text = do let result = parseOnly (many umlautName) text umlautName = umlautPart <|> namePart umlautPart = do char '<' umlaut <- takeWhile1 (not . isBracket) char '>' case lookup umlaut m of Just x -> return $ singleton x Nothing -> return umlaut namePart = takeWhile1 (not . isBracket) isBracket = flip elem "<>" case result of Left _ -> text Right xs -> concat xs isMapping (Mapping _) = True isMapping _ = False dict = many dictLine <* endOfInput dictLine = dictMapping <|> dictComment <|> dictName <|> dictEquivName dictMapping = do xs <- (,) <$> key <*> some value <* takeTill isEndOfLine <* endOfLine return $ Mapping xs key = do char '#' skipWhile isHorizontalSpace c <- decimal skipSpace char '=' return $ chr c value = valOr <|> val where val = skipSpace *> char '<' *> takeTill (== '>') <* char '>' valOr = val <* string " or" dictComment = Comment <$> (char '#' *> skipWhile isHorizontalSpace *> takeTill isEndOfLine <* endOfLine) dictName = do gender <- genderParser skipSpace name <- fmap stripEnd $ take 26 c30 <- c30Parser -- TODO: implement this for umlaut expansion/compression skipWhile $ not . isEndOfLine endOfLine return $ Name name gender c30 dictEquivName = EquivName <$> (char '=' *> skipWhile isHorizontalSpace *> takeTill isHorizontalSpace) <*> (skipWhile isHorizontalSpace *> takeTill isEndOfLine <* endOfLine) c30Parser = char '-' *> pure (Just CompressedUmlaut) <|> char '+' *> pure (Just ExpandedUmlaut) <|> char '_' *> pure (Just ArabicPseudoVowel) <|> char ' ' *> pure Nothing genderParser = string "1F" *> pure Female <|> string "F " *> pure Female <|> string "1M" *> pure Male <|> string "M " *> pure Male <|> string "? " *> pure Unisex <|> string "?F" *> pure MostlyFemale <|> string "?M" *> pure MostlyMale