{-# LANGUAGE OverloadedStrings #-}
{-|
Module      : Text.Pronounce.ParseDict
Description : Module for parsing the CMU Dictionary
Copyright   : (c) Noah Goodman, 2018
License     : BSD3
Stability   : experimental

This module has functions for parsing the CMU pronouncing dictionary, and
exports the @CMUdict@ type and the function @initDict@ to the main module
"Text.Pronounce"
-}
module Text.Pronounce.ParseDict
    ( Entry
    , Phones
    , CMUdict
    , DictSource(..)
    , initDict
    , stdDict
    , parseDict
    , parseLine
    ) where

import Paths_pronounce

import Control.Arrow ((***))
import Data.Binary (Binary, decodeFile)
import Data.Char
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe (listToMaybe, mapMaybe)
import Data.Text.Encoding
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import System.FilePath
import Text.ParserCombinators.ReadP

-- | Represents an entry word in the CMU pronouncing dictionary (simply an
-- alias for @Text@ to improve type specificity and readability).
type Entry = Text

-- | Represents the phonetic breakdown of a word as a list of phone symbols,
-- one @Text@ per whitespace-separated token of the dictionary line.
type Phones = [Text]

-- | A Map from @Entry@s to lists of possible pronunciations (@Phones@),
-- serving as our representation of the CMU Pronouncing Dictionary.
type CMUdict = Map Entry [Phones]

-- | Options for the initial source of the CMUdict. Currently, we can either
-- parse from a plaintext file ('PlainText') or load preprocessed binary data
-- ('Binary').
data DictSource = PlainText | Binary

-- | Initializes the CMU pronunciation dictionary into our program.
--
-- The first argument is an optional path to the dictionary file. When it is
-- 'Nothing', the data file bundled with the package is used: @"cmubin"@ for
-- 'Binary' and @"cmuutf"@ for 'PlainText', both located via
-- 'getDataFileName'.
--
-- The second argument selects how the file is interpreted: 'Binary' decodes
-- it with 'decodeFile', 'PlainText' reads it as text and runs 'parseDict'.
initDict :: Maybe FilePath -> DictSource -> IO CMUdict
initDict path dictSource = case dictSource of
    Binary    -> decodeFile =<< resolve "cmubin"
    PlainText -> parseDict <$> (T.readFile =<< resolve "cmuutf")
  where
    -- Use the caller-supplied path if there is one, otherwise fall back to
    -- the named data file shipped with the package.
    resolve :: FilePath -> IO FilePath
    resolve bundledName = maybe (getDataFileName bundledName) return path

-- | Default settings for @initDict@: the bundled dictionary, loaded from its
-- binary form.
stdDict :: IO CMUdict
stdDict = initDict Nothing Binary

-- | Go through all the lines of the dictionary, parsing each entry and
-- inserting it into the map data structure.
--
-- Empty lines and lines whose first character is @';'@ are skipped, as are
-- lines that 'parseLine' cannot parse, so malformed input never raises an
-- exception.
--
-- Multiple entries for the same word are merged with @'Map.fromListWith' (++)@.
-- That function combines values as @new ++ old@, so pronunciations that occur
-- later in the input come first in the resulting list.
parseDict :: T.Text -> CMUdict
parseDict = Map.fromListWith (++) . mapMaybe parseEntry . filter isEntryLine . T.lines
  where
    -- A line carries an entry when it is non-empty and does not start with ';'.
    isEntryLine :: Text -> Bool
    isEntryLine line = case T.uncons line of
        Just (c, _) -> c /= ';'
        Nothing     -> False

    -- Run 'parseLine' over one line and keep the first parse it reports, with
    -- the pronunciation split into phones on whitespace. The pronunciation is
    -- wrapped in a singleton list so that 'Map.fromListWith' can concatenate
    -- alternatives.
    parseEntry :: Text -> Maybe (Entry, [Phones])
    parseEntry line = toEntry . fst <$> listToMaybe (readP_to_S parseLine (T.unpack line))

    toEntry :: (String, String) -> (Entry, [Phones])
    toEntry (word, phones) = (T.pack word, [T.words (T.pack phones)])

-- | Parses a line in the dictionary, returning a @(key, val)@ pair and
-- ignoring the parenthetical part (e.g. @(1)@) after the key if it exists.
--
-- The key is followed by a separating space; everything after that space is
-- returned verbatim as the value. The parser is ambiguous when the line
-- contains more than one space, so 'readP_to_S' may report several parses;
-- 'parseDict' uses the first one it returns.
parseLine :: ReadP (String, String)
parseLine =
    (,) <$> many get
        <*  (paren <++ string "")
        <*  string " "
        <*> munch (const True)

-- Helper function to parse a (possibly empty) run of digits in between
-- parentheses, returning the digits.
paren :: ReadP String
paren = char '(' *> munch isDigit <* char ')'