{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE LambdaCase #-}

{-|
Module      : Text.Pronounce.ParseDict
Description : Module for parsing the CMU Dictionary
Copyright   : (c) Noah Goodman, 2018
License     : BSD3
Stability   : experimental

This module has functions for parsing the CMU pronouncing dictionary, and exports the 
@CMUdict@ type and the function @initDict@ to the main module "Text.Pronounce"
-}


module Text.Pronounce.ParseDict 
    ( CMUdict
    , UsesBin
    , initDict
    , stdDict
    , parseDict
    , parseLine
    ) where

import Paths_pronounce
import System.FilePath
import Text.ParserCombinators.ReadP
import Data.Char
import Data.Text.Encoding
import Data.Binary (decodeFile)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import qualified Data.Map as Map

-- | A Map from Entries to lists of possible pronunciations, serving as our
-- representation of the CMU Pronouncing Dictionary
type CMUdict = Map.Map T.Text [T.Text]

-- | A type used to represent the option of decoding the dictionary from a
-- binary file or parsing it from text
type UsesBin = Bool

-- | Initializes the cmu pronunctiation dictionary into our program, given an
-- optional file name of the dictionary
initDict :: Maybe FilePath -> UsesBin -> IO CMUdict
initDict path = \case
    True ->
        case path of 
          Just p ->
              return . Map.mapKeys decodeUtf8 . fmap (map decodeUtf8) =<< decodeFile p
          Nothing ->
              return . Map.mapKeys decodeUtf8 . fmap (map decodeUtf8) =<< decodeFile =<< getDataFileName "cmubin"
    False ->
        case path of 
          Just p -> 
              return . parseDict =<< T.readFile p
          Nothing -> 
              return . parseDict =<< T.readFile =<< getDataFileName "cmuutf"

-- | Default settings for initDict
stdDict :: IO CMUdict
stdDict = initDict Nothing True

-- | Go through all the entries in the dictionary, parsing, and inserting into
-- the map data structure
parseDict :: T.Text -> CMUdict
parseDict = Map.fromListWith (++) . map packAndParse . filter ((/= ';') . T.head) . T.lines
    where packAndParse = (\(a,b) -> (T.pack a, [T.pack b])) . fst . head . readP_to_S parseLine . T.unpack

-- | Parses a line in the dictionary, returning as (key,val) pair, ignoring
-- parenthetical part if it exists
parseLine :: ReadP (String, String)
parseLine = (,) <$> (many get) <* (paren <++ string "") <* string "  "
                <*> (munch . const $ True)

-- Helper function to parse numbers in between parentheses
paren :: ReadP String
paren = char '(' *> munch isDigit <* char ')'