{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns      #-}
-- TODO : learn how to use Functional Morphology instead

-- | Module    : NLP.Minimorph.English
-- Copyright   : 2012 Eric Kow (Computational Linguistics Ltd.)
-- License     : BSD3
-- Maintainer  : eric.kow@gmail.com
-- Stability   : experimental
-- Portability : portable
--
-- Simple default rules for English morphology
module NLP.Minimorph.English where

import           Data.Char          (toLower, isSpace, isUpper)
import           Data.Text          (Text)
import qualified Data.Text          as T

import           NLP.Minimorph.Util

-- ---------------------------------------------------------------------
-- ** Punctuation
-- ---------------------------------------------------------------------

-- | No Oxford commas, alas.
--
-- > commas "and" "foo bar"       == "foo and bar"
-- > commas "and" "foo, bar, baz" == "foo, bar and baz"
commas :: Text -> [Text] -> Text
commas _ []  = ""
commas _ [x] = x
commas et xs = T.intercalate ", " (init xs) <+> et <+> last xs

-- ---------------------------------------------------------------------
-- ** Numbers
-- ---------------------------------------------------------------------

-- | > cardinal 1 == "one"
--   > cardinal 2 == "two"
--   > cardinal 3 == "three"
--   > cardinal 11 == "11"
cardinal :: Int -> Text
cardinal n = case n of
    1  -> "one"
    2  -> "two"
    3  -> "three"
    4  -> "four"
    5  -> "five"
    6  -> "six"
    7  -> "seven"
    8  -> "eight"
    9  -> "nine"
    10 -> "ten"
    _ -> T.pack (show n)

-- | > ordinal 1 == "first"
--   > ordinal 2 == "second"
--   > ordinal 3 == "third"
--   > ordinal 11 == "11th"
--   > ordinal 42 == "42nd"
ordinal :: Int -> Text
ordinal n = case n of
    1  -> "first"
    2  -> "second"
    3  -> "third"
    4  -> "fourth"
    5  -> "fifth"
    6  -> "sixth"
    7  -> "seventh"
    8  -> "eighth"
    9  -> "ninth"
    10 -> "tenth"
    n | n < 21          -> n `suf` "th"
      | n `rem` 10 == 2 -> n `suf` "nd"
      | n `rem` 10 == 3 -> n `suf` "rd"
      | otherwise       -> n `suf` "th"
  where
    n `suf` s = T.pack (show n) <> s

-- ---------------------------------------------------------------------
-- ** Nouns and verbs
-- ---------------------------------------------------------------------

-- | Heuristics for English plural for an unknown noun
--
-- > defaultNounPlural "egg"    == "eggs"
-- > defaultNounPlural "patch"  == "patches"
-- > defaultNounPlural "boy"    == "boys"
-- > defaultNounPlural "spy"    == "spies"
-- > defaultNounPlural "thesis" == "theses"
--
-- http://www.paulnoll.com/Books/Clear-English/English-plurals-1.html
defaultNounPlural :: Text -> Text
defaultNounPlural x
    | "is" `T.isSuffixOf` x = thesis
    | hasSibilantSuffix x   = es
    | hasCySuffix x         = y_ies
    | "f"  `T.isSuffixOf` x = f_ves
    | otherwise             = plain
  where
    plain  = x            <> "s"
    es     = x            <> "es"
    y_ies  = T.init x     <> "ies"
    f_ves  = T.init x     <> "ves"
    thesis = tDropEnd 2 x <> "es"

-- | Heuristics for 3rd person singular and past participle
--   for an unknown regular verb
--
-- > defaultVerbStuff "walk"  == ("walks",  "walked")
-- > defaultVerbStuff "push"  == ("pushes", "pushed")
-- > defaultVerbStuff "play"  == ("plays",  "played")
-- > defaultVerbStuff "cry"   == ("cries",  "cried")
defaultVerbStuff :: Text -> (Text, Text)
defaultVerbStuff v
    | hasSibilantSuffix v   = sibilant_o v
    | "o" `T.isSuffixOf` v  = sibilant_o v
    | "e" `T.isSuffixOf` v  = e_final v
    | hasCySuffix v         = y_final v
    | otherwise             = plain v
  where
    plain x      = (x <> "s"         , x <> "ed")
    sibilant_o x = (x <> "es"        , x <> "ed")
    e_final    x = (x <> "s"         , x <> "d")
    y_final    x = (T.init x <> "ies", T.init x <> "ied")

-- ---------------------------------------------------------------------
-- ** Determiners
-- ---------------------------------------------------------------------

-- | > indefiniteDet "dog"  == "a"
--   > indefiniteDet "egg"  == "an"
--   > indefiniteDet "ewe"  == "a"
--   > indefiniteDet "ewok" == "an"
--   > indefiniteDet "8th"  == "an"
indefiniteDet :: Text -> Text
indefiniteDet t = if wantsAn t then "an" else "a"

-- | True if the indefinite determiner for a word would normally be
--   'an' as opposed to 'a'
wantsAn :: Text -> Bool
wantsAn t_ =
    if startsWithAcronym t_
       then acronymWantsAn t_
       else useAn0 || useAn1 || useAn2
  where
    t      = T.toLower t_
    useAn0 = t `elem` [ "11", "11th" ]
    useAn1 = case T.uncons t of
                Just ('8',_) -> True
                Just (h,_)   -> isVowel h `butNot` hasSemivowelPrefix t
                Nothing      -> False
    useAn2 = case T.break isSep t of
                (T.unpack -> [c], _) -> isLetterWithInitialVowelSound c
                _ -> False
    x `butNot` y = x && not y
    isSep c = isSpace c || c `elem` "-"

-- | Variant of 'wantsAn' that assumes the input string is pronounced
--   one letter at a time.
--
--   > wantsAn        "x-ray" == False
--   > acronymWantsAn "x-ray" == True
--
--   Note that this won't do the right thing for words like @"SCUBA"@
--   You really have to reserve it for those separate-letter acronyms
acronymWantsAn :: Text -> Bool
acronymWantsAn (T.toLower -> t) =
    useAn0 || useAn1
  where
    useAn0 = t `elem` [ "11", "11th" ]
    useAn1 = case T.uncons t of
                Just ('8',_) -> True
                Just (h,_)   -> isLetterWithInitialVowelSound h
                Nothing      -> False

-- ---------------------------------------------------------------------
-- ** Acronyms
-- ---------------------------------------------------------------------

-- | True if all upper case from second letter and up
--
--   > looksLikeAcronym "DNA"  == True
--   > looksLikeAcronym "tRNA" == True
--   > looksLikeAcronym "DnA"  == False
looksLikeAcronym :: Text -> Bool
looksLikeAcronym x = T.all isUpper (T.drop 1 x)

-- | True if the first word (separating on either - or space)
--   looks like an acronym
startsWithAcronym :: Text -> Bool
startsWithAcronym =
    looksLikeAcronym . firstWord
  where
    firstWord = fst . T.break isSep
    isSep c   = isSpace c || c `elem` "-"

-- ---------------------------------------------------------------------
-- ** Sounds
-- ---------------------------------------------------------------------

-- | Ends with a sh sound
hasSibilantSuffix :: Text -> Bool
hasSibilantSuffix x = any (`T.isSuffixOf` x) ["x","s","ch","sh"]

-- | Starts with a semivowel
hasSemivowelPrefix :: Text -> Bool
hasSemivowelPrefix ls = any (`T.isPrefixOf` ls) ["y","w","eu","ewe"]

-- | Last two letters are a consonant and 'y'
hasCySuffix :: Text -> Bool
hasCySuffix (T.unpack . tTakeEnd 2 -> [x, 'y']) = isConsonant x
hasCySuffix _ = False

-- | Is a vowel
isVowel :: Char -> Bool
isVowel = (`elem` "aeiou") . toLower

-- | Letters that when pronounced independently in English sound like they
--   begin with vowels
--
--   > isLetterWithInitialVowelSound 'r' == True
--   > isLetterWithInitialVowelSound 'k' == False
--
--   (In the above, @'r'@ is pronounced @"are"@, but @'k'@ is pronounced
--   @"kay"@)
isLetterWithInitialVowelSound :: Char -> Bool
isLetterWithInitialVowelSound = (`elem` "aeioufhlmnrsx") . toLower

-- | Is a consonant
isConsonant :: Char -> Bool
isConsonant = not . isVowel