{-# Language BangPatterns #-}
module NLP.RAKE.Stopwords (StopwordsMap,
                           mkStopwords, mkStopwordsStr,
                           smartStoplist, foxStoplist,

  import           Data.List (foldl')
  import           Data.Map  (Map)
  import qualified Data.Map  as M
  import           Data.Text (Text)
  import qualified Data.Text as T
  import qualified Data.Text.IO as TIO

  -- | Search tree for stop words: a 'Map' whose keys are the stop words
  --   themselves. The values are unit (@()@) and carry no information;
  --   only the presence of a key matters.
  type StopwordsMap = Map Text ()

  -- | Build a 'StopwordsMap' from stop words given as 'Text'.
  --   Every word becomes a key of the map; a word that occurs more
  --   than once in the input still yields a single entry.
  mkStopwords :: [Text] -> StopwordsMap
  mkStopwords ws = M.fromList (zip ws (repeat ()))

  -- | Build a 'StopwordsMap' from stop words given as 'String'.
  --   Each word is packed into 'Text' and passed on to 'mkStopwords'.
  mkStopwordsStr :: [String] -> StopwordsMap
  mkStopwordsStr strs = mkStopwords [T.pack s | s <- strs]

  -- | Decide whether a chunk of 'Text' counts as a stop word.
  --   A chunk qualifies when it is a key of the 'StopwordsMap'.
  --   If it is not, it still counts as a stop word when it is an
  --   element of the /nolist/ (e.g. \"-\").
  stopword :: StopwordsMap -> NoList -> Text -> Bool
  stopword stops nolist chunk
    | chunk `M.member` stops = True
    | otherwise              = chunk `elem` nolist

  -- | Load a stop word list from a file.
  --   Every line of the file is normalised with 'ignoreWhitespace' and
  --   lower-cased. Results that are empty or that start with @#@ are
  --   discarded; the remaining ones become the stop words.
  loadStopWords :: FilePath -> IO StopwordsMap
  loadStopWords f = do
    !contents <- TIO.readFile f
    let tokens    = [T.toLower (ignoreWhitespace l) | l <- T.lines contents]
        isEntry t = not (T.null t || chash `T.isPrefixOf` t)
    return (mkStopwords (filter isEntry tokens))

  -- | The default stop word list; currently simply an alias for
  --   'smartStoplist'.
  defaultStoplist :: StopwordsMap
  defaultStoplist = smartStoplist

  -- | The /nolist/: symbols in this list count as stop words
  --   independently of the chosen stop word list.
  --   This list can be used to exclude very specific \"words\"
  --   that may occur in a given domain, for instance
  --   mathematical formulas and symbols.
  --   It is a plain list of 'Text'.
  type NoList = [Text]
  -- | The default /nolist/. At present its only member is the
  --   symbol \"-\".
  defaultNolist :: NoList
  defaultNolist = [T.pack "-"]

  -- The hash character (#) as a one-character 'Text'.
  chash :: Text
  chash = T.pack "#"

  -- Extract the first whitespace-delimited token of a line.
  -- Leading whitespace is skipped and everything from the next
  -- whitespace character onwards is dropped; a blank line yields the
  -- empty 'Text'.
  -- Every whitespace character recognised by 'T.words' counts, not
  -- just the ASCII space: a tab-indented entry, or the trailing
  -- carriage return of a CRLF-terminated line, must not end up inside
  -- the token, otherwise the stored stop word could never be matched.
  ignoreWhitespace :: Text -> Text
  ignoreWhitespace line = case T.words line of
    (token : _) -> token
    []          -> T.empty

  -- | The \"smart\" stop word list
  smartStoplist :: StopwordsMap
  smartStoplist = mkStopwordsStr [

  -- | The \"Fox\" stop word list
  foxStoplist :: StopwordsMap
  foxStoplist = mkStopwordsStr [