src/NLP/RAKE/Text.hs

-------------------------------------------------------------------------------
-- |
-- Module     : NLP.RAKE.Text.hs
-- Copyright  : (c) Tobias Schoofs
-- License    : LGPL 
-- Stability  : experimental
-- Portability: portable
-- The RAKE Text interface. (Currently the only one...)
-------------------------------------------------------------------------------
module NLP.RAKE.Text (
                      -- * Keywords
                      WordScore, 
                      candidates, keywords, 

                      -- * Utitlities
                      sortByScore, sortByWord,
                      pSplitter,

                      -- * Resources
                      NoSplit,
                      defaultNosplit,
                      enNosplit, numNosplit, othNosplit,
                      latin1Nosplit, latinExAnosplit, latinExBnosplit,
                      greekNosplit,cyrillicNosplit,

                      -- * Stopwords
                      -- $Stopwords
                      StopwordsMap,
                      mkStopwords, mkStopwordsStr,
                      loadStopWords,
                      stopword,
                      defaultStoplist,
                      smartStoplist, foxStoplist,
                      NoList,
                      defaultNolist)
                      
where

  import           Data.List (foldl',sortBy,nub)
  import           Data.Char (isDigit,isPunctuation)
  import           Data.Text (Text)
  import qualified Data.Text as T
  import           Data.Map (Map)
  import qualified Data.Map as M

  import           NLP.RAKE.Stopwords
  import           NLP.RAKE.Resources

  -------------------------------------------------------------------------
  -- import Debug.Trace (trace)
  -------------------------------------------------------------------------

  {- $Stopwords

     The very heart of the RAKE algorithm is the use of stop words,
     a concept defined by NLP pioneer Hans Peter Luhn.
     Stop words are frequent words in a language that are considered
     to be void of specific semantics. They, of course, have 
     an important role in the language, but they do not
     help to determine the topic a specific document is about, e.g.
     \"is\", \"the\", \"of\" and so on.
     Stop words depend on the specific context of the documents
     to be analysed; there are, however, frequently used lists
     with wide applicability.

     The library comes with two stop word lists built in:
     the 'smartStoplist' and the 'foxStoplist', both for English.
     The list used by default is 'smartStoplist'.

     The user is free to define her own stop word list,
     which can be loaded from a file using 'loadStopWords'.
     The file format is simple:
     
       * Lines starting with \'#\' are ignored (comments);

       * Each line contains one word. 
  -}

  -------------------------------------------------------------------------
  -- | The result is a keyword candidate,
  --   a keyword consisting of one or more words
  --   and a score associated with this keyword.
  -------------------------------------------------------------------------
  type WordScore = (Text,Double)

  -------------------------------------------------------------------------
  -- | This interface provides most flexibility.
  --   It expects a 'Map' of stop words, a /nosplit/ list
  --   used by the word splitter,
  --   an additional list of words or symbols 
  --   you want to exclude for a specific document
  --   and a text split into phrases.
  --   Users may pass in their own stop word list 
  --   (e.g. by loading it from a file, see 'loadStopWords')
  --   or one of the predefined lists ('smartStopwords', 'foxStopwords').
  -------------------------------------------------------------------------
  candidates :: StopwordsMap -> NoSplit -> NoList -> [Text] -> [WordScore]
  candidates m nsp nl ps = let ks = concatMap (kfinder m nsp nl) ps
                            in sortByScore $ nub (kwScores ks)

  -------------------------------------------------------------------------
  -- | The 'keywords' function is a convenience interface
  --   that takes a couple of decisions internally:
  --   it uses the 'defaultStoplist', the English language 
  --   /nosplit/ list, the default 'nolist' and it splits the text
  --   into phrases using the 'pSplitter'.
  --
  --   The function is equivalent to
  --  
  --   > candidates defaultStoplist defaultNosplit defaultNolist . pSplitter
  --
  -------------------------------------------------------------------------
  keywords :: Text -> [WordScore]
  keywords = candidates defaultStoplist 
                        defaultNosplit 
                        defaultNolist   . pSplitter

  -------------------------------------------------------------------------
  -- | Sort the 'WordScore' list by scores (descending!)
  -------------------------------------------------------------------------
  sortByScore :: [WordScore] -> [WordScore]
  sortByScore = sortBy bySnd
    where bySnd (_,b1) (_,b2) = compare b2 b1

  -------------------------------------------------------------------------
  -- | Sort the 'WordScore' list by words (ascending!)
  -------------------------------------------------------------------------
  sortByWord :: [WordScore] -> [WordScore]
  sortByWord = sortBy byFst
    where byFst (a1,_) (a2,_) = compare a1 a2

  -------------------------------------------------------------------------
  -- List of Chars containing exceptions from 'isPunctuation'
  -------------------------------------------------------------------------
  nopunc :: String
  nopunc = "-"

  -------------------------------------------------------------------------
  -- Text ' '
  -------------------------------------------------------------------------
  space :: Text
  space = T.singleton ' '

  -------------------------------------------------------------------------
  -- Word Splitter
  -------------------------------------------------------------------------
  wSplitter :: String -> Text -> [Text]
  wSplitter nosplit = go []
    where go t cs | T.null cs && null t      = []
                  | T.null cs                = [mkw t]
                  | T.head cs `elem` nosplit = go (T.head cs:t) (T.tail cs)
                  | otherwise                = 
                    if null t then         go [] (T.tail cs)
                              else mkw t : go [] (T.tail cs)
          mkw = T.pack . reverse

  -------------------------------------------------------------------------
  -- | Default phrase splitter. It splits phrases at characters
  --   in the punctuation category 
  --   (those for which 'isPunctuation' is 'True') 
  --   with the exception of \'-\'.
  -------------------------------------------------------------------------
  pSplitter :: Text -> [Text]
  pSplitter = go []
    where go t cs | T.null cs && null t     = []
                  | T.null cs               = [mkp t]
                  | punctuation (T.head cs) = if null t then go [] (T.tail cs)
                                                else mkp t : go [] (T.tail cs)
                  | otherwise               = go (T.head cs:t) (T.tail cs)
          mkp = T.dropWhile (== ' ') . T.pack . reverse
          punctuation c = isPunctuation c && c `notElem` nopunc

  -------------------------------------------------------------------------
  -- Adding words to a keyword until a stopword is found
  -------------------------------------------------------------------------
  kfinder :: StopwordsMap -> NoSplit -> NoList -> Text -> [[Text]]
  kfinder m nosplit nl = go [] . wSplitter nosplit . T.toLower
    where go [] [] = []
          go t  [] = [mkk t]
          go t (w:ws) | stopword m nl w = if null t then         go [] ws
                                                    else mkk t : go [] ws
                      | otherwise       = go (w:t) ws
          mkk = reverse

  -------------------------------------------------------------------------
  -- Map of Text and frequency,degree
  -------------------------------------------------------------------------
  type ScoreMap  = Map Text (Double,Double)

  -------------------------------------------------------------------------
  -- To calculate the scores we map 'kwScore' on all phrases
  -------------------------------------------------------------------------
  kwScores :: [[Text]] -> [WordScore]
  kwScores s = map (kwScore $ wordScores s) s

  -------------------------------------------------------------------------
  -- The keyword score is the sum of the individual scores 
  -- of all words contained in the keyword.
  -- The score per word is computed as (d+f)/f.
  -------------------------------------------------------------------------
  kwScore :: ScoreMap -> [Text] -> WordScore
  kwScore m s = let ws = wFilter 0 s 
                 in (conc s,sum $ map findScore ws)
    where conc = T.intercalate space
          findScore w = case M.lookup w m of
                          Nothing    -> 0
                          Just (f,d) -> (d+f) / f
    
  -------------------------------------------------------------------------
  -- We compute the word score folding 'M.insert' on all keywords
  -- and computing f as f+1 for each instance of the word
  --     and       d as d+d for each instance of the word
  -------------------------------------------------------------------------
  wordScores :: [[Text]] -> ScoreMap 
  wordScores = foldl' wordScore M.empty 

  -------------------------------------------------------------------------
  -- Computing the keyword score as the number of words in the keyword.
  -- The addition of frequency (f+1) and degree (d+d) is folded on
  -- the table of all keywords.
  -------------------------------------------------------------------------
  wordScore :: ScoreMap -> [Text] -> ScoreMap
  wordScore sm s = let ws = wFilter 0 s 
                       d  = fromIntegral (length ws-1)
                    in foldl' (score d) sm ws 
    where score d m x  = M.insertWith add x (1,d) m
          add (_,d1) (f2,d2) = (f2+1,d1+d2) 
  
  -------------------------------------------------------------------------
  -- Filter words that may appear as part of keywords,
  -- but do not enter the score calculation
  -------------------------------------------------------------------------
  wFilter :: Int -> [Text] -> [Text]
  wFilter m = filter flt
    where flt w = T.compareLength w m == GT &&
                  not (T.null w) &&
                  not (numeric w)

  -------------------------------------------------------------------------
  -- Simple definition of what is a number.
  -- There may be better definitions, though.
  -------------------------------------------------------------------------
  numeric :: Text -> Bool
  numeric s | T.null s          = False
            | not (hasDigits s) = False
            | otherwise = let h = T.head s
                           in (isDigit h || (h == '-' && hasDigits s)) && 
                              pnumeric (T.tail s)
    where pnumeric cs | T.null cs = True
                      | otherwise = 
                        let h = T.head cs 
                         in (isDigit h || h == '.' || h == ',') && 
                            pnumeric (T.tail cs)
          hasDigits cs | T.null cs           = False
                       | isDigit (T.head cs) = True
                       | otherwise           = hasDigits (T.tail cs)