-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Rapid Automatic Keyword Extraction (RAKE)
--   
@package rake
@version 0.0.1


-- | The RAKE Text interface. (Currently the only one...)
module NLP.RAKE.Text

-- | The result is a keyword candidate, a keyword consisting of one or more
--   words and a score associated with this keyword.
type WordScore = (Text, Double)

-- | This interface provides the most flexibility. It expects a <a>Map</a>
--   of stop words, a <i>nosplit</i> list used by the word splitter, an
--   additional list of words or symbols you want to exclude for a specific
--   document and a text split into phrases. Users may pass in their own
--   stop word list (e.g. by loading it from a file, see
--   <a>loadStopWords</a>) or one of the predefined lists
--   (<a>smartStoplist</a>, <a>foxStoplist</a>).
candidates :: StopwordsMap -> NoSplit -> NoList -> [Text] -> [WordScore]

-- | The <a>keywords</a> function is a convenience interface that makes a
--   couple of decisions internally: it uses the <a>defaultStoplist</a>,
--   the English-language <i>nosplit</i> list, the default <i>nolist</i>
--   and it splits the text into phrases using the <a>pSplitter</a>.
--   
--   The function is equivalent to
--   
--   <pre>
--   candidates defaultStoplist defaultNosplit defaultNolist . pSplitter
--   </pre>
keywords :: Text -> [WordScore]

-- | Sort the <a>WordScore</a> list by scores (descending!)
sortByScore :: [WordScore] -> [WordScore]

-- | Sort the <a>WordScore</a> list by words (ascending!)
sortByWord :: [WordScore] -> [WordScore]

-- | Default phrase splitter. It splits phrases at characters in the
--   punctuation category (those for which <a>isPunctuation</a> is
--   <a>True</a>) with the exception of '-'.
pSplitter :: Text -> [Text]

-- | List containing characters at which we do not split words. This list
--   is language dependent.
type NoSplit = String

-- | The default list is for English and considers only ASCII characters,
--   the numbers 0..9 and some other symbols.
--   
--   There are resources for other languages, but they need review and
--   contribution!
defaultNosplit :: NoSplit

-- | <i>Nosplit</i> list: ASCII characters.
enNosplit :: NoSplit

-- | <i>Nosplit</i> list: digits.
numNosplit :: NoSplit

-- | <i>Nosplit</i> list: some more symbols ("+-/").
othNosplit :: NoSplit

-- | Latin1
latin1Nosplit :: NoSplit

-- | Latin Extended-A
latinExAnosplit :: NoSplit

-- | Latin Extended-B
latinExBnosplit :: NoSplit

-- | Greek and Coptic (needs revision)
greekNosplit :: NoSplit

-- | Cyrillic (needs revision)
cyrillicNosplit :: NoSplit

-- | Search tree for stop words
type StopwordsMap = Map Text ()

-- | Make <a>StopwordsMap</a> starting from a list of stop words encoded as
--   <a>Text</a>
mkStopwords :: [Text] -> StopwordsMap

-- | Make <a>StopwordsMap</a> starting from a list of stop words encoded as
--   <a>String</a>
mkStopwordsStr :: [String] -> StopwordsMap

-- | Load a stop word list from a file.
loadStopWords :: FilePath -> IO StopwordsMap

-- | Search for a chunk of <a>Text</a> in the <a>StopwordsMap</a>. Note
--   that, even if a word or symbol does not appear in the stop word list,
--   it may still be on the <i>nolist</i> and then still counts as a stop
--   word (e.g. "-").
stopword :: StopwordsMap -> NoList -> Text -> Bool

-- | The default stop word list (<a>smartStoplist</a>).
defaultStoplist :: StopwordsMap

-- | The "smart" stop word list
smartStoplist :: StopwordsMap

-- | The "Fox" stop word list
foxStoplist :: StopwordsMap

-- | The <i>nolist</i>: Symbols in this list count as stop words
--   independently from the chosen stop word list. This list can be used to
--   exclude very specific "words" that may occur in a given domain like,
--   for instance, mathematical formulas and symbols.
type NoList = [Text]

-- | Currently, the default <i>nolist</i> contains only the symbol "-".
defaultNolist :: NoList