-- Hoogle documentation, generated by Haddock -- See Hoogle, http://www.haskell.org/hoogle/ -- | Rapid Automatic Keyword Extraction (RAKE) -- @package rake @version 0.0.1 -- | The RAKE Text interface. (Currently the only one...) module NLP.RAKE.Text -- | The result is a keyword candidate, a keyword consisting of one or more -- words and a score associated with this keyword. type WordScore = (Text, Double) -- | This interface provides the most flexibility. It expects a Map of -- stop words, a nosplit list used by the word splitter, an -- additional list of words or symbols you want to exclude for a specific -- document and a text split into phrases. Users may pass in their own -- stop word list (e.g. by loading it from a file, see -- loadStopWords) or one of the predefined lists -- (smartStoplist, foxStoplist). candidates :: StopwordsMap -> NoSplit -> NoList -> [Text] -> [WordScore] -- | The keywords function is a convenience interface that makes a -- couple of decisions internally: it uses the defaultStoplist, -- the English language nosplit list, the default nolist -- and it splits the text into phrases using the pSplitter. -- -- The function is equivalent to -- --
--   candidates defaultStoplist defaultNosplit defaultNolist . pSplitter
--   
keywords :: Text -> [WordScore] -- | Sort the WordScore list by scores (descending!) sortByScore :: [WordScore] -> [WordScore] -- | Sort the WordScore list by words (ascending!) sortByWord :: [WordScore] -> [WordScore] -- | Default phrase splitter. It splits phrases at characters in the -- punctuation category (those for which isPunctuation is -- True) with the exception of '-'. pSplitter :: Text -> [Text] -- | List containing characters at which we do not split words. This list -- is language dependent. type NoSplit = String -- | The default list is for English and only considers ASCII -- characters, the numbers 0..9 and some other symbols. -- -- There are resources for other languages, but they need review and -- contribution! defaultNosplit :: NoSplit -- | ASCII characters, enNosplit :: NoSplit -- | digits numNosplit :: NoSplit -- | and some more symbols ("+-/") othNosplit :: NoSplit -- | Latin1 latin1Nosplit :: NoSplit -- | Latin1 extended-A latinExAnosplit :: NoSplit -- | Latin1 extended-B latinExBnosplit :: NoSplit -- | Greek and Coptic (needs revision) greekNosplit :: NoSplit -- | Cyrillic (needs revision) cyrillicNosplit :: NoSplit -- | Search tree for stop words type StopwordsMap = Map Text () -- | Make StopwordsMap starting from a list of stop words encoded as -- Text mkStopwords :: [Text] -> StopwordsMap -- | Make StopwordsMap starting from a list of stop words encoded as -- String mkStopwordsStr :: [String] -> StopwordsMap -- | Load a stop word list from a file. loadStopWords :: FilePath -> IO StopwordsMap -- | Search for a chunk of Text in the StopwordsMap. Note -- that, if a word or symbol does not appear in the stop word list, it -- may still be on the nolist and then still counts as a stop -- word (e.g. "-"). stopword :: StopwordsMap -> NoList -> Text -> Bool -- | The default stop word list (smartStoplist). 
defaultStoplist :: StopwordsMap -- | The "smart" stop word list smartStoplist :: StopwordsMap -- | The "Fox" stop word list foxStoplist :: StopwordsMap -- | The nolist: Symbols in this list count as stop words -- independently from the chosen stop word list. This list can be used to -- exclude very specific "words" that may occur in a given domain like, -- for instance, mathematical formulas and symbols. type NoList = [Text] -- | Currently, the default nolist contains only the symbol "-". defaultNolist :: NoList