| Safe Haskell | None | 
|---|---|
| Language | Haskell2010 | 
NLP.Punkt
- data OrthoFreq = OrthoFreq {}
 - data PunktData = PunktData {
- type_count :: HashMap Text Int
 - ortho_count :: HashMap Text OrthoFreq
 - collocations :: HashMap (Text, Text) Int
 - total_enders :: Int
 - total_toks :: Int
 
 - data Entity a
 - data Token = Token {}
 - type Punkt = Reader PunktData
 - norm :: Text -> Text
 - is_initial :: Token -> Bool
 - is_word :: Token -> Bool
 - strunk_log :: Double -> Double -> Double -> Double -> Double
 - dunning_log :: Double -> Double -> Double -> Double -> Double
 - ask_type_count :: Punkt (HashMap Text Int)
 - ask_total_toks :: Num a => Punkt a
 - ask_total_enders :: Num a => Punkt a
 - ask_ortho :: Text -> Punkt OrthoFreq
 - ask_colloc :: Text -> Text -> Punkt Double
 - freq :: Text -> Punkt Double
 - freq_snoc_dot :: Text -> Punkt Double
 - freq_type :: Text -> Punkt Double
 - dlen :: Text -> Double
 - prob_abbr :: Text -> Punkt Double
 - decide_ortho :: Text -> Punkt (Maybe Bool)
 - decide_initial_ortho :: Text -> Punkt (Maybe Bool)
 - prob_starter :: Text -> Punkt Double
 - prob_colloc :: Text -> Text -> Punkt Double
 - build_type_count :: [Token] -> HashMap Text Int
 - build_ortho_count :: [Token] -> HashMap Text OrthoFreq
 - build_collocs :: [Token] -> HashMap (Text, Text) Int
 - to_tokens :: Text -> [Token]
 - build_punkt_data :: [Token] -> PunktData
 - classify_by_type :: Token -> Punkt Token
 - classify_by_next :: Token -> Token -> Punkt Token
 - classify_punkt :: Text -> [Token]
 - find_breaks :: Text -> [(Int, Int)]
 - substring :: Text -> Int -> Int -> Text
 - match_spaces :: Text -> Maybe (Int, Int)
 - split_sentences :: Text -> [Text]
 - runPunkt :: PunktData -> Punkt a -> a
 
Documentation
Carries various orthographic statistics for a particular textual type.
Constructors
| OrthoFreq | |
Fields 
  | |
Represents training data obtained from a corpus required by Punkt.
Constructors
| PunktData | |
Fields 
  | |
Constructors
| Token | |
is_initial :: Token -> Bool Source
strunk_log :: Double -> Double -> Double -> Double -> Double Source
Dunning log likelihood modified by Kiss/Strunk
dunning_log :: Double -> Double -> Double -> Double -> Double Source
Dunning's original log likelihood
ask_total_toks :: Num a => Punkt a Source
ask_total_enders :: Num a => Punkt a Source
freq :: Text -> Punkt Double Source
Occurrences of a textual type, strictly ignoring trailing period.
 c(w, ~.). Case-insensitive.
freq_snoc_dot :: Text -> Punkt Double Source
Occurrences of a textual type with trailing period. c(w, .).
 Case-insensitive.
prob_abbr :: Text -> Punkt Double Source
Returns the log likelihood that (w_ snoc .) is an abbreviation.
 Case-insensitive.
decide_ortho :: Text -> Punkt (Maybe Bool) Source
Decides if w is a sentence ender based on its capitalization.
 Case-insensitive.
decide_initial_ortho :: Text -> Punkt (Maybe Bool) Source
Special orthographic heuristic for post-possible-initial tokens. Case-insensitive.
prob_starter :: Text -> Punkt Double Source
Log likelihood that w is a frequent sentence starter. Case-insensitive.
prob_colloc :: Text -> Text -> Punkt Double Source
Computes the collocational likelihood of w and x. Case-insensitive.
build_type_count :: [Token] -> HashMap Text Int Source
Builds a dictionary of textual type frequencies from a stream of tokens.
build_punkt_data :: [Token] -> PunktData Source
classify_by_type :: Token -> Punkt Token Source
classify_punkt :: Text -> [Token] Source
find_breaks :: Text -> [(Int, Int)] Source
split_sentences :: Text -> [Text] Source
Main export of the entire package. Splits a corpus into its constituent sentences.
runPunkt :: PunktData -> Punkt a -> a Source
runPunkt data computation runs computation using data collected from
 a corpus using build_punkt_data.