Safe Haskell | None |
---|---|
Language | Haskell2010 |
- data OrthoFreq = OrthoFreq {}
- data PunktData = PunktData {
- type_count :: HashMap Text Int
- ortho_count :: HashMap Text OrthoFreq
- collocations :: HashMap (Text, Text) Int
- total_enders :: Int
- total_toks :: Int
- data Entity a
- data Token = Token {}
- type Punkt = Reader PunktData
- norm :: Text -> Text
- is_initial :: Token -> Bool
- is_word :: Token -> Bool
- strunk_log :: Double -> Double -> Double -> Double -> Double
- dunning_log :: Double -> Double -> Double -> Double -> Double
- ask_type_count :: Punkt (HashMap Text Int)
- ask_total_toks :: Num a => Punkt a
- ask_total_enders :: Num a => Punkt a
- ask_ortho :: Text -> Punkt OrthoFreq
- ask_colloc :: Text -> Text -> Punkt Double
- freq :: Text -> Punkt Double
- freq_snoc_dot :: Text -> Punkt Double
- freq_type :: Text -> Punkt Double
- dlen :: Text -> Double
- prob_abbr :: Text -> Punkt Double
- decide_ortho :: Text -> Punkt (Maybe Bool)
- decide_initial_ortho :: Text -> Punkt (Maybe Bool)
- prob_starter :: Text -> Punkt Double
- prob_colloc :: Text -> Text -> Punkt Double
- build_type_count :: [Token] -> HashMap Text Int
- build_ortho_count :: [Token] -> HashMap Text OrthoFreq
- build_collocs :: [Token] -> HashMap (Text, Text) Int
- to_tokens :: Text -> [Token]
- build_punkt_data :: [Token] -> PunktData
- classify_by_type :: Token -> Punkt Token
- classify_by_next :: Token -> Token -> Punkt Token
- classify_punkt :: Text -> [Token]
- find_breaks :: Text -> [(Int, Int)]
- substring :: Text -> Int -> Int -> Text
- match_spaces :: Text -> Maybe (Int, Int)
- split_sentences :: Text -> [Text]
- runPunkt :: PunktData -> Punkt a -> a
Documentation
Carries various orthographic statistics for a particular textual type.
OrthoFreq | |
|
Represents training data obtained from a corpus required by Punkt.
PunktData | |
|
is_initial :: Token -> Bool Source
strunk_log :: Double -> Double -> Double -> Double -> Double Source
Dunning log likelihood modified by Kiss/Strunk
dunning_log :: Double -> Double -> Double -> Double -> Double Source
Dunning's original log likelihood
ask_total_toks :: Num a => Punkt a Source
ask_total_enders :: Num a => Punkt a Source
freq :: Text -> Punkt Double Source
Occurrences of a textual type, strictly ignoring trailing period.
c(w, ~.)
. Case-insensitive.
freq_snoc_dot :: Text -> Punkt Double Source
Occurrences of a textual type with trailing period. c(w, .)
.
Case-insensitive.
prob_abbr :: Text -> Punkt Double Source
Returns the log likelihood that (w_ snoc
.
) is an abbreviation.
Case-insensitive.
decide_ortho :: Text -> Punkt (Maybe Bool) Source
Decides if w
is a sentence ender based on its capitalization.
Case-insensitive.
decide_initial_ortho :: Text -> Punkt (Maybe Bool) Source
Special orthographic heuristic for post-possible-initial tokens. Case-insensitive.
prob_starter :: Text -> Punkt Double Source
Log likelihood that w
is a frequent sentence starter. Case-insensitive.
prob_colloc :: Text -> Text -> Punkt Double Source
Computes the collocational likelihood of w
and x
. Case-insensitive.
build_type_count :: [Token] -> HashMap Text Int Source
Builds a dictionary of textual type frequencies from a stream of tokens.
build_punkt_data :: [Token] -> PunktData Source
classify_by_type :: Token -> Punkt Token Source
classify_punkt :: Text -> [Token] Source
find_breaks :: Text -> [(Int, Int)] Source
split_sentences :: Text -> [Text] Source
Main export of the entire package. Splits a corpus into its constituent sentences.
runPunkt :: PunktData -> Punkt a -> a Source
runPunkt data computation
runs computation
using data
collected from
a corpus using build_punkt_data
.