Safe Haskell	None
Language	Haskell2010

NLP.Punkt

Synopsis

Documentation

data OrthoFreq Source

Carries various orthographic statistics for a particular textual type.

Constructors

OrthoFreq
Fields freq_lower :: Int number of lowercase occurrences freq_upper :: Int number of uppercase occurrences freq_first_lower :: Int number of lowercase occurrences in the first position of a sentence freq_internal_upper :: Int number of uppercase occurrences strictly internal to a sentence freq_after_ender :: Int number of occurrences in the first position

Instances

Show OrthoFreq

data PunktData Source

Represents training data obtained from a corpus required by Punkt.

Constructors

PunktData
Fields type_count :: HashMap Text Int Occurrences of each textual type, case-insensitive. Used during Punkt's type-based stage. Also contains occurrences of trailing periods. ortho_count :: HashMap Text OrthoFreq Dictionary of orthographic data for each textual type. collocations :: HashMap (Text, Text) Int total_enders :: Int total_toks :: Int

Instances

Show PunktData

data Entity a Source

Constructors

Word a Bool
Punct a
ParaStart
Ellipsis
Dash

Instances

Eq a => Eq (Entity a)
Show a => Show (Entity a)

data Token Source

Constructors

Token
Fields offset :: Int toklen :: Int entity :: Entity Text sentend :: Bool abbrev :: Bool

Instances

Show Token

type Punkt = Reader PunktData Source

norm :: Text -> Text Source

is_initial :: Token -> Bool Source

is_word :: Token -> Bool Source

strunk_log :: Double -> Double -> Double -> Double -> Double Source

Dunning log likelihood modified by Kiss/Strunk

dunning_log :: Double -> Double -> Double -> Double -> Double Source

Dunning's original log likelihood

ask_type_count :: Punkt (HashMap Text Int) Source

ask_total_toks :: Num a => Punkt a Source

ask_total_enders :: Num a => Punkt a Source

ask_ortho :: Text -> Punkt OrthoFreq Source

ask_colloc :: Text -> Text -> Punkt Double Source

freq :: Text -> Punkt Double Source

Occurrences of a textual type, strictly ignoring trailing period. c(w, ~.). Case-insensitive.

freq_snoc_dot :: Text -> Punkt Double Source

Occurrences of a textual type with trailing period. c(w, .). Case-insensitive.

freq_type :: Text -> Punkt Double Source

c(w) == c(w, .) + c(w, ~.). Case-insensitive.

dlen :: Text -> Double Source

prob_abbr :: Text -> Punkt Double Source

Returns the log likelihood that w followed by a trailing period (w `snoc` '.') is an abbreviation. Case-insensitive.

decide_ortho :: Text -> Punkt (Maybe Bool) Source

Decides if w is a sentence ender based on its capitalization. Case-insensitive.

decide_initial_ortho :: Text -> Punkt (Maybe Bool) Source

Special orthographic heuristic for post-possible-initial tokens. Case-insensitive.

prob_starter :: Text -> Punkt Double Source

Log likelihood that w is a frequent sentence starter. Case-insensitive.

prob_colloc :: Text -> Text -> Punkt Double Source

Computes the collocational likelihood of w and x. Case-insensitive.

build_type_count :: [Token] -> HashMap Text Int Source

Builds a dictionary of textual type frequencies from a stream of tokens.

build_ortho_count :: [Token] -> HashMap Text OrthoFreq Source

build_collocs :: [Token] -> HashMap (Text, Text) Int Source

to_tokens :: Text -> [Token] Source

build_punkt_data :: [Token] -> PunktData Source

classify_by_type :: Token -> Punkt Token Source

classify_by_next :: Token -> Token -> Punkt Token Source

classify_punkt :: Text -> [Token] Source

find_breaks :: Text -> [(Int, Int)] Source

substring :: Text -> Int -> Int -> Text Source

match_spaces :: Text -> Maybe (Int, Int) Source

split_sentences :: Text -> [Text] Source

Main export of the entire package. Splits a corpus into its constituent sentences.

runPunkt :: PunktData -> Punkt a -> a Source

`runPunkt data computation` runs `computation` using `data` collected from a corpus by `build_punkt_data`.