-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Simple tokenizer for English text.
--
@package tokenize
@version 0.2.2


-- | NLP Tokenizer, adapted to use Text instead of Strings from the
--   tokenize package.
module NLP.Tokenize.Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b
E :: [Either a b] -> EitherList a b
unE :: EitherList a b -> [Either a b]

-- | A Tokenizer is a function which takes a Text and returns a list of
--   Eithers (wrapped in a newtype). Right Texts will be passed on for
--   processing to tokenizers down the pipeline. Left Texts will be passed
--   through the pipeline unchanged. Use a Left Text in a tokenizer to
--   protect certain tokens from further processing (e.g. see the
--   uris tokenizer). You can define your own custom tokenizer
--   pipelines by chaining tokenizers together with Kleisli composition
--   (>=> from Control.Monad), for example: whitespace >=> allPunctuation
type Tokenizer = Text -> EitherList Text Text

-- | Split string into words using the default tokenizer pipeline
tokenize :: Text -> [Text]

-- | Run a tokenizer
run :: Tokenizer -> (Text -> [Text])
defaultTokenizer :: Tokenizer

-- | Split string on whitespace. This is just a wrapper for Data.List.words
whitespace :: Tokenizer

-- | Detect common uris and freeze them
uris :: Tokenizer

-- | Split off initial and final punctuation
punctuation :: Tokenizer

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer

-- | Split tokens on transitions between punctuation and non-punctuation
--   characters. This tokenizer is not included in defaultTokenizer
--   pipeline because dealing with word-internal punctuation is quite
--   application specific.
allPunctuation :: Tokenizer

-- | Split common contractions off and freeze them.
--   Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer

-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
instance Monad (EitherList a)


module NLP.Tokenize.String

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b
E :: [Either a b] -> EitherList a b
unE :: EitherList a b -> [Either a b]

-- | A Tokenizer is a function which takes a String and returns a list of
--   Eithers (wrapped in a newtype). Right Strings will be passed on for
--   processing to tokenizers down the pipeline. Left Strings will be
--   passed through the pipeline unchanged. Use a Left String in a
--   tokenizer to protect certain tokens from further processing (e.g. see
--   the uris tokenizer). You can define your own custom tokenizer
--   pipelines by chaining tokenizers together with Kleisli composition
--   (>=> from Control.Monad), for example: whitespace >=> allPunctuation
type Tokenizer = String -> EitherList String String

-- | Split string into words using the default tokenizer pipeline
tokenize :: String -> [String]

-- | Run a tokenizer
run :: Tokenizer -> (String -> [String])
defaultTokenizer :: Tokenizer

-- | Split string on whitespace. This is just a wrapper for Data.List.words
whitespace :: Tokenizer

-- | Detect common uris and freeze them
uris :: Tokenizer

-- | Split off initial and final punctuation
punctuation :: Tokenizer

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer

-- | Split tokens on transitions between punctuation and non-punctuation
--   characters. This tokenizer is not included in defaultTokenizer
--   pipeline because dealing with word-internal punctuation is quite
--   application specific.
allPunctuation :: Tokenizer

-- | Split common contractions off and freeze them.
--   Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer

-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
instance Monad (EitherList a)


-- | NLP Tokenizer
module NLP.Tokenize