-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Simple tokenizer for English text.
--
@package tokenize
@version 0.2.2
-- | NLP Tokenizer, adapted to use Text instead of Strings from the
-- tokenize package.
module NLP.Tokenize.Text
-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b
E :: [Either a b] -> EitherList a b
unE :: EitherList a b -> [Either a b]
-- | A Tokenizer is a function which takes a Text and returns a list of
-- Eithers (wrapped in a newtype). Right Texts will be passed on for
-- processing to tokenizers down the pipeline. Left Texts will be passed
-- through the pipeline unchanged. Use a Left Text in a tokenizer to
-- protect certain tokens from further processing (e.g. see the
-- uris tokenizer). You can define your own custom tokenizer
-- pipelines by chaining tokenizers together, e.g. with Kleisli
-- composition (>=>) from Control.Monad, since EitherList is a Monad.
type Tokenizer = Text -> EitherList Text Text
-- | Split string into words using the default tokenizer pipeline
tokenize :: Text -> [Text]
-- | Run a tokenizer
run :: Tokenizer -> (Text -> [Text])
defaultTokenizer :: Tokenizer
-- | Split text on whitespace. This is just a wrapper for words (the Text
-- counterpart of Data.List.words)
whitespace :: Tokenizer
-- | Detect common uris and freeze them
uris :: Tokenizer
-- | Split off initial and final punctuation
punctuation :: Tokenizer
-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
-- | Split tokens on transitions between punctuation and non-punctuation
-- characters. This tokenizer is not included in defaultTokenizer
-- pipeline because dealing with word-internal punctuation is quite
-- application specific.
allPunctuation :: Tokenizer
-- | Split common contractions off and freeze them. Currently deals with:
-- 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
instance Monad (EitherList a)
module NLP.Tokenize.String
-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b
E :: [Either a b] -> EitherList a b
unE :: EitherList a b -> [Either a b]
-- | A Tokenizer is a function which takes a String and returns a list of
-- Eithers (wrapped in a newtype). Right Strings will be passed on for
-- processing to tokenizers down the pipeline. Left Strings will be
-- passed through the pipeline unchanged. Use a Left String in a
-- tokenizer to protect certain tokens from further processing (e.g. see
-- the uris tokenizer). You can define your own custom tokenizer
-- pipelines by chaining tokenizers together, e.g. with Kleisli
-- composition (>=>) from Control.Monad, since EitherList is a Monad.
type Tokenizer = String -> EitherList String String
-- | Split string into words using the default tokenizer pipeline
tokenize :: String -> [String]
-- | Run a tokenizer
run :: Tokenizer -> (String -> [String])
defaultTokenizer :: Tokenizer
-- | Split string on whitespace. This is just a wrapper for Data.List.words
whitespace :: Tokenizer
-- | Detect common uris and freeze them
uris :: Tokenizer
-- | Split off initial and final punctuation
punctuation :: Tokenizer
-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
-- | Split tokens on transitions between punctuation and non-punctuation
-- characters. This tokenizer is not included in defaultTokenizer
-- pipeline because dealing with word-internal punctuation is quite
-- application specific.
allPunctuation :: Tokenizer
-- | Split common contractions off and freeze them. Currently deals with:
-- 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
instance Monad (EitherList a)
-- | NLP Tokenizer
module NLP.Tokenize