-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/
-- | Simple tokenizer for English text.
--
-- Simple tokenizer for English text.
@package tokenize
@version 0.1.0
module NLP.Tokenize
-- | An 'EitherList' is a newtype wrapper around a list of 'Either'
-- values.
newtype EitherList a b
-- | Wrap a list of 'Either' values as an 'EitherList'.
E :: [Either a b] -> EitherList a b
-- | Unwrap an 'EitherList' into the underlying list of 'Either' values.
unE :: EitherList a b -> [Either a b]
-- | A 'Tokenizer' is a function which takes a String and returns a list
-- of Eithers (wrapped in the 'EitherList' newtype). Right Strings will
-- be passed on for processing to tokenizers down the pipeline. Left
-- Strings will be passed through the pipeline unchanged. Use a Left
-- String in a tokenizer to protect certain tokens from further
-- processing (e.g. see the 'uris' tokenizer).
type Tokenizer = String -> EitherList String String
-- | Split a string into words using the default tokenizer pipeline.
tokenize :: String -> [String]
-- | Run a tokenizer: turn a 'Tokenizer' into a plain function from an
-- input String to the resulting list of Strings.
run :: Tokenizer -> (String -> [String])
-- | Split a string on whitespace. This is just a wrapper for 'words'.
whitespace :: Tokenizer
-- | Detect common URIs and freeze them, i.e. protect them from further
-- processing by tokenizers down the pipeline.
uris :: Tokenizer
-- | Split off initial and final punctuation.
punctuation :: Tokenizer
-- | Split off word-final punctuation.
finalPunctuation :: Tokenizer
-- | Split off word-initial punctuation.
initialPunctuation :: Tokenizer
-- | Split words ending in @n't@, and freeze the @n't@.
negatives :: Tokenizer
-- | 'EitherList' @a@ is a 'Monad' over its second type parameter, i.e.
-- the type carried by the Right values.
instance Monad (EitherList a)