-- Hoogle documentation, generated by Haddock
-- See Hoogle, http://www.haskell.org/hoogle/


-- | Simple tokenizer for English text.
--   
@package tokenize
@version 0.2.2


-- | NLP Tokenizer, adapted to use Text instead of Strings from the
--   <a>tokenize</a> package.
module NLP.Tokenize.Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b
E :: [Either a b] -> EitherList a b
unE :: EitherList a b -> [Either a b]

-- | A Tokenizer is a function which takes a Text and returns a list of
--   Eithers (wrapped in a newtype). Right Texts will be passed on for
--   processing to tokenizers down the pipeline. Left Texts will be passed
--   through the pipeline unchanged. Use a Left Text in a tokenizer to
--   protect certain tokens from further processing (e.g. see the
--   <a>uris</a> tokenizer). You can define your own custom tokenizer
--   pipelines by chaining tokenizers together with monadic (Kleisli)
--   composition, e.g. whitespace >=> punctuation.
type Tokenizer = Text -> EitherList Text Text

-- | Split string into words using the default tokenizer pipeline
tokenize :: Text -> [Text]

-- | Run a tokenizer
run :: Tokenizer -> (Text -> [Text])
defaultTokenizer :: Tokenizer

-- | Split string on whitespace. This is just a wrapper for Data.List.words
whitespace :: Tokenizer

-- | Detect common uris and freeze them
uris :: Tokenizer

-- | Split off initial and final punctuation
punctuation :: Tokenizer

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer

-- | Split tokens on transitions between punctuation and non-punctuation
--   characters. This tokenizer is not included in defaultTokenizer
--   pipeline because dealing with word-internal punctuation is quite
--   application specific.
allPunctuation :: Tokenizer

-- | Split common contractions off and freeze them. Currently deals with:
--   'm, 's, 'd, 've, 'll
contractions :: Tokenizer

-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
instance Monad (EitherList a)

module NLP.Tokenize.String

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b
E :: [Either a b] -> EitherList a b
unE :: EitherList a b -> [Either a b]

-- | A Tokenizer is a function which takes a String and returns a list of
--   Eithers (wrapped in a newtype). Right Strings will be passed on for
--   processing to tokenizers down the pipeline. Left Strings will be
--   passed through the pipeline unchanged. Use a Left String in a
--   tokenizer to protect certain tokens from further processing (e.g. see
--   the <a>uris</a> tokenizer). You can define your own custom tokenizer
--   pipelines by chaining tokenizers together with monadic (Kleisli)
--   composition, e.g. whitespace >=> punctuation.
type Tokenizer = String -> EitherList String String

-- | Split string into words using the default tokenizer pipeline
tokenize :: String -> [String]

-- | Run a tokenizer
run :: Tokenizer -> (String -> [String])
defaultTokenizer :: Tokenizer

-- | Split string on whitespace. This is just a wrapper for Data.List.words
whitespace :: Tokenizer

-- | Detect common uris and freeze them
uris :: Tokenizer

-- | Split off initial and final punctuation
punctuation :: Tokenizer

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer

-- | Split tokens on transitions between punctuation and non-punctuation
--   characters. This tokenizer is not included in defaultTokenizer
--   pipeline because dealing with word-internal punctuation is quite
--   application specific.
allPunctuation :: Tokenizer

-- | Split common contractions off and freeze them. Currently deals with:
--   'm, 's, 'd, 've, 'll
contractions :: Tokenizer

-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
instance Monad (EitherList a)


-- | NLP Tokenizer
module NLP.Tokenize