| Safe Haskell | None |
|---|---|
| Language | Haskell2010 |
NLP.Tokenize.Annotations
- tokenize :: Text -> TokenizedSentence
- defaultTokenizer :: RawToken -> [RawToken]
- runTokenizer :: (RawToken -> [RawToken]) -> Text -> TokenizedSentence
- protectTerms :: [Text] -> CaseSensitive -> RawToken -> [RawToken]
- whitespace :: RawToken -> [RawToken]
- uris :: RawToken -> [RawToken]
- punctuation :: RawToken -> [RawToken]
- contractions :: RawToken -> [RawToken]
- tokenizeOn :: (Char -> Bool) -> RawToken -> [RawToken]
Documentation
tokenize :: Text -> TokenizedSentence
defaultTokenizer :: RawToken -> [RawToken]
runTokenizer :: (RawToken -> [RawToken]) -> Text -> TokenizedSentence
protectTerms :: [Text] -> CaseSensitive -> RawToken -> [RawToken]
Create a tokenizer that protects the provided terms (to tokenize multi-word terms)
whitespace :: RawToken -> [RawToken]
Tokenize on whitespace, as defined by '\ch -> Char.isSeparator ch || Char.isSpace ch'
punctuation :: RawToken -> [RawToken]
contractions :: RawToken -> [RawToken]
Split common contractions off and freeze them. Currently handles: 'm, 's, 'd, 've, 'll, and negations (n't).
tokenizeOn :: (Char -> Bool) -> RawToken -> [RawToken]
Tokenize on characters that satisfy the provided predicate.