module NLP.Tokenize ( EitherList(..) , Tokenizer , tokenize , run , whitespace , uris , punctuation , finalPunctuation , initialPunctuation , negatives ) where import qualified Data.Char as Char import Data.List import Control.Monad.Instances import Data.List.Split import Control.Monad -- | A Tokenizer is function which takes a list and returns a list of Eithers -- (wrapped in a newtype). Right Strings will be passed on for processing -- to tokenizers down -- the pipeline. Left Strings will be passed through the pipeline unchanged. -- Use a Left String in a tokenizer to protect certain tokens from further -- processing (e.g. see the 'uris' tokenizer). type Tokenizer = String -> EitherList String String -- | The EitherList is a newtype-wrapped list of Eithers. newtype EitherList a b = E { unE :: [Either a b] } -- | Split string into words using the default tokenizer pipeline tokenize :: String -> [String] tokenize = run (whitespace >=> uris >=> punctuation >=> negatives) -- | Run a tokenizer run :: Tokenizer -> (String -> [String]) run f = map unwrap . unE . f -- | Detect common uris and freeze them uris :: Tokenizer uris x | isUri x = E [Left x] | True = E [Right x] where isUri x = any (`isPrefixOf` x) ["http://","ftp://","mailto:"] -- | Split off initial and final punctuation punctuation :: Tokenizer punctuation = finalPunctuation >=> initialPunctuation -- | Split off word-final punctuation finalPunctuation :: Tokenizer finalPunctuation x = E $ case span Char.isPunctuation . reverse $ x of ([],w) -> [Right . reverse $ w] (ps,w) -> [Right . reverse $ w, Right . reverse $ ps] -- | Split off word-initial punctuation initialPunctuation :: Tokenizer initialPunctuation x = E $ case span Char.isPunctuation$ x of ([],w) -> [Right w] (ps,w) -> [Right ps, Right w] -- | Split words ending in "n't", and freeze "n't" negatives :: Tokenizer negatives x | "n't" `isSuffixOf` x = E [ Right . reverse . drop 3 . reverse $ x , Left "n't" ] | True = E [Right x] -- | Split string on whitespace. This is just a wrapper for 'words' whitespace :: Tokenizer whitespace xs = E [Right w | w <- words xs ] instance Monad (EitherList a) where return x = E [Right x] E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs unwrap (Left x) = x unwrap (Right x) = x examples = ["This shouldn't happen." ,"Some 'quoted' struff" ,"This is a URL: http://example.org." ,"How about an email@example.com"]