module NLP.Tokenize
( EitherList(..)
, Tokenizer
, tokenize
, run
, whitespace
, uris
, punctuation
, finalPunctuation
, initialPunctuation
, negatives
)
where
import qualified Data.Char as Char
import Data.List
import Control.Monad.Instances
import Data.List.Split
import Control.Monad
type Tokenizer = String -> EitherList String String
newtype EitherList a b = E { unE :: [Either a b] }
tokenize :: String -> [String]
tokenize = run (whitespace >=> uris >=> punctuation >=> negatives)
run :: Tokenizer -> (String -> [String])
run f = map unwrap . unE . f
uris :: Tokenizer
uris x | isUri x = E [Left x]
| True = E [Right x]
where isUri x = any (`isPrefixOf` x) ["http://","ftp://","mailto:"]
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E $
case span Char.isPunctuation . reverse $ x of
([],w) -> [Right . reverse $ w]
(ps,w) -> [Right . reverse $ w, Right . reverse $ ps]
initialPunctuation :: Tokenizer
initialPunctuation x = E $
case span Char.isPunctuation$ x of
([],w) -> [Right w]
(ps,w) -> [Right ps, Right w]
negatives :: Tokenizer
negatives x | "n't" `isSuffixOf` x = E [ Right . reverse . drop 3 . reverse $ x
, Left "n't" ]
| True = E [Right x]
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- words xs ]
instance Monad (EitherList a) where
return x = E [Right x]
E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
unwrap (Left x) = x
unwrap (Right x) = x
examples =
["This shouldn't happen."
,"Some 'quoted' struff"
,"This is a URL: http://example.org."
,"How about an email@example.com"]