module NLP.Tokenize
( EitherList(..)
, Tokenizer
, tokenize
, run
, defaultTokenizer
, whitespace
, uris
, punctuation
, finalPunctuation
, initialPunctuation
, contractions
, negatives
)
where
import qualified Data.Char as Char
import Data.List
import Data.Maybe
import Control.Monad.Instances
import Data.List.Split
import Control.Monad
type Tokenizer = String -> EitherList String String
newtype EitherList a b = E { unE :: [Either a b] }
tokenize :: String -> [String]
tokenize = run defaultTokenizer
run :: Tokenizer -> (String -> [String])
run f = map unwrap . unE . f
defaultTokenizer :: Tokenizer
defaultTokenizer = whitespace
>=> uris
>=> punctuation
>=> contractions
>=> negatives
uris :: Tokenizer
uris x | isUri x = E [Left x]
| True = E [Right x]
where isUri x = any (`isPrefixOf` x) ["http://","ftp://","mailto:"]
punctuation :: Tokenizer
punctuation = finalPunctuation >=> initialPunctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E . filter (not . null . unwrap) $
case span Char.isPunctuation . reverse $ x of
([],w) -> [Right . reverse $ w]
(ps,w) -> [Right . reverse $ w, Right . reverse $ ps]
initialPunctuation :: Tokenizer
initialPunctuation x = E . filter (not . null . unwrap) $
case span Char.isPunctuation$ x of
([],w) -> [Right w]
(ps,w) -> [Right ps, Right w]
negatives :: Tokenizer
negatives x | "n't" `isSuffixOf` x = E [ Right . reverse . drop 3 . reverse $ x
, Left "n't" ]
| True = E [Right x]
contractions :: Tokenizer
contractions x = case catMaybes . map (splitSuffix x) $ cts of
[] -> return x
((w,s):_) -> E [ Right w,Left s]
where cts = ["'s","'d","'ve","'ll"]
splitSuffix w sfx =
let w' = reverse w
len = length sfx
in if sfx `isSuffixOf` w
then Just (take (length w len) w, reverse . take len $ w')
else Nothing
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- words xs ]
instance Monad (EitherList a) where
return x = E [Right x]
E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
unwrap (Left x) = x
unwrap (Right x) = x
examples =
["This shouldn't happen."
,"Some 'quoted' stuff"
,"This is a URL: http://example.org."
,"How about an email@example.com"
,"ReferenceError #1065 broke my debugger!"
,"I would've gone."
,"They've been there."
]