module NLP.Tokenize.String 
    ( EitherList(..)
    , Tokenizer
    , tokenize
    , run
    , defaultTokenizer
    , whitespace
    , uris
    , punctuation
    , finalPunctuation
    , initialPunctuation
    , allPunctuation
    , contractions
    , negatives
    )
where
import qualified Data.Char as Char
import Data.List
import Data.Maybe
import Control.Monad.Instances
import Control.Applicative
import Data.List.Split
import Control.Monad
type Tokenizer =  String -> EitherList String String
newtype EitherList a b =  E { unE :: [Either a b] }
tokenize :: String -> [String]
tokenize  = run defaultTokenizer
run :: Tokenizer -> (String -> [String])
run f = map unwrap . unE . f
defaultTokenizer :: Tokenizer
defaultTokenizer =     whitespace 
                   >=> uris 
                   >=> punctuation 
                   >=> contractions 
                   >=> negatives 
uris :: Tokenizer
uris x | isUri x = E [Left x]
       | True    = E [Right x]
    where isUri x = any (`isPrefixOf` x) ["http://","ftp://","mailto:"]
punctuation :: Tokenizer 
punctuation = finalPunctuation >=> initialPunctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E . filter (not . null . unwrap) $
    case span Char.isPunctuation . reverse $ x of
      ([],w) -> [Right . reverse $ w]
      (ps,w) -> [Right . reverse $ w, Right . reverse $ ps]
initialPunctuation :: Tokenizer
initialPunctuation x = E . filter (not . null . unwrap) $
    case span Char.isPunctuation$ x of
      ([],w) -> [Right w]
      (ps,w) -> [Right ps, Right w]
allPunctuation :: Tokenizer
allPunctuation = E . map Right 
                 . groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b) 
                 
negatives :: Tokenizer
negatives x | "n't" `isSuffixOf` x = E [ Right . reverse . drop 3 . reverse $ x
                                       , Left "n't" ]
            | True                 = E [Right x]
contractions :: Tokenizer
contractions x = case catMaybes . map (splitSuffix x) $ cts of
                   [] -> return x
                   ((w,s):_) -> E [ Right w,Left s]
    where cts = ["'m","'s","'d","'ve","'ll"]
          splitSuffix w sfx = 
              let w' = reverse w
                  len = length sfx
              in if sfx `isSuffixOf` w 
                 then Just (take (length w  len) w, reverse . take len $ w')
                 else Nothing
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- words xs ]
instance Monad (EitherList a) where
    return x = E [Right x]
    E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
instance Applicative (EitherList a) where
    pure x = return x
    f <*> x = f `ap` x
instance Functor (EitherList a) where
    fmap f (E xs) = E $ (fmap . fmap) f xs
unwrap (Left x) = x
unwrap (Right x) = x
examples = 
    ["This shouldn't happen."
    ,"Some 'quoted' stuff"
    ,"This is a URL: http://example.org."
    ,"How about an email@example.com"
    ,"ReferenceError #1065 broke my debugger!"
    ,"I would've gone."
    ,"They've been there."
    ,"Hyphen-words"
    ,"Yes/No questions"
    ]