module NLP.Tokenize.Text
    ( EitherList(..)
    , Tokenizer
    , tokenize
    , run
    , defaultTokenizer
    , whitespace
    , uris
    , punctuation
    , finalPunctuation
    , initialPunctuation
    , allPunctuation
    , contractions
    , negatives
    )
where
import qualified Data.Char as Char
import Data.Maybe
import Control.Monad.Instances ()
import Control.Applicative
import Control.Monad
import Data.Text (Text)
import qualified Data.Text as T
type Tokenizer =  Text -> EitherList Text Text
newtype EitherList a b =  E { unE :: [Either a b] }
tokenize :: Text -> [Text]
tokenize = run defaultTokenizer
run :: Tokenizer -> (Text -> [Text])
run f = \txt -> map T.copy $ (map unwrap . unE . f) txt
defaultTokenizer :: Tokenizer
defaultTokenizer =     whitespace 
                   >=> uris 
                   >=> punctuation 
                   >=> contractions 
                   >=> negatives 
uris :: Tokenizer
uris x | isUri x = E [Left x]
       | True    = E [Right x]
    where isUri u = any (`T.isPrefixOf` u) ["http://","ftp://","mailto:"]
punctuation :: Tokenizer 
punctuation = finalPunctuation >=> initialPunctuation
hyphens :: Tokenizer
hyphens xs = E [Right w | w <- T.split (=='-') xs ]
finalPunctuation :: Tokenizer
finalPunctuation x = E $ filter (not . T.null . unwrap) res
  where
    res :: [Either Text Text]
    res = case T.span Char.isPunctuation (T.reverse x) of
      (ps, w) | T.null ps -> [ Right $ T.reverse w ]
              | otherwise -> [ Right $ T.reverse w
                             , Right $ T.reverse ps]
      
      
initialPunctuation :: Tokenizer
initialPunctuation x = E $ filter (not . T.null . unwrap) $
    case T.span Char.isPunctuation x of
      (ps,w) | T.null ps -> [ Right w ]
             | otherwise -> [ Right ps
                            , Right w ]
allPunctuation :: Tokenizer
allPunctuation = E . map Right 
                 . T.groupBy (\a b -> Char.isPunctuation a == Char.isPunctuation b) 
negatives :: Tokenizer
negatives x | "n't" `T.isSuffixOf` x = E [ Right . T.reverse . T.drop 3 . T.reverse $ x
                                         , Left "n't" ]
            | True                   = E [ Right x ]
contractions :: Tokenizer
contractions x = case catMaybes . map (splitSuffix x) $ cts of
                   [] -> return x
                   ((w,s):_) -> E [ Right w,Left s]
    where cts = ["'m","'s","'d","'ve","'ll"]
          splitSuffix w sfx = 
              let w' = T.reverse w
                  len = T.length sfx
              in if sfx `T.isSuffixOf` w 
                 then Just (T.take (T.length w  len) w, T.reverse . T.take len $ w')
                 else Nothing
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- T.words xs ]
instance Monad (EitherList a) where
    return x = E [Right x]
    E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs
instance Applicative (EitherList a) where
    pure x = return x
    f <*> x = f `ap` x
instance Functor (EitherList a) where
    fmap f (E xs) = E $ (fmap . fmap) f xs
unwrap :: Either a a -> a
unwrap (Left x) = x
unwrap (Right x) = x
examples :: [Text]
examples = 
    ["This shouldn't happen."
    ,"Some 'quoted' stuff"
    ,"This is a URL: http://example.org."
    ,"How about an email@example.com"
    ,"ReferenceError #1065 broke my debugger!"
    ,"I would've gone."
    ,"They've been there."
    ,"Hyphen-words"
    ,"Yes/No questions"
    ]