module NLP.Tokenize 
    ( EitherList(..)
    , Tokenizer
    , tokenize
    , run
    , whitespace
    , uris
    , punctuation
    , finalPunctuation
    , initialPunctuation
    , negatives
    )
where

import qualified Data.Char as Char
import Data.List
import Control.Monad.Instances
import Data.List.Split
import Control.Monad

-- | A Tokenizer is function which takes a list and returns a list of Eithers
--  (wrapped in a newtype). Right Strings will be passed on for processing
--  to tokenizers down
--  the pipeline. Left Strings will be passed through the pipeline unchanged.
--  Use a Left String in a tokenizer to protect certain tokens from further 
--  processing (e.g. see the 'uris' tokenizer).
type Tokenizer =  String -> EitherList String String

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b =  E { unE :: [Either a b] }

-- | Split string into words using the default tokenizer pipeline 
tokenize :: String -> [String]
tokenize  = run (whitespace >=> uris >=> punctuation >=> negatives)

-- | Run a tokenizer
run :: Tokenizer -> (String -> [String])
run f = map unwrap . unE . f

-- | Detect common uris and freeze them
uris :: Tokenizer
uris x | isUri x = E [Left x]
       | True    = E [Right x]
    where isUri x = any (`isPrefixOf` x) ["http://","ftp://","mailto:"]

-- | Split off initial and final punctuation
punctuation :: Tokenizer 
punctuation = finalPunctuation >=> initialPunctuation

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation x = E $
    case span Char.isPunctuation . reverse $ x of
      ([],w) -> [Right . reverse $ w]
      (ps,w) -> [Right . reverse $ w, Right . reverse $ ps]

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation x = E $
    case span Char.isPunctuation$ x of
      ([],w) -> [Right w]
      (ps,w) -> [Right ps, Right w]

-- | Split words ending in "n't", and freeze "n't" 
negatives :: Tokenizer
negatives x | "n't" `isSuffixOf` x = E [ Right . reverse . drop 3 . reverse $ x
                                       , Left "n't" ]
            | True                 = E [Right x]

-- | Split string on whitespace. This is just a wrapper for 'words'
whitespace :: Tokenizer
whitespace xs = E [Right w | w <- words xs ]

instance Monad (EitherList a) where
    return x = E [Right x]
    E xs >>= f = E $ concatMap (either (return . Left) (unE . f)) xs

unwrap (Left x) = x
unwrap (Right x) = x

examples = 
    ["This shouldn't happen."
    ,"Some 'quoted' struff"
    ,"This is a URL: http://example.org."
    ,"How about an email@example.com"]