{-# LANGUAGE OverloadedStrings #-}
-- | NLP Tokenizer, adapted to use Text instead of Strings from the
-- `tokenize` package.

module NLP.Tokenize.Text
    ( EitherList(..)
    , Tokenizer
    , tokenize
    , run
    , defaultTokenizer
    , whitespace
    , uris
    , punctuation
    , finalPunctuation
    , initialPunctuation
    , allPunctuation
    , contractions
    , negatives
    )
where

import qualified Data.Char as Char
import Data.Maybe
import Control.Applicative
import Control.Monad

import Data.Text (Text)
import qualified Data.Text as T

-- | A Tokenizer is function which takes a list and returns a list of Eithers
--  (wrapped in a newtype). Right Texts will be passed on for processing
--  to tokenizers down
--  the pipeline. Left Texts will be passed through the pipeline unchanged.
--  Use a Left Texts in a tokenizer to protect certain tokens from further
--  processing (e.g. see the 'uris' tokenizer).
--  You can define your own custom tokenizer pipelines by chaining tokenizers together:
---
-- > myTokenizer :: Tokenizer
-- > myTokenizer = whitespace >=> allPunctuation
---

type Tokenizer =  Text -> EitherList Text Text

-- | The EitherList is a newtype-wrapped list of Eithers.
newtype EitherList a b =  E { forall a b. EitherList a b -> [Either a b]
unE :: [Either a b] }

-- | Split string into words using the default tokenizer pipeline
tokenize :: Text -> [Text]
tokenize :: Text -> [Text]
tokenize = Tokenizer -> Text -> [Text]
run Tokenizer
defaultTokenizer

-- | Run a tokenizer
run :: Tokenizer -> (Text -> [Text])
run :: Tokenizer -> Text -> [Text]
run Tokenizer
f = \Text
txt -> (Text -> Text) -> [Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Text
T.copy ([Text] -> [Text]) -> [Text] -> [Text]
forall a b. (a -> b) -> a -> b
$ ((Either Text Text -> Text) -> [Either Text Text] -> [Text]
forall a b. (a -> b) -> [a] -> [b]
map Either Text Text -> Text
forall a. Either a a -> a
unwrap ([Either Text Text] -> [Text])
-> (Text -> [Either Text Text]) -> Text -> [Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. EitherList Text Text -> [Either Text Text]
forall a b. EitherList a b -> [Either a b]
unE (EitherList Text Text -> [Either Text Text])
-> Tokenizer -> Text -> [Either Text Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Tokenizer
f) Text
txt

defaultTokenizer :: Tokenizer
defaultTokenizer :: Tokenizer
defaultTokenizer =     Tokenizer
whitespace
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
uris
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
punctuation
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
contractions
                   Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
negatives

-- | Detect common uris and freeze them
uris :: Tokenizer
uris :: Tokenizer
uris Text
x | Text -> Bool
isUri Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. a -> Either a b
Left Text
x]
       | Bool
True    = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. b -> Either a b
Right Text
x]
    where isUri :: Text -> Bool
isUri Text
u = (Text -> Bool) -> [Text] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Text -> Text -> Bool
`T.isPrefixOf` Text
u) [Text
"http://",Text
"ftp://",Text
"mailto:"]

-- | Split off initial and final punctuation
punctuation :: Tokenizer
punctuation :: Tokenizer
punctuation = Tokenizer
finalPunctuation Tokenizer -> Tokenizer -> Tokenizer
forall (m :: * -> *) a b c.
Monad m =>
(a -> m b) -> (b -> m c) -> a -> m c
>=> Tokenizer
initialPunctuation

hyphens :: Tokenizer
hyphens :: Tokenizer
hyphens Text
xs = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w | Text
w <- (Char -> Bool) -> Text -> [Text]
T.split (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'-') Text
xs ]

-- | Split off word-final punctuation
finalPunctuation :: Tokenizer
finalPunctuation :: Tokenizer
finalPunctuation Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E ([Either Text Text] -> EitherList Text Text)
-> [Either Text Text] -> EitherList Text Text
forall a b. (a -> b) -> a -> b
$ (Either Text Text -> Bool)
-> [Either Text Text] -> [Either Text Text]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either Text Text -> Bool) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null (Text -> Bool)
-> (Either Text Text -> Text) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either Text Text -> Text
forall a. Either a a -> a
unwrap) [Either Text Text]
res
  where
    res :: [Either Text Text]
    res :: [Either Text Text]
res = case (Char -> Bool) -> Text -> (Text, Text)
T.span Char -> Bool
Char.isPunctuation (Text -> Text
T.reverse Text
x) of
      (Text
ps, Text
w) | Text -> Bool
T.null Text
ps -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.reverse Text
w ]
              | Bool
otherwise -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.reverse Text
w
                             , Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text -> Text
T.reverse Text
ps]
      -- ([],w) -> [Right . T.reverse $ w]
      -- (ps,w) -> [Right . T.reverse $ w, Right . T.reverse $ ps]

-- | Split off word-initial punctuation
initialPunctuation :: Tokenizer
initialPunctuation :: Tokenizer
initialPunctuation Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E ([Either Text Text] -> EitherList Text Text)
-> [Either Text Text] -> EitherList Text Text
forall a b. (a -> b) -> a -> b
$ (Either Text Text -> Bool)
-> [Either Text Text] -> [Either Text Text]
forall a. (a -> Bool) -> [a] -> [a]
filter (Bool -> Bool
not (Bool -> Bool)
-> (Either Text Text -> Bool) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Bool
T.null (Text -> Bool)
-> (Either Text Text -> Text) -> Either Text Text -> Bool
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Either Text Text -> Text
forall a. Either a a -> a
unwrap) ([Either Text Text] -> [Either Text Text])
-> [Either Text Text] -> [Either Text Text]
forall a b. (a -> b) -> a -> b
$
    case (Char -> Bool) -> Text -> (Text, Text)
T.span Char -> Bool
Char.isPunctuation Text
x of
      (Text
ps,Text
w) | Text -> Bool
T.null Text
ps -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w ]
             | Bool
otherwise -> [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
ps
                            , Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w ]

-- | Split tokens on transitions between punctuation and
-- non-punctuation characters. This tokenizer is not included in
-- defaultTokenizer pipeline because dealing with word-internal
-- punctuation is quite application specific.
allPunctuation :: Tokenizer
allPunctuation :: Tokenizer
allPunctuation = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E ([Either Text Text] -> EitherList Text Text)
-> (Text -> [Either Text Text]) -> Tokenizer
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> Either Text Text) -> [Text] -> [Either Text Text]
forall a b. (a -> b) -> [a] -> [b]
map Text -> Either Text Text
forall a b. b -> Either a b
Right
                 ([Text] -> [Either Text Text])
-> (Text -> [Text]) -> Text -> [Either Text Text]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Char -> Bool) -> Text -> [Text]
T.groupBy (\Char
a Char
b -> Char -> Bool
Char.isPunctuation Char
a Bool -> Bool -> Bool
forall a. Eq a => a -> a -> Bool
== Char -> Bool
Char.isPunctuation Char
b)

-- | Split words ending in n't, and freeze n't
negatives :: Tokenizer
negatives :: Tokenizer
negatives Text
x | Text
"n't" Text -> Text -> Bool
`T.isSuffixOf` Text
x = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [ Text -> Either Text Text
forall a b. b -> Either a b
Right (Text -> Either Text Text)
-> (Text -> Text) -> Text -> Either Text Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
T.reverse (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.drop Int
3 (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Text -> Text
T.reverse (Text -> Either Text Text) -> Text -> Either Text Text
forall a b. (a -> b) -> a -> b
$ Text
x
                                         , Text -> Either Text Text
forall a b. a -> Either a b
Left Text
"n't" ]
            | Bool
True                   = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
x ]

-- | Split common contractions off and freeze them.
-- | Currently deals with: 'm, 's, 'd, 've, 'll
contractions :: Tokenizer
contractions :: Tokenizer
contractions Text
x = case [Maybe (Text, Text)] -> [(Text, Text)]
forall a. [Maybe a] -> [a]
catMaybes ([Maybe (Text, Text)] -> [(Text, Text)])
-> ([Text] -> [Maybe (Text, Text)]) -> [Text] -> [(Text, Text)]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Text -> Maybe (Text, Text)) -> [Text] -> [Maybe (Text, Text)]
forall a b. (a -> b) -> [a] -> [b]
map (Text -> Text -> Maybe (Text, Text)
splitSuffix Text
x) ([Text] -> [(Text, Text)]) -> [Text] -> [(Text, Text)]
forall a b. (a -> b) -> a -> b
$ [Text]
cts of
                   [] -> Tokenizer
forall a. a -> EitherList Text a
forall (m :: * -> *) a. Monad m => a -> m a
return Text
x
                   ((Text
w,Text
s):[(Text, Text)]
_) -> [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [ Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w,Text -> Either Text Text
forall a b. a -> Either a b
Left Text
s]
    where cts :: [Text]
cts = [Text
"'m",Text
"'s",Text
"'d",Text
"'ve",Text
"'ll"]
          splitSuffix :: Text -> Text -> Maybe (Text, Text)
splitSuffix Text
w Text
sfx =
              let w' :: Text
w' = Text -> Text
T.reverse Text
w
                  len :: Int
len = Text -> Int
T.length Text
sfx
              in if Text
sfx Text -> Text -> Bool
`T.isSuffixOf` Text
w
                 then (Text, Text) -> Maybe (Text, Text)
forall a. a -> Maybe a
Just (Int -> Text -> Text
T.take (Text -> Int
T.length Text
w Int -> Int -> Int
forall a. Num a => a -> a -> a
- Int
len) Text
w, Text -> Text
T.reverse (Text -> Text) -> (Text -> Text) -> Text -> Text
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Int -> Text -> Text
T.take Int
len (Text -> Text) -> Text -> Text
forall a b. (a -> b) -> a -> b
$ Text
w')
                 else Maybe (Text, Text)
forall a. Maybe a
Nothing


-- | Split string on whitespace. This is just a wrapper for Data.List.words
whitespace :: Tokenizer
whitespace :: Tokenizer
whitespace Text
xs = [Either Text Text] -> EitherList Text Text
forall a b. [Either a b] -> EitherList a b
E [Text -> Either Text Text
forall a b. b -> Either a b
Right Text
w | Text
w <- Text -> [Text]
T.words Text
xs ]

instance Monad (EitherList a) where
    E [Either a a]
xs >>= :: forall a b.
EitherList a a -> (a -> EitherList a b) -> EitherList a b
>>= a -> EitherList a b
f = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ (Either a a -> [Either a b]) -> [Either a a] -> [Either a b]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap ((a -> [Either a b])
-> (a -> [Either a b]) -> Either a a -> [Either a b]
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (Either a b -> [Either a b]
forall a. a -> [a]
forall (m :: * -> *) a. Monad m => a -> m a
return (Either a b -> [Either a b])
-> (a -> Either a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> Either a b
forall a b. a -> Either a b
Left) (EitherList a b -> [Either a b]
forall a b. EitherList a b -> [Either a b]
unE (EitherList a b -> [Either a b])
-> (a -> EitherList a b) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> EitherList a b
f)) [Either a a]
xs

instance Applicative (EitherList a) where
    pure :: forall a. a -> EitherList a a
pure a
x = [Either a a] -> EitherList a a
forall a b. [Either a b] -> EitherList a b
E [a -> Either a a
forall a b. b -> Either a b
Right a
x]
    EitherList a (a -> b)
f <*> :: forall a b.
EitherList a (a -> b) -> EitherList a a -> EitherList a b
<*> EitherList a a
x = EitherList a (a -> b)
f EitherList a (a -> b) -> EitherList a a -> EitherList a b
forall (m :: * -> *) a b. Monad m => m (a -> b) -> m a -> m b
`ap` EitherList a a
x

instance Functor (EitherList a) where
    fmap :: forall a b. (a -> b) -> EitherList a a -> EitherList a b
fmap a -> b
f (E [Either a a]
xs) = [Either a b] -> EitherList a b
forall a b. [Either a b] -> EitherList a b
E ([Either a b] -> EitherList a b) -> [Either a b] -> EitherList a b
forall a b. (a -> b) -> a -> b
$ ((Either a a -> Either a b) -> [Either a a] -> [Either a b]
forall a b. (a -> b) -> [a] -> [b]
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap ((Either a a -> Either a b) -> [Either a a] -> [Either a b])
-> ((a -> b) -> Either a a -> Either a b)
-> (a -> b)
-> [Either a a]
-> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (a -> b) -> Either a a -> Either a b
forall a b. (a -> b) -> Either a a -> Either a b
forall (f :: * -> *) a b. Functor f => (a -> b) -> f a -> f b
fmap) a -> b
f [Either a a]
xs

unwrap :: Either a a -> a
unwrap :: forall a. Either a a -> a
unwrap (Left a
x) = a
x
unwrap (Right a
x) = a
x

examples :: [Text]
examples :: [Text]
examples =
    [Text
"This shouldn't happen."
    ,Text
"Some 'quoted' stuff"
    ,Text
"This is a URL: http://example.org."
    ,Text
"How about an email@example.com"
    ,Text
"ReferenceError #1065 broke my debugger!"
    ,Text
"I would've gone."
    ,Text
"They've been there."
    ,Text
"Hyphen-words"
    ,Text
"Yes/No questions"
    ]