{-# LANGUAGE OverloadedStrings, BangPatterns, FlexibleInstances #-}

-- | A monad for writing pure tokenizers in an imperative-looking way.
--
-- Main idea: You 'walk' through the input string like a turtle, and every
-- time you find a token boundary, you call 'emit'. If some specific kinds of
-- tokens should be suppressed, you can 'discard' them instead (or filter
-- them out afterwards).
--
-- This module supports strict text, lazy text, and strings, though the package
-- also provides support for ASCII bytestrings in separate modules.
--
-- Example of a simple tokenizer that splits words on whitespace and discards
-- stop symbols ('isStopSym' and 'isPunctuation'' are assumed user-supplied
-- predicates):
--
-- > tokenizeWords :: T.Text -> [T.Text]
-- > tokenizeWords = runTokenizer $ untilEOT $ do
-- >   c <- pop
-- >   if isStopSym c
-- >     then discard
-- >     else if c `elem` ("  \t\r\n" :: [Char])
-- >          then discard
-- >          else do
-- >            walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
-- >            emit

module Control.Monad.Tokenizer (
  -- * Monad
  Tokenizer,
  runTokenizer,
  runTokenizerCS,
  untilEOT,
  -- * Tests
  peek,
  isEOT,
  lookAhead,
  -- * Movement
  walk,
  walkBack,
  pop,
  walkWhile,
  walkFold,
  -- * Transactions
  emit,
  discard,
  restore,
  -- * Embedding
  embed,
  embed_,
  discardAndEmbed,
  -- * Conversion
  convert,
  convertWith,
  -- * Text types
  Tokenizable(..)
  ) where

import Control.Monad
import Data.Char
import Data.Monoid
import Data.String
import qualified Data.Text as T
import qualified Data.Text.Lazy as LT
import Control.Monad.Tokenizer.Class

-- | Tokenizer monad. Use 'runTokenizer' or 'runTokenizerCS' to run it.
newtype Tokenizer t a = Tokenizer { runTokenizer' :: (t, Int, t) -> (a,[t] -> [t],t,Int,t) }
--- type explanation: (whole text since last emission, chars passed, remaining text)
---     -> (result, difference list of tokens, whole text, chars passed, remaining text)
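--- Example trace: (walk >> walk >> emit) on "abc" evolves the state
---     ("abc",0,"abc") -> ("abc",1,"bc") -> ("abc",2,"c"),
--- after which 'emit' prepends "ab" to the token list and resets to ("c",0,"c").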

instance Functor (Tokenizer t) where
    fmap = liftM

instance Applicative (Tokenizer t) where
    pure a = Tokenizer $ \(whole,count,tail) -> (a,id,whole,count,tail)
    (<*>) = ap

instance Monad (Tokenizer t) where
    m >>= f = Tokenizer $ \(whole,count,tail) ->
              let (a1,o1,w1,!c1,t1) = runTokenizer' m (whole,count,tail)
                  (a2,o2,w2,!c2,t2) = runTokenizer' (f a1) (w1,c1,t1)
              in (a2,o1.o2,w2,c2,t2)

instance Tokenizable t => MonadTokenizer (Tokenizer t) where
  lookAhead chars =
    Tokenizer $ \(whole,count,tail) ->
        let h = unpack $ ttake (length chars) tail
        in (h == chars, id, whole, count, tail)
    where unpack t | tnull t = []
                   | otherwise = thead t : unpack (ttail t)

  walk = Tokenizer $ \(whole,count,tail) ->
                       if tnull tail
                       then ((),id,whole,count,tail)
                       else ((),id,whole,count+1,ttail tail)

  walkBack = Tokenizer $ \(whole,count,_) ->
                           if count > 0
                           then ((),id,whole,count-1,tdrop (count-1) whole)
                           else ((),id,whole,0,whole)

  restore = Tokenizer $ \(whole,_,_) -> ((),id,whole,0,whole)

  peek = Tokenizer $ \(whole,count,tail) -> (th tail,id,whole,count,tail)
    where th t | tnull t = '\0'
               | otherwise = thead t

  emit = Tokenizer $ \(whole,count,tail) -> ((),(ttake count whole:),tail,0,tail)

  discard = Tokenizer $ \(whole,count,tail) -> ((),id,tail,0,tail)

  isEOT = Tokenizer $ \(whole, count, tail) -> (tnull tail, id, whole, count, tail)
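
-- The primitives above compose in the expected way. A minimal sketch
-- (assuming 'walkWhile' re-exported from "Control.Monad.Tokenizer.Class";
-- 'field' is purely illustrative, not part of the API):
--
-- > field :: Tokenizer T.Text ()
-- > field = do
-- >   walkWhile (/= ',')   -- visit everything up to the next comma
-- >   emit                 -- emit the visited characters as one token
-- >   walk                 -- step over the comma itself
-- >   discard              -- drop the comma instead of emitting it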

-- | Embed a pure tokenizer into the monad. The arguments to the function are
-- (visited string, remaining string), and the return value is expected to be
-- (result, emitted tokens, remaining string).
embed :: Tokenizable t => ((t,t) -> (a,[t],t)) -> Tokenizer t a
embed f = Tokenizer $ \(whole,count,tail) ->
  let (a, rs, rem) = f (ttake count whole, tail)
  in (a, (rs++), rem, 0, rem)
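
-- For example (a hedged sketch; 'emitVisited' is illustrative, not part of
-- the API), a pure step that re-emits the visited text as a single token and
-- leaves the rest untouched:
--
-- > emitVisited :: Tokenizable t => Tokenizer t ()
-- > emitVisited = embed $ \(visited, rest) -> ((), [visited], rest)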

-- | Embed a pure tokenizer into the monad without a result.
embed_ :: Tokenizable t => ((t,t) -> ([t],t)) -> Tokenizer t ()
embed_ f = embed f'
  where f' (v,u) = let (rs,rem) = f (v,u) in ((),rs,rem)
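
-- For instance, the 'embed' sketch above without a result:
--
-- > embed_ $ \(visited, rest) -> ([visited], rest)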

-- | Embed a pure tokenizer into the monad. The visited string is discarded, and
-- the given function is run on the unvisited part. The return value is expected
-- to be (result, emitted tokens, remaining string).
discardAndEmbed :: Tokenizable t => (t -> (a,[t],t)) -> Tokenizer t a
discardAndEmbed f = discard >> embed (f . snd)
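
-- For example (a sketch; 'emitWords' is illustrative), handing the unvisited
-- rest of the input to a pure splitter that consumes everything:
--
-- > emitWords :: Tokenizer T.Text ()
-- > emitWords = discardAndEmbed $ \rest -> ((), T.words rest, T.empty)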

-- | Natural transformation to convert between tokenizers of different text
-- types. Note that this operation does not perform encoding\/decoding (i.e.
-- converting from ByteString to Text does not decode Unicode characters). To
-- do so, use 'convertWith' and provide the correct encoding\/decoding
-- functions.
convert :: (Tokenizable t, IsString t, Tokenizable s, IsString s) =>
           Tokenizer s a -> Tokenizer t a
convert = convertWith fromString unpack . convertWith unpack fromString
  where unpack t | tnull t = []
                 | otherwise = thead t : unpack (ttail t)
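
-- For example (a sketch), reusing a String tokenizer on strict Text:
--
-- > onText :: Tokenizer String a -> Tokenizer T.Text a
-- > onText = convert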

-- | Natural transformation to convert between tokenizers of different text
-- types, using the given conversion functions.
convertWith :: (s -> t) -> (t -> s) -> Tokenizer s a -> Tokenizer t a
convertWith fw bw (Tokenizer runTok) = Tokenizer $ \(whole,count,tail) ->
  let (a, o, w, c, t) = runTok (bw whole, count, bw tail)
      o' = map fw $ o []
  in (a, (o'++), fw w, c, fw t)
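
-- A hedged sketch with real encoding functions: running a strict-Text
-- tokenizer over UTF-8 'ByteString's (assumes valid UTF-8 input, since
-- 'decodeUtf8' is partial, and a 'Tokenizable' ByteString instance when the
-- result is eventually run):
--
-- > import qualified Data.ByteString as BS
-- > import Data.Text.Encoding (decodeUtf8, encodeUtf8)
-- >
-- > onBytes :: Tokenizer T.Text a -> Tokenizer BS.ByteString a
-- > onBytes = convertWith encodeUtf8 decodeUtf8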

-- | Split a string into tokens using the given tokenizer. The input is
-- lower-cased before tokenizing; use 'runTokenizerCS' to preserve case.
runTokenizer :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizer m input =
    let input' = tlower input
    in case runTokenizer' m (input',0,input') of
        (_, tokens, _, _, _) -> tokens []

-- | Case-sensitive version of 'runTokenizer': split a string into tokens
-- without lower-casing the input first.
runTokenizerCS :: Tokenizable t => Tokenizer t () -> t -> [t]
runTokenizerCS m input =
    case runTokenizer' m (input,0,input) of
        (_, tokens, _, _, _) -> tokens []
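
-- A small end-to-end sketch contrasting the two runners ('untilEOT' repeats
-- its argument until the end of the text):
--
-- > singleChars :: Tokenizer T.Text ()
-- > singleChars = untilEOT (walk >> emit)
-- >
-- > runTokenizer   singleChars "AbC"  -- ["a","b","c"] (input is lower-cased)
-- > runTokenizerCS singleChars "AbC"  -- ["A","b","C"]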