-- | A monad for writing pure tokenizers in an imperative-looking way.
--
-- Main idea: You 'walk' through the input string like a turtle, and everytime
-- you find a token boundary, you call 'emit'. If some specific kinds of tokens
-- should be suppressed, you can 'discard' them instead (or filter afterwards).
--
-- This module supports is specialized for strict 'ByteString's. The module
-- "Control.Monad.Tokenizer" provides more general types, but does not export a
-- 'Tokenizable' instance for 'ByteString's, as its implementation depends on the
-- encoding. This module assumes ASCII encoding (you have been warned!). 
--
-- Example for a simple tokenizer, that splits words by whitespace and discards stop symbols:
--
-- > tokenizeWords :: ByteString -> [ByteString]
-- > tokenizeWords = runTokenizer $ untilEOT $ do
-- >   c <- pop
-- >   if isStopSym c
-- >     then discard
-- >     else if c `elem` (" \t\r\n" :: [Char])
-- >          then discard
-- >          else do
-- >            walkWhile (\c -> (c=='_') || not (isSpace c || isPunctuation' c))
-- >            emit

module Control.Monad.Tokenizer.Char8.Strict (
  -- * Monad
  Tokenizer,
  runTokenizer,
  runTokenizerCS,
  untilEOT,
  -- * Tests
  peek,
  isEOT,
  lookAhead,
  -- * Movement
  walk,
  walkBack,
  pop,
  walkWhile,
  walkFold,
  -- * Transactions
  emit,
  discard,
  restore
  ) where

import qualified Control.Monad.Tokenizer as G
import Data.ByteString.Char8 as C8
import Data.Char

-- | Tokenizer monad. Use runTokenizer or runTokenizerCS to run it
type Tokenizer = G.Tokenizer ByteString

-- | Check if the next input chars agree with the given string
lookAhead = G.lookAhead :: [Char] -> Tokenizer Bool

-- | Proceed to the next character
walk = G.walk :: Tokenizer ()

-- | Walk back to the previous character, unless it was discarded/emitted.
walkBack = G.walkBack :: Tokenizer ()

-- | Restore the state after the last emit/discard.
restore = G.restore :: Tokenizer ()

-- | Peek the current character
peek = G.peek :: Tokenizer Char

-- | Peek the current character and proceed
pop = G.pop :: Tokenizer Char

-- | Break at the current position and emit the scanned token
emit = G.emit :: Tokenizer ()

-- | Break at the current position and discard the scanned token
discard = G.discard :: Tokenizer ()

-- | Have I reached the end of the input text?
isEOT = G.isEOT :: Tokenizer Bool

-- | Proceed as long as a given function succeeds
walkWhile = G.walkWhile :: (Char -> Bool) -> Tokenizer ()

-- | Proceed as long as a given fold returns Just (generalization of walkWhile)
walkFold = G.walkFold :: a -> (Char -> a -> Maybe a) -> Tokenizer ()

-- | Repeat a given tokenizer as long as the end of text is not reached
untilEOT = G.untilEOT :: Tokenizer () -> Tokenizer ()

-- | Split a string into tokens using the given tokenizer
runTokenizer = G.runTokenizer :: Tokenizer () -> ByteString -> [ByteString]

-- | Split a string into tokens using the given tokenizer, case sensitive version
runTokenizerCS = G.runTokenizerCS :: Tokenizer () -> ByteString -> [ByteString]

-- | Assuming ASCII encoding
instance G.Tokenizable ByteString where
  tnull = C8.null
  thead = C8.head
  ttail = C8.tail
  ttake = C8.take
  tdrop = C8.drop
  tlower = C8.map toLower