src/ACE/Tokenizer.hs

{-# LANGUAGE BangPatterns #-}
-- | Tokenizer for ACE. Tokens retain source locations (line and column).

module ACE.Tokenizer where

import           ACE.Types.Tokens

import           Control.Applicative
import           Control.Arrow
import           Control.Monad
import           Data.Attoparsec.Text hiding (number)
import           Data.Char
import           Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Read as T

-- | Tokenize some complete ACE text.
tokenize :: Text -> Either String [Token]
tokenize =
  parseOnly (fmap fst tokenizer <* endOfInput)

-- | The tokenizer.
tokenizer :: Parser ([Token],(Int,Int))
tokenizer =
  manyWithPos (spaces >=> token)
              genitive
              (1,0)

-- | Parse a token.
token :: (Int,Int) -> Parser (Token,(Int,Int))
token pos =
  number pos <|>
  quotedString pos <|>
  period pos <|>
  comma pos <|>
  questionMark pos <|>
  word pos

-- | Parse a number.
number :: (Int, Int) -> Parser (Token, (Int,Int))
number pos =
  fmap (\n -> (Number pos (either (const 0) fst (T.decimal n)),second (+ T.length n) pos))
       (takeWhile1 isDigit)

-- | Parse a quoted string, @\"foobar\"@.
quotedString :: (Int,Int) -> Parser (Token,(Int,Int))
quotedString pos =
  char '"' *> (cons <$> takeWhile1 (/='"')) <* char '"'
  where
    cons x =
      (QuotedString pos x
      ,second (+ (T.length x + 2)) pos)

-- | Parse a period \".\".
period :: (Int,Int) -> Parser (Token,(Int,Int))
period pos =
  char '.' *> pure (Period pos,second (+1) pos)

-- | Parse a comma \",\".
comma :: (Int,Int) -> Parser (Token,(Int,Int))
comma pos =
  char ',' *> pure (Comma pos,second (+1) pos)

-- | Parse a question mark \"?\".
questionMark :: (Int,Int) -> Parser (Token,(Int,Int))
questionMark pos =
  char '?' *> pure (QuestionMark pos,second (+1) pos)

-- | Parse a word, which is any sequence of non-whitespace words
-- containing none of the other token characters.
word :: (Int,Int) -> Parser (Token,(Int,Int))
word pos =
  cons <$> takeWhile1 wordChar
  where
    cons w = (Word pos w,second (+ T.length w) pos)
    wordChar c =
      not (isSpace c) &&
      c /= '"' &&
      c /= '.' &&
      c /= '?' &&
      c /= ',' &&
      c /= '\''

-- | Parse the Saxon genitive ' or 's. This is ran after parsing every
-- token, but is expected to fail most of the time.
genitive :: (Int, Int) -> Parser (Maybe (Token,(Int, Int)))
genitive pos =
  optional go
  where
    go =
      do _ <- char '\''
         ms <- peekChar
         case ms of
           Just 's' -> anyChar *> pure (Genitive pos True,second (+1) pos)
           _ -> pure (Genitive pos False,second (+1) pos)

-- | Like 'many', but retains the current source position and supports
-- postfix-parsing of the genitive apostrophe.
manyWithPos :: (Monad m, Alternative m)
            => ((t, t1) -> m (a, (t, t1))) -> ((t, t1) -> m (Maybe (a, (t, t1))))
            -> (t, t1) -> m ([a], (t, t1))
manyWithPos p p' pos =
  do r <- fmap (first Just) (p pos) <|> pure (Nothing,pos)
     case r of
       (Nothing,_) ->
         return ([],pos)
       (Just x,newpos@(!_,!_)) ->
         do r' <- p' newpos
            case r' of
              Nothing ->
                do (xs,finalpos) <- manyWithPos p p' newpos
                   return (x:xs,finalpos)
              Just (y,newpos') ->
                do (xs,finalpos) <- manyWithPos p p' newpos'
                   return (x:y:xs,finalpos)

-- | Skip spaces (space, newline, tab (=4 spaces)) and keep
-- positioning information up to date.
spaces :: (Int,Int) -> Parser (Int,Int)
spaces (sline,scol) =
  go sline scol
  where
    go line col =
      do c <- peekChar
         case c of
           Just '\n' -> anyChar *> go (line+1) 0
           Just ' ' -> anyChar *> go line (col+1)
           Just '\t' -> anyChar *> go line (col+4)
           _ -> return (line,col)