module ACE.Tokenizer where
import ACE.Types.Tokens
import Control.Applicative
import Control.Arrow
import Control.Monad
import Data.Attoparsec.Text hiding (number)
import Data.Char
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Read as T
tokenize :: Text -> Either String [Token]
tokenize =
parseOnly (fmap fst tokenizer <* endOfInput)
tokenizer :: Parser ([Token],(Int,Int))
tokenizer =
manyWithPos (spaces >=> token)
genitive
(1,0)
token :: (Int,Int) -> Parser (Token,(Int,Int))
token pos =
number pos <|>
quotedString pos <|>
period pos <|>
comma pos <|>
questionMark pos <|>
word pos
number :: (Int, Int) -> Parser (Token, (Int,Int))
number pos =
fmap (\n -> (Number pos (either (const 0) fst (T.decimal n)),second (+ T.length n) pos))
(takeWhile1 isDigit)
quotedString :: (Int,Int) -> Parser (Token,(Int,Int))
quotedString pos =
char '"' *> (cons <$> takeWhile1 (/='"')) <* char '"'
where
cons x =
(QuotedString pos x
,second (+ (T.length x + 2)) pos)
period :: (Int,Int) -> Parser (Token,(Int,Int))
period pos =
char '.' *> pure (Period pos,second (+1) pos)
comma :: (Int,Int) -> Parser (Token,(Int,Int))
comma pos =
char ',' *> pure (Comma pos,second (+1) pos)
questionMark :: (Int,Int) -> Parser (Token,(Int,Int))
questionMark pos =
char '?' *> pure (QuestionMark pos,second (+1) pos)
word :: (Int,Int) -> Parser (Token,(Int,Int))
word pos =
cons <$> takeWhile1 wordChar
where
cons w = (Word pos w,second (+ T.length w) pos)
wordChar c =
not (isSpace c) &&
c /= '"' &&
c /= '.' &&
c /= '?' &&
c /= ',' &&
c /= '\''
genitive :: (Int, Int) -> Parser (Maybe (Token,(Int, Int)))
genitive pos =
optional go
where
go =
do _ <- char '\''
ms <- peekChar
case ms of
Just 's' -> anyChar *> pure (Genitive pos True,second (+1) pos)
_ -> pure (Genitive pos False,second (+1) pos)
manyWithPos :: (Monad m, Alternative m)
=> ((t, t1) -> m (a, (t, t1))) -> ((t, t1) -> m (Maybe (a, (t, t1))))
-> (t, t1) -> m ([a], (t, t1))
manyWithPos p p' pos =
do r <- fmap (first Just) (p pos) <|> pure (Nothing,pos)
case r of
(Nothing,_) ->
return ([],pos)
(Just x,newpos@(!_,!_)) ->
do r' <- p' newpos
case r' of
Nothing ->
do (xs,finalpos) <- manyWithPos p p' newpos
return (x:xs,finalpos)
Just (y,newpos') ->
do (xs,finalpos) <- manyWithPos p p' newpos'
return (x:y:xs,finalpos)
spaces :: (Int,Int) -> Parser (Int,Int)
spaces (sline,scol) =
go sline scol
where
go line col =
do c <- peekChar
case c of
Just '\n' -> anyChar *> go (line+1) 0
Just ' ' -> anyChar *> go line (col+1)
Just '\t' -> anyChar *> go line (col+4)
_ -> return (line,col)