module Text.Tokenify (
tokenize,
matchHead,
module DSL,
module Types,
module CSeq
) where
import Prelude hiding (head)
import qualified Text.Tokenify.Response as Response
import qualified Text.Tokenify.CharSeq as CSeq
import qualified Text.Tokenify.Regex as Regex
import qualified Text.Tokenify.Types as Types
import qualified Text.Tokenify.DSL as DSL
import Text.Tokenify.Response (Response)
import Text.Tokenify.CharSeq (CharSeq)
import Text.Tokenify.Regex (Regex)
import Text.Tokenify.Types
import qualified Data.Monoid as Monoid
import qualified Data.Sequence as Seq
import Data.Sequence ((|>), Seq)
import Data.Monoid ((<>))
import Control.Applicative ((<|>))
tokenize :: (CharSeq s) => Tokenizer s a -> s -> Either String (Seq a)
tokenize tokenizers input = impl tokenizers Seq.empty 0 input where
inputInfo :: [Int]
inputInfo = CSeq.lineInfo input
getPos :: Int -> (Int, Int)
getPos offset = getPosImpl 0 1 inputInfo where
getPosImpl position row (lineLength:lines)
| offset >= position && offset <= position+lineLength
= (row, offset position)
| offset > position+lineLength
= getPosImpl (position + lineLength) (row + 1) lines
impl _ acc _ input | CSeq.null input = Right acc
impl [] acc position _
= Left ("failed to match at" ++ show (getPos position))
impl ((rx, rs):ts) acc position input
= case matchHead rx input of
Nothing -> impl ts acc position input
Just (matched, rest, moved) ->
let position' = position+moved
coordants = getPos position in case rs of
Response.Error -> Left ("matched error at " ++ show position)
Response.Ignore ->
impl tokenizers acc position' rest
Response.Display p ->
impl tokenizers (acc |> p coordants) position' rest
Response.Process p ->
impl tokenizers (acc |> p matched coordants) position' rest
matchHead :: (CharSeq s) => Regex s -> s -> Maybe (s, s, Int)
matchHead regex input = case regex of
Regex.NoPass -> Nothing
Regex.Char c -> CSeq.head input >>=
\head -> if head == c
then return (CSeq.singleton head, CSeq.tail input, 1)
else Nothing
Regex.String s -> prefixTail s input >>=
\(diff, dSize) -> return (s, diff, dSize)
Regex.Alt l r ->
matchHead l input <|> matchHead r input
Regex.Range s e -> do
head <- CSeq.head input
if head >= s && e >= head
then return (CSeq.singleton head, CSeq.tail input, 1)
else Nothing
Regex.Append l r -> do
(a, cont, ai) <- matchHead l input
(b, cont, bi) <- matchHead r cont
return (a <> b, cont, ai + bi)
Regex.Option o -> case matchHead o input of
Nothing -> return (Monoid.mempty, input, 0)
anythingElse -> anythingElse
Regex.Repeat r -> impl Monoid.mempty input 0 where
impl acc cont@(matchHead r -> Nothing) i = Just (acc, cont, i)
impl a (matchHead r -> Just (b, cont, ib)) ia =
impl (a <> b) cont (ia + ib)
Regex.Repeat1 r -> do
(a, cont, ai) <- matchHead r input
(b, cont, bi) <- matchHead (Regex.Repeat r) input
return (a <> b, cont, ai + bi)
prefixTail :: (CharSeq s) => s -> s -> Maybe (s, Int)
prefixTail prefix input = trySplit prefix input 0 where
trySplit pre dec index
| CSeq.null pre = Just (dec, index)
| not (CSeq.null dec) && (CSeq.head dec == CSeq.head pre)
= trySplit (CSeq.tail pre) (CSeq.tail dec) (index + 1)
| otherwise = Nothing