{-# LANGUAGE AllowAmbiguousTypes #-} {-# LANGUAGE DataKinds #-} -- -- Copyright (c) 2013-2025 Nicola Bonelli -- -- This program is free software; you can redistribute it and/or modify -- it under the terms of the GNU General Public License as published by -- the Free Software Foundation; either version 2 of the License, or -- (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; if not, write to the Free Software -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -- {-# LANGUAGE KindSignatures #-} module CGrep.Parser.Chunk ( parseChunks, Chunk (..), cOffset, MatchLine (..), ChunkType, pattern ChunkIdentifier, pattern ChunkKeyword, pattern ChunkDigit, pattern ChunkBracket, pattern ChunkString, pattern ChunkNativeType, pattern ChunkOperator, pattern ChunkUnspec, ) where import CGrep.FileTypeMap (CharSet (..), FileTypeInfo (..), IsCharSet (..)) import Control.Monad.ST (ST, runST) import Data.STRef (STRef, newSTRef, readSTRef, writeSTRef) import Data.Sequence ((|>)) import qualified Data.Sequence as S import qualified Data.Text as T import qualified Data.Text.Unsafe as TU import Data.Word (Word8) import CGrep.Parser.Char (isBracket', isCharNumber, isDigit, isSpace) import CGrep.Text (iterM, textOffsetWord8) newtype ChunkType = ChunkType {_unChunkType :: Word8} deriving newtype (Eq, Ord) instance Show ChunkType where show ChunkUnspec = "*" show ChunkIdentifier = "identifier" show ChunkKeyword = "keyword" show ChunkDigit = "digit" show ChunkBracket = "bracket" show ChunkOperator = "operator" show ChunkString = "string" show ChunkNativeType = "native-type" {-# INLINE show #-} pattern ChunkUnspec :: ChunkType pattern ChunkUnspec = ChunkType 0 pattern ChunkIdentifier :: ChunkType pattern ChunkIdentifier = ChunkType 1 pattern ChunkKeyword :: ChunkType pattern ChunkKeyword = ChunkType 2 pattern ChunkDigit :: ChunkType pattern ChunkDigit = ChunkType 3 pattern ChunkBracket :: ChunkType pattern ChunkBracket = ChunkType 4 pattern ChunkOperator :: ChunkType pattern ChunkOperator = ChunkType 5 pattern ChunkString :: ChunkType pattern ChunkString = ChunkType 6 pattern ChunkNativeType :: ChunkType pattern ChunkNativeType = ChunkType 7 {-# COMPLETE ChunkIdentifier, ChunkKeyword, ChunkDigit, ChunkBracket, ChunkOperator, ChunkString, ChunkNativeType, ChunkUnspec #-} data Chunk = Chunk { cTyp :: {-# UNPACK #-} !ChunkType , cToken :: {-# UNPACK #-} !T.Text } deriving stock (Eq, Show, Ord) cOffset :: Chunk -> Int cOffset (Chunk _ t) = textOffsetWord8 t {-# INLINE cOffset #-} data MatchLine = MatchLine { mlOffset :: {-# UNPACK #-} !Int , mlChunks :: [Chunk] } deriving stock (Eq, Show) newtype ChunkState = ChunkState {_unChunkState :: Word8} deriving newtype (Eq, Ord) instance Show ChunkState where show StateSpace = "space" show StateAlpha = "alpha" show StateDigit = "digit" show StateBracket = "bracket" show StateOther = "other" {-# INLINE show #-} pattern StateSpace :: ChunkState pattern StateSpace = ChunkState 0 pattern StateAlpha :: ChunkState pattern StateAlpha = ChunkState 1 pattern StateDigit :: ChunkState pattern StateDigit = ChunkState 2 pattern StateBracket :: ChunkState pattern StateBracket = ChunkState 3 pattern StateOther :: ChunkState pattern StateOther = ChunkState 4 {-# COMPLETE StateSpace, StateAlpha, StateDigit, StateBracket, StateOther #-} (<~) :: STRef s a -> a -> ST s () ref <~ !x = writeSTRef ref x {-# INLINE (<~) #-} toChunk :: T.Text -> Maybe Int -> Int -> Chunk toChunk _ Nothing _ = error "CGrep.Parser.Chunk.toChunk: Nothing (Internal Error)" toChunk text (Just beg) cur = let chunk = TU.takeWord8 (cur - beg) (TU.dropWord8 beg text) in Chunk ChunkUnspec chunk {-# INLINE toChunk #-} {-# COMPLETE StateSpace, StateAlpha, StateDigit, StateBracket, StateOther #-} {-# INLINE parseChunks #-} parseChunks :: Maybe FileTypeInfo -> T.Text -> S.Seq Chunk parseChunks l txt = runST $ case l >>= \FileTypeInfo{..} -> ftIdentCharSet of Just (None, None) -> parseChunks' @'None @'None txt Just (Alpha_, AlphaNum_') -> parseChunks' @'Alpha_ @'AlphaNum_' txt Just (Alpha_, AlphaNum_) -> parseChunks' @'Alpha_ @'AlphaNum_ txt Just (Alpha, AlphaNum_) -> parseChunks' @'Alpha @'AlphaNum_ txt Just (Alpha_, AlphaNumDash_) -> parseChunks' @'Alpha_ @'AlphaNumDash_ txt Just (AlphaDash_, AlphaNumDash_) -> parseChunks' @'AlphaDash_ @'AlphaNumDash_ txt Just (ClojureIdentStart, ClojureIdentCont) -> parseChunks' @'ClojureIdentStart @'ClojureIdentCont txt Just (CSharpIdentStart, CSharpIdentCont) -> parseChunks' @'CSharpIdentStart @'CSharpIdentCont txt Just (HtmlIdentStart, HtmlIdentCont) -> parseChunks' @'HtmlIdentStart @'HtmlIdentCont txt Just (JavaIdentStart, JavaIdentCont) -> parseChunks' @'JavaIdentStart @'JavaIdentCont txt Just (JuliaIdentStart, JuliaIdentCont) -> parseChunks' @'JuliaIdentStart @'JuliaIdentCont txt Just (ListIdent, ListIdent) -> parseChunks' @'ListIdent @'ListIdent txt Just (Unicode_, UnicodeNum_) -> parseChunks' @'Unicode_ @'UnicodeNum_ txt Just (UnicodeDollar_, UnicodeNumDollar_) -> parseChunks' @'UnicodeDollar_ @'UnicodeNumDollar_ txt Just (Unicode_, UnicodeNum_') -> parseChunks' @'Unicode_ @'UnicodeNum_' txt Just (Unicode_, UnicodeNumDollar_) -> parseChunks' @'Unicode_ @'UnicodeNumDollar_ txt Just (UnicodeXIDStart_, UnicodeNumXIDCont_) -> parseChunks' @'UnicodeXIDStart_ @'UnicodeNumXIDCont_ txt Just (AgdaIdent, AgdaIdent) -> parseChunks' @'AgdaIdent @'AgdaIdent txt Nothing -> parseChunks' @'None @'None txt charsets -> error $ "CGrep: unsupported CharSet combination: " <> show charsets parseChunks' :: forall (cs1 :: CharSet) (csN :: CharSet) s. (IsCharSet cs1, IsCharSet csN) => T.Text -> ST s (S.Seq Chunk) parseChunks' txt = do stateR <- newSTRef StateSpace accR <- newSTRef Nothing tokensR <- newSTRef S.empty iterM txt $ \(# x, offset, _delta #) -> do state <- readSTRef stateR acc <- readSTRef accR tokens <- readSTRef tokensR case state of StateSpace -> if | isSpace x -> do stateR <~ StateSpace; accR <~ Nothing | isValidChar @cs1 x -> do stateR <~ StateAlpha; accR <~ Just offset | isDigit x -> do stateR <~ StateDigit; accR <~ Just offset | isBracket' x -> do stateR <~ StateBracket; accR <~ Just offset | otherwise -> do stateR <~ StateOther; accR <~ Just offset StateAlpha -> if | isValidChar @csN x -> do stateR <~ StateAlpha | isSpace x -> do stateR <~ StateSpace; accR <~ Nothing; tokensR <~ (tokens |> toChunk txt acc offset) | isBracket' x -> do stateR <~ StateBracket; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | otherwise -> do stateR <~ StateOther; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) StateDigit -> if | isCharNumber x -> do stateR <~ StateDigit | isSpace x -> do stateR <~ StateSpace; accR <~ Nothing; tokensR <~ (tokens |> toChunk txt acc offset) | isValidChar @cs1 x -> do stateR <~ StateAlpha; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | isBracket' x -> do stateR <~ StateBracket; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | otherwise -> do stateR <~ StateOther; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) StateBracket -> if | isSpace x -> do stateR <~ StateSpace; accR <~ Nothing; tokensR <~ (tokens |> toChunk txt acc offset) | isValidChar @cs1 x -> do stateR <~ StateAlpha; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | isDigit x -> do stateR <~ StateDigit; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | isBracket' x -> do stateR <~ StateBracket; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | otherwise -> do stateR <~ StateOther; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) StateOther -> if | isSpace x -> do stateR <~ StateSpace; accR <~ Nothing; tokensR <~ (tokens |> toChunk txt acc offset) | isValidChar @cs1 x -> do stateR <~ StateAlpha; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | isDigit x -> if acc == Just (offset - 1) && TU.unsafeHead (TU.dropWord8 (offset - 1) txt) == '.' then do stateR <~ StateDigit else do stateR <~ StateDigit; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | isBracket' x -> do stateR <~ StateBracket; accR <~ Just offset; tokensR <~ (tokens |> toChunk txt acc offset) | otherwise -> do stateR <~ StateOther tokens <- readSTRef tokensR lastAcc <- readSTRef accR case lastAcc of Nothing -> return tokens Just off | off == TU.lengthWord8 txt -> return tokens | otherwise -> return $ tokens |> toChunk txt lastAcc (TU.lengthWord8 txt)