-- -- Copyright (c) 2013-2023 Nicola Bonelli -- -- This program is free software; you can redistribute it and/or modify -- it under the terms of the GNU General Public License as published by -- the Free Software Foundation; either version 2 of the License, or -- (at your option) any later version. -- -- This program is distributed in the hope that it will be useful, -- but WITHOUT ANY WARRANTY; without even the implied warranty of -- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -- GNU General Public License for more details. -- -- You should have received a copy of the GNU General Public License -- along with this program; if not, write to the Free Software -- Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. -- module CGrep.Parser.Chunk ( parseChunks , Chunk(..) , MatchLine(..) , ChunkType , pattern ChunkIdentifier , pattern ChunkKeyword , pattern ChunkDigit , pattern ChunkBracket , pattern ChunkString , pattern ChunkNativeType , pattern ChunkOperator , pattern ChunkUnspec ) where import CGrep.Parser.Char ( isDigit, isSpace, isCharNumber, isAlphaNum_, isAlpha_, isBracket', isCharNumber, isBracket', isAlpha_, isAlphaNum_ ) import CGrep.Types ( Text8, Offset ) import Data.List (genericLength) import CGrep.FileTypeMap ( FileTypeInfo(..) ) import qualified Data.ByteString.Char8 as C import qualified Data.ByteString.Lazy as LB import qualified Data.ByteString.Internal as BI import qualified ByteString.StrictBuilder as B import Data.MonoTraversable ( MonoFoldable(oforM_) ) import Data.STRef ( STRef, newSTRef, writeSTRef, readSTRef ) import Control.Monad.ST ( ST, runST ) import qualified Data.Sequence as S import Data.Sequence ((|>), Seq((:<|), (:|>), Empty)) import Data.Maybe ( fromMaybe ) import Data.Word ( Word8 ) newtype ChunkType = ChunkType {unChunkType :: Word8} deriving newtype (Eq, Ord) instance Show ChunkType where show ChunkUnspec = "*" show ChunkIdentifier = "identifier" show ChunkKeyword = "keyword" show ChunkDigit = "digit" show ChunkBracket = "bracket" show ChunkOperator = "operator" show ChunkString = "string" show ChunkNativeType = "native-type" {-# INLINE show #-} pattern ChunkUnspec :: ChunkType pattern ChunkUnspec = ChunkType 0 pattern ChunkIdentifier :: ChunkType pattern ChunkIdentifier = ChunkType 1 pattern ChunkKeyword :: ChunkType pattern ChunkKeyword = ChunkType 2 pattern ChunkDigit :: ChunkType pattern ChunkDigit = ChunkType 3 pattern ChunkBracket :: ChunkType pattern ChunkBracket = ChunkType 4 pattern ChunkOperator :: ChunkType pattern ChunkOperator = ChunkType 5 pattern ChunkString :: ChunkType pattern ChunkString = ChunkType 6 pattern ChunkNativeType :: ChunkType pattern ChunkNativeType = ChunkType 7 {-# COMPLETE ChunkIdentifier, ChunkKeyword, ChunkDigit, ChunkBracket, ChunkOperator, ChunkString, ChunkNativeType, ChunkUnspec #-} data Chunk = Chunk { cTyp :: {-# UNPACK #-} !ChunkType , cToken :: {-# UNPACK #-} !Text8 , cOffset :: {-# UNPACK #-} !Offset } deriving stock (Eq, Show, Ord) data MatchLine = MatchLine { lOffset :: {-# UNPACK #-} !Offset, lChunks :: [Chunk] } deriving stock (Eq, Show) newtype ChunkState = ChunkState { unChunkState :: Word8 } deriving newtype (Eq, Ord) instance Show ChunkState where show StateSpace = "space" show StateAlpha = "alpha" show StateDigit = "digit" show StateBracket = "bracket" show StateOther = "other" {-# INLINE show #-} pattern StateSpace :: ChunkState pattern StateSpace = ChunkState 0 pattern StateAlpha :: ChunkState pattern StateAlpha = ChunkState 1 pattern StateDigit :: ChunkState pattern StateDigit = ChunkState 2 pattern StateBracket :: ChunkState pattern StateBracket = ChunkState 3 pattern StateOther :: ChunkState pattern StateOther = ChunkState 4 {-# COMPLETE StateSpace, StateAlpha, StateDigit, StateBracket, StateOther #-} (<~) :: STRef s a -> a -> ST s () ref <~ !x = writeSTRef ref x {-# INLINE (<~) #-} {-# INLINE parseChunks #-} parseChunks :: Maybe FileTypeInfo -> Text8 -> S.Seq Chunk parseChunks l t = runST $ case l >>= \FileTypeInfo {..} -> ftIdentifierChars of Just (isAlpha1, isAlphaN) -> parseChunks' isAlpha_ isAlphaNum_ t _ -> parseChunks' isAlpha_ isAlphaNum_ t where parseChunks' :: (Char->Bool) -> (Char -> Bool) -> C.ByteString -> ST s (S.Seq Chunk) parseChunks' isAlpha1 isAlphaN txt = do stateR <- newSTRef StateSpace offR <- newSTRef 0 accR <- newSTRef (mempty :: B.Builder) tokensR <- newSTRef S.empty oforM_ txt $ \w -> do let x = BI.w2c w state <- readSTRef stateR off <- readSTRef offR acc <- readSTRef accR tokens <- readSTRef tokensR case state of StateSpace -> if | isSpace x -> do stateR <~ StateSpace ; accR <~ mempty | isAlpha1 x -> do stateR <~ StateAlpha ; accR <~ B.asciiChar x | isDigit x -> do stateR <~ StateDigit ; accR <~ B.asciiChar x | isBracket' x -> do stateR <~ StateBracket ; accR <~ B.asciiChar x | otherwise -> do stateR <~ StateOther ; accR <~ B.asciiChar x StateAlpha -> if | isAlphaN x -> do stateR <~ StateAlpha ; accR <~ (acc <> B.asciiChar x) | isSpace x -> do stateR <~ StateSpace ; accR <~ mempty ; tokensR <~ (tokens |> toChunk off acc) | isBracket' x -> do stateR <~ StateBracket ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | otherwise -> do stateR <~ StateOther ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) StateDigit -> if | isCharNumber x -> do stateR <~ StateDigit ; accR <~ (acc <> B.asciiChar x) | isSpace x -> do stateR <~ StateSpace ; accR <~ mempty ; tokensR <~ (tokens |> toChunk off acc) | isAlpha1 x -> do stateR <~ StateAlpha ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | isBracket' x -> do stateR <~ StateBracket ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | otherwise -> do stateR <~ StateOther ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) StateBracket -> if | isSpace x -> do stateR <~ StateSpace ; accR <~ mempty ; tokensR <~ (tokens |> toChunk off acc) | isAlpha1 x -> do stateR <~ StateAlpha ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | isDigit x -> do stateR <~ StateDigit ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | isBracket' x -> do stateR <~ StateBracket ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | otherwise -> do stateR <~ StateOther ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) StateOther -> if | isSpace x -> do stateR <~ StateSpace ; accR <~ mempty ; tokensR <~ (tokens |> toChunk off acc) | isAlpha1 x -> do stateR <~ StateAlpha ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | isDigit x -> if B.builderBytes acc == "." then do stateR <~ StateDigit ; accR <~ (acc <> B.asciiChar x) else do stateR <~ StateDigit ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | isBracket' x -> do stateR <~ StateBracket ; accR <~ B.asciiChar x ; tokensR <~ (tokens |> toChunk off acc) | otherwise -> do stateR <~ StateOther ; accR <~ (acc <> B.asciiChar x) offR <~ (off+1) lastAcc <- readSTRef accR tokens <- readSTRef tokensR if B.builderLength lastAcc == 0 then return tokens else do state <- readSTRef stateR off <- readSTRef offR return $ tokens |> toChunk off lastAcc toChunk :: Offset -> B.Builder -> Chunk toChunk off b = Chunk ChunkUnspec str (off - fromIntegral (B.builderLength b)) where str = B.builderBytes b {-# INLINE toChunk #-}