{- Copyright (C) 2009 Andrejs Sisojevs All rights reserved. For license and copyright information, see the file COPYRIGHT -} -------------------------------------------------------------------------- -------------------------------------------------------------------------- {-# LANGUAGE BangPatterns #-} module Text.PCLT.Parser.AdvancedSepBy where import Control.Monad.State.Strict import qualified Data.ByteString.Lazy.UTF8.Unified as Lazy (ByteString) import qualified Data.ByteString.Lazy.UTF8.Unified as B hiding (ByteString) import Data.MyHelpers import Data.Int import Data.List import qualified Data.Map as M import Data.Map (Map, (!)) import System.IO.Unsafe import Text.PCLT.Parser.ParserInternals -- for cases like "Hi, $$ParamKeyName1$$!\nHow was your day, $$ParamKeyName2$$? ##CompositeName1## ##CompositeName2##" type SeparationMarkerIdx = Int data SeparatedSectorMarker = Beginning_SSM | EOF_SSM | InnerMarker_SSM SeparationMarkerIdx | Error_SSM StandartMarkingStrategyError -- i know, a bad style to make it dependant of a very specific marking strategy... no time for any better now deriving (Eq, Ord) type MarkedChunkLength = Int64 manyTill_EOForEitherOf :: Parser Char -> [Parser Lazy.ByteString] -> Parser (SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength) manyTill_EOForEitherOf !p !ends_list = Parser $! manyTill_EOForEitherOf' B.empty 0 where alternatives_set = map (\ (end_p, i) -> end_p >> return (InnerMarker_SSM i) ) (zip ends_list [1..]) --------------------------- manyTill_EOForEitherOf' :: Lazy.ByteString -> MarkedChunkLength -> State ParserState (ParserResult (SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)) manyTill_EOForEitherOf' !soFar !len = do saved <- get result <- runParser $ choice alternatives_set let returnResult marker = return $ Success (marker, B.pack $ B.unpack $ B.reverse soFar, len) -- any better way to compact many chunks into one?.. if_bad = do put saved ps_c <- runParser p case ps_c of Success c -> manyTill_EOForEitherOf' (B.cons c soFar) (len + 1) -- would use cons' if had time to ensure, that it would keeps tail-recursion IllegalInput -> returnResult $ Error_SSM $ UnallowedCharacter_SMSE (B.head $ pstInput saved) ReachedEOF -> returnResult EOF_SSM case result of Success a -> returnResult a IllegalInput -> if_bad ReachedEOF -> if_bad type New_Active_Sep_SSM = SeparatedSectorMarker type Previous_Active_Sep_SSM = SeparatedSectorMarker type Current_Sep_SSM = SeparatedSectorMarker type Current_Chunk_SSM = SeparatedSectorMarker type SectorMarkingStrategy = (Previous_Active_Sep_SSM, Current_Sep_SSM) -> (Current_Chunk_SSM, New_Active_Sep_SSM) sepBySome :: Parser Char -> SectorMarkingStrategy -> [Parser Lazy.ByteString] -> Parser [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] sepBySome !p !sectorMarkingStrategy !sep_str_list = _getMarkedChunks1 [] Beginning_SSM where _getMarkedChunks1 :: [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] -> SeparatedSectorMarker -> Parser [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] _getMarkedChunks1 chunks_so_far prev_active_sep_marker = do (cur_sep_marker, x, len) <- manyTill_EOForEitherOf p sep_str_list let (cur_chunk_marker, new_active_sep_marker) = sectorMarkingStrategy (prev_active_sep_marker, cur_sep_marker) new_chunks_set = (cur_chunk_marker, x, len) : chunks_so_far case new_active_sep_marker of EOF_SSM -> return $ reverse new_chunks_set _ -> _getMarkedChunks1 new_chunks_set new_active_sep_marker data StandartMarkingStrategyError = InputAfterEOF_SMSE | BeginningMNotInTheBeginning_SMSE | OverlappingMarkedChunks_SMSE | UnsupportedMarkersSequence_SMSE SeparatedSectorMarker SeparatedSectorMarker | OpenMarkerAtEOF_SMSE SeparationMarkerIdx | UnallowedCharacter_SMSE Char deriving (Eq, Ord) standardMarkingStrategy :: SectorMarkingStrategy standardMarkingStrategy (prev_active_sep_marker, cur_sep_marker) = case (prev_active_sep_marker, cur_sep_marker) of ( Beginning_SSM, EOF_SSM) -> (InnerMarker_SSM 0, EOF_SSM) ( EOF_SSM, _) -> (Error_SSM InputAfterEOF_SMSE, EOF_SSM) ( Beginning_SSM, InnerMarker_SSM i) -> (InnerMarker_SSM 0, InnerMarker_SSM i) (InnerMarker_SSM i, EOF_SSM) -> case i == 0 of True -> (InnerMarker_SSM 0, EOF_SSM) False -> (Error_SSM $ OpenMarkerAtEOF_SMSE i, EOF_SSM) (InnerMarker_SSM i, Beginning_SSM) -> (InnerMarker_SSM i, Error_SSM BeginningMNotInTheBeginning_SMSE) (InnerMarker_SSM i, InnerMarker_SSM j) -> case i == j of True -> (InnerMarker_SSM i, InnerMarker_SSM 0) False -> case i == 0 of True -> (InnerMarker_SSM i, InnerMarker_SSM j) False -> (Error_SSM OverlappingMarkedChunks_SMSE, InnerMarker_SSM j) _ -> let err_m = Error_SSM $ UnsupportedMarkersSequence_SMSE prev_active_sep_marker cur_sep_marker in (err_m, err_m) standardMarkingStrategyFix_StripEmptyChunks :: [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] -> [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] standardMarkingStrategyFix_StripEmptyChunks = foldr foldr_f [] where foldr_f marked_chunk accum = case marked_chunk == (InnerMarker_SSM 0, B.empty, 0) of True -> accum False -> marked_chunk : accum type ChunkIndexInList_ = Int retrieveNonPlainMarkingsMap :: [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] -> M.Map SeparatedSectorMarker [(Lazy.ByteString, ChunkIndexInList_)] retrieveNonPlainMarkingsMap marked_seq = fst $ foldl foldr_f (M.empty, 0) marked_seq where foldr_f (accum, ind) marked_chunk = case marked_chunk of (InnerMarker_SSM i, str, _) -> case i == 0 of False -> ( M.unionWith (++) accum ( M.singleton (InnerMarker_SSM i) [(str, ind)]) , ind + 1 ) True -> (accum, ind + 1) _ -> (accum, ind + 1) getListOfMarkings :: M.Map SeparatedSectorMarker [(Lazy.ByteString, ChunkIndexInList_)] -> Int -> [(Lazy.ByteString, ChunkIndexInList_)] getListOfMarkings m i = case M.lookup (InnerMarker_SSM i) m of Just l -> l Nothing -> [] retrieveErrorsMarkingsList :: [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] -> [(SeparatedSectorMarker, Lazy.ByteString, ChunkIndexInList_)] retrieveErrorsMarkingsList marked_seq = fst $ foldl foldr_f ([], 0) marked_seq where foldr_f (accum, ind) marked_chunk = case marked_chunk of (Error_SSM err_msg, str, _) -> ((Error_SSM err_msg, str, ind): accum, ind + 1) _ -> (accum, ind + 1) {- -- \n == "\\n" ==> "\x0d\x0a" or whatever nl is -- \\ == "\\\\" ==> "\\" = \ translateEscapes :: Lazy.ByteString -> Lazy.ByteString -> (Lazy.ByteString, MarkedChunkLength) translateEscapes nl s = (B.concat [new_s, B.pack last_state], chars_less) where nl_len = B.length nl - 2 (new_s, last_state, chars_less) = B.foldl (\ (accum, state, cl) ch -> case state of [] -> case ch of '\\' -> ( accum, '\\':[], cl) _ -> (B.cons' ch accum, [], cl) '\\':[] -> case ch of '\\' -> (B.cons' '\\' accum , [], cl + 1) 'n' -> (B.concat [nl, accum], [], cl + nl_len) ) (B.empty, [], 0) s translateEscapes_inTheListOfMarkings :: Lazy.ByteString -> [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] -> [(SeparatedSectorMarker, Lazy.ByteString, MarkedChunkLength)] translateEscapes_inTheListOfMarkings nl marked_seq = map (\ chu@(m, s, l)-> case m of InnerMarker_SSM i -> case i == 0 of True -> let (n_s, l_mod) = translateEscapes nl s in (m, n_s, l - l_mod) False -> chu _ -> chu ) marked_seq -} ------------------------------------------------------------ ------------------------------------------------------------ insertInsteadOf_inLBS :: (Lazy.ByteString, Lazy.ByteString) -> Lazy.ByteString -> Lazy.ByteString insertInsteadOf_inLBS (old_sep, new_sep) lbs = case parse (sepBySome anyChar standardMarkingStrategy [stringLBS old_sep]) lbs of ( IllegalInput , _ ) -> error "This should have never happened! Unexpected error in 'insertInsteadOf_inLBS': parse returned unexpected result!" ( ReachedEOF , _ ) -> error "This should have never happened! Unexpected error in 'insertInsteadOf_inLBS': parse returned unexpected result!" ( Success chunks_list, _ ) -> B.concat $ intersperse new_sep $ map snd3 chunks_list ------------------------------------------------------------ ------------------------------------------------------------ instance Show SeparatedSectorMarker where show sms = _prefix ++ _body ++ "." where _prefix = "Sector separation marker: " _body = case sms of Beginning_SSM -> "input beginning" EOF_SSM -> "input ending" InnerMarker_SSM sm_idx -> "marker #" ++ show sm_idx Error_SSM smse -> "error '" ++ show smse ++ "'" instance Show StandartMarkingStrategyError where show smse = _prefix ++ _body where _prefix = "An error occured when parsing a marked text. Marking failed: " _body = case smse of InputAfterEOF_SMSE -> "input is not allowed after EOF." BeginningMNotInTheBeginning_SMSE -> "beginning is allowed to occur only as the first input marker." OverlappingMarkedChunks_SMSE -> "the marking strategy doesn't allow overlapping marked text chunks." UnsupportedMarkersSequence_SMSE prev_active_sm cur_sm -> "the marking strategy doesn't support markers sequence (active previous separator marker -> current separator marker): " ++ show prev_active_sm ++ " -> " ++ show cur_sm ++ " ." OpenMarkerAtEOF_SMSE sm_idx -> "text ends with unclosed chunk of nonplain (marker index: " ++ show sm_idx ++ ") text."