-- ParseRegexStr.hs -- -- Author: Yoshikuni Jujo -- module Hidden.ParseRegexStr ( RegexAction(..) , parseRegexStr ) where import Hidden.RegexPRTypes ( RegexAction(..), RegexSrcParser, runRegexSrcParser, getBR, modifyBR, setMode, setModes, getModes, isModeI, isModeM, isModeX ) import Hidden.ParseLib ( runParse, spot, token, tokens, mzero, mplus, still, parseNot, endOfInput, MonadParse, MonadPlus, list, greedyNeList, optional ) import Hidden.Tools ( isSymbol, ignoreCase, skipRet, (>..>), ifM, applyIf, (&&&), headOrErr, modifyFst ) import Data.Char ( isAlphaNum, isDigit, isSpace ) import Data.Ix ( inRange ) import Hidden.SrcRegActList( selfTest, oneCharList, backSlashesList, plusesList, parensesList, charClassList ) parseRegexStr :: String -> [RegexAction] parseRegexStr = fst . fst . headOrErr "parse error: regex is uncorrect" . (runParse $ runRegexSrcParser parseRegexStrParser) . (,) [] parseRegexStrParser, parseTokensOr, parseTokens :: RegexSrcParser [RegexAction] parseRegexStrParser = parseTokensOr >>= endOfInput parseTokensOr = parseTokens `mplus` do { ra1 <- parseTokens; token '|'; ra2 <- parseTokensOr; return $ [ RegexOr ra1 ra2 ] } parseTokens = list parseTokenPlus parseTokenPlus, parseToken :: RegexSrcParser RegexAction parseTokenPlus = do ra <- parseToken plus <- parsePluses plusesList `mplus` parseQuantifier return $ plus ra parseQuantifier :: RegexSrcParser (RegexAction -> RegexAction) parseQuantifier = do { token '{'; mn <- list $ spot isDigit; mx <- do { cma <- optional $ token ','; case cma of "" -> return Nothing _ -> list (spot isDigit) >>= return . Just }; token '}'; nd <- optional (token '?') >>= return . null; return $ (if nd then Repeat else RepeatNotGreedy) (read mn) $ case mx of Nothing -> Just $ read mn Just "" -> Nothing Just n -> Just $ read n } parseToken = ifM isModeX parseTokenX mzero `mplus` ( isModeI >>= \ic -> spot selfTest >>= return . Select . applyIf ic ignoreCase . (==) ) `mplus` ( flip (ifM isModeM) mzero $ token '.' >> return (Select $ const True) ) `mplus` ( token '\\' >> spot isSymbol >>= return . Select . (==) ) `mplus` ( token '\\' >> optional (token '{') >> greedyNeList (spot isDigit) >>= skipRet (optional $token '}') >>= return . BackReference . read ) `mplus` ( token '[' >> optional (token '^') >>= return . not . null >>= \isNot -> parseCharList >>= skipRet (token ']') >>= return . Select . applyIf isNot (not.) ) `mplus` ( getBR >>= \i -> token '(' >> modifyBR (+1) >> parseTokensOr >>= skipRet (token ')') >>= return . Note i ) `mplus` ( tokens "(?" >> list parseMode >>= mapM_ (uncurry setMode) >> token ')' >> return NopRegex ) `mplus` ( getModes >>= \preModes -> tokens "(?" >> list parseMode >>= mapM_ (uncurry setMode) >> token ':' >> parseTokensOr >>= skipRet (setModes preModes >> token ')') >>= return . Parens ) `mplus` parseOneChar oneCharList `mplus` parseBackSlashes backSlashesList `mplus` parseParenses parensesList `mplus` ( tokens "(?#" >> list (spot (/=')')) >>= skipRet (token ')') >>= return . Comment ) parseMode :: RegexSrcParser (Char, Bool) parseMode = optional (token '-') >..> spot (flip elem "imx") >>= return . uncurry (flip (,)) . modifyFst null parseTokenX :: RegexSrcParser RegexAction parseTokenX = ( spot isSpace >> return NopRegex ) `mplus` ( token '#' >> list (spot (/='\n')) >>= skipRet (token '\n' `mplus` endOfInput '\n') >>= return . Comment ) parsePluses :: [ (String, RegexAction -> RegexAction) ] -> RegexSrcParser (RegexAction -> RegexAction) parsePluses = concatMapParse (\(t, act) -> tokens t >> return act) parseOneChar :: [ (Char, RegexAction) ] -> RegexSrcParser RegexAction parseOneChar = concatMapParse (\(t, act) -> token t >> return act) parseBackSlashes :: [ (Char, RegexAction) ] -> RegexSrcParser RegexAction parseBackSlashes = concatMapParse (\(t, act) -> tokens ['\\', t] >> return act) parseParenses :: [ (String, [RegexAction] -> RegexAction) ] -> RegexSrcParser RegexAction parseParenses = concatMapParse ( \(t, act) -> tokens ('(':t) >> parseTokensOr >>= skipRet (token ')') >>= return . act ) parseCharList :: RegexSrcParser (Char -> Bool) parseCharList = do modei <- isModeI cl1 <- parseOne `mplus` (concatMapParse ((>>= return . (==)) . token) "-]") cl2 <- list $ parseOne `mplus` (token '^' >>= return . (==)) return $ applyIf modei ignoreCase $ or . zipWith ($) (cl1 : cl2) . repeat where parseOne = (parseChar >>= return . (==)) `mplus` parseCharArea `mplus` parseCharClass parseChar = ( spot isAlphaNum ) `mplus` ( token '\\' >> spot isSymbol ) `mplus` ( spot $ selfTest &&& flip notElem "-]" ) `mplus` ( token '+' ) `mplus` ( token '[' >>= skipRet (still $ parseNot () $ token ':') ) parseCharArea = (parseChar >>= skipRet (token '-')) >..> parseChar >>= return . inRange parseCharClass = concatMapParse (\(s, p) -> tokens ("[:"++s++":]") >> return p) charClassList concatMapParse :: MonadPlus m => (b -> m a) -> [b] -> m a concatMapParse f = foldr mplus mzero . map f