{-# LANGUAGE MultiParamTypeClasses, TypeSynonymInstances, FlexibleInstances, FlexibleContexts #-} {-# OPTIONS_GHC -fno-warn-orphans #-} {- Copyright (C) 2010-2015 Dr. Alistair Ward This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} {- | [@AUTHOR@] Dr. Alistair Ward [@DESCRIPTION@] Creates a /back-end/ implementation, conforming to . [@CAVEATS@] * The underlying polymorphic (in terms of the base-type of the list of input data) 'RegExDot.RegEx.ExtendedRegEx'-engine is never going to be a drop-in replacement for other /regex/-engines, so this standard interface has only been implemented for traditional input data-type ['Char']. * The standard interface requires many Haskell-extensions, but since this implementation is just a thin layer over the underlying portable polymorphic 'RegExDot.RegEx.ExtendedRegEx'-engine, the latter can still be used directly, where any of these extensions are unavailable. -} module RegExChar.RegExOptsChar( -- * Types -- ** Type-synonyms MatchSpan, -- MatchDataSpan, RegExOptsChar, -- * Functions -- exciseNonCapturingTopLevelAlternatives, -- matchUntilFailure, -- toMatchDataSpanList, toZeroIndexedArray, -- ** Operators (=~) -- ** Predicates -- complyStrictlyWithPosix, -- hasNonCapturingTopLevelAlternatives ) where import qualified Data.Array.IArray import qualified Data.Default import qualified RegExChar.ExtendedRegExChar as ExtendedRegExChar import qualified RegExDot.CompilationOptions import qualified RegExDot.ConsumptionBounds import qualified RegExDot.DataSpan import qualified RegExDot.DataSpanTree import qualified RegExDot.ExecutionOptions import qualified RegExDot.RegEx import qualified RegExDot.RegExOpts import qualified RegExDot.Result import qualified RegExDot.Tree import qualified Text.Regex.Base.RegexLike as RegexLike import Text.Regex.Base.Context() -- Instance-declarations. import qualified ToolShed.Data.List import qualified ToolShed.Options infix 4 =~ -- Same as (==). -- | Defines a specific instance of the polymorphic base-type. type RegExOptsChar = RegExDot.RegExOpts.RegExOpts ExtendedRegExChar.ExtendedRegExChar -- | Convenience accessor-function. hasNonCapturingTopLevelAlternatives :: RegExOptsChar -> Bool hasNonCapturingTopLevelAlternatives = ExtendedRegExChar.hasNonCapturingTopLevelAlternatives . RegExDot.RegExOpts.regEx -- | Convenience accessor-function. complyStrictlyWithPosix :: RegExOptsChar -> Bool complyStrictlyWithPosix = RegExDot.CompilationOptions.complyStrictlyWithPosix . RegExDot.RegExOpts.compilationOptions instance RegexLike.RegexOptions RegExOptsChar RegExDot.CompilationOptions.CompilationOptions RegExDot.ExecutionOptions.ExecutionOptions where blankCompOpt = ToolShed.Options.blankValue blankExecOpt = ToolShed.Options.blankValue defaultCompOpt = Data.Default.def defaultExecOpt = Data.Default.def setExecOpts e r = r { RegExDot.RegExOpts.executionOptions = e } getExecOpts = RegExDot.RegExOpts.executionOptions -- Newer versions of this 'Text.Regex.Base.RegexLike.RegexMaker' have additional monadic methods, which can fail on parse-errors. instance RegexLike.RegexMaker RegExOptsChar RegExDot.CompilationOptions.CompilationOptions RegExDot.ExecutionOptions.ExecutionOptions String where makeRegexOpts c e source = RegExDot.RegExOpts.MkRegExOpts { RegExDot.RegExOpts.compilationOptions = c, RegExDot.RegExOpts.executionOptions = e, RegExDot.RegExOpts.regEx = read source } -- | The offset and length of the 'RegExDot.RegEx.InputData' consumed in one 'RegExDot.RegEx.Match'. type MatchSpan = (RegexLike.MatchOffset, RegexLike.MatchLength) {- | * The 'MatchSpan' augmented by the consumed 'ExtendedRegExChar.InputData' to which it refers. * Similar to 'RegExDot.DataSpan.DataSpan'. -} type MatchDataSpan = (ExtendedRegExChar.InputData, MatchSpan) {- | * Convert the 'RegExDot.RegEx.MatchList', into the 'MatchDataSpan's required by "Text.Regex.Base.RegexLike". * Treat the whole 'ExtendedRegExChar.ExtendedRegExChar' as an additional implicit capture-group. * CAVEAT: @RegExDot.DataSpanTree.extractCaptureGroups True@ may return some 'RegExDot.DataSpan.DataSpan's containing the artificial offset @-1@ specified by Posix => don't use this as the basis for any arithmetic. -} toMatchDataSpanList :: Bool -> RegExDot.ConsumptionBounds.DataLength -- ^ The offset to use for zero-length match. -> RegExDot.RegEx.MatchList Char -- ^ The list of matches. -> [MatchDataSpan] toMatchDataSpanList _ offset [] = [RegExDot.DataSpan.empty offset] -- The whole regex matched, but consumed nothing. toMatchDataSpanList strictPosixCompliance _ matchList = RegExDot.DataSpanTree.extractCaptureGroups strictPosixCompliance . return {-to List-monad-} . RegExDot.Tree.Node . return {-to List-monad-} $ RegExDot.DataSpanTree.toTreeList matchList -- | Optionally remove the 2nd element from the list, where it represents the data captured by a groups of top-level 'Alternatives', which weren't explicitly delimited & therefore are non-capturing. exciseNonCapturingTopLevelAlternatives :: RegExOptsChar -> [MatchDataSpan] -> [MatchDataSpan] exciseNonCapturingTopLevelAlternatives regExOptsChar | hasNonCapturingTopLevelAlternatives regExOptsChar = ToolShed.Data.List.excise 1 -- The zeroeth represents the whole regex, the 1st represents the top-level non-capturing group of Alternatives, & the remainder represent explicitly delimited groups. | otherwise = id -- | Convert the specified list, into a zero-indexed array. toZeroIndexedArray :: [e] -> Data.Array.IArray.Array Int e toZeroIndexedArray l = Data.Array.IArray.listArray (0, pred $ length l) l {- | * Repeatedly apply the 'RegExOptsChar' to the 'ExtendedRegExChar.InputData', forwarding unmatched input data to the next match-attempt, until it's all been consumed. * The offsets, from the start of the input data, of all matches after the first, are shifted to account for input data already consumed by previous matches. -} matchUntilFailure :: RegExOptsChar -- ^ The match-criteria. -> ExtendedRegExChar.InputData -- ^ The input-data, to be consumed by repeated matching. -> [RegExDot.RegEx.MatchList Char] matchUntilFailure regExOptsChar inputData = shiftOffsets 0 $ matchUntilFailure' inputData where matchUntilFailure' :: ExtendedRegExChar.InputData -> [RegExDot.RegEx.MatchList Char] matchUntilFailure' unmatchedInputData = case unmatchedInputData ExtendedRegExChar.+~ regExOptsChar of -- CAVEAT: . (_, Just matchList, maybeSternAnchorResult) -> matchList : if null (RegExDot.RegEx.extractDataFromMatchList matchList) then [] else matchUntilFailure' (RegExDot.RegEx.extractDataFromMatch' maybeSternAnchorResult) _ -> [] shiftOffsets :: RegExDot.ConsumptionBounds.DataLength -> [RegExDot.RegEx.MatchList Char] -> [RegExDot.RegEx.MatchList Char] shiftOffsets offset (matchList : matchLists) = let shiftedMatchList :: RegExDot.RegEx.MatchList Char shiftedMatchList = RegExDot.RegEx.shiftMatchList offset matchList in shiftedMatchList : shiftOffsets (RegExDot.DataSpan.after . last . RegExDot.DataSpanTree.flattenTreeList 0 $ RegExDot.DataSpanTree.toTreeList shiftedMatchList) matchLists {-recurse-} shiftOffsets _ _ = [] instance RegexLike.RegexLike RegExOptsChar ExtendedRegExChar.InputData where {- E.g.: ("xabcdxabdxxadx" RegExOptsChar.=~ "a(b(c?))*d") :: [Text.Regex.Base.RegexLike.MatchArray] [array (0,2) [(0,(1,4)),(1,(2,2)),(2,(3,1))],array (0,2) [(0,(6,3)),(1,(7,1)),(2,(8,0))],array (0,1) [(0,(11,2)),(1,(-1,0))]] -} -- matchAll :: RegExOptsChar -> ExtendedRegExChar.InputData -> [RegexLike.MatchArray] matchAll regExOptsChar = map (toZeroIndexedArray . map snd {-span-} . exciseNonCapturingTopLevelAlternatives regExOptsChar . toMatchDataSpanList (complyStrictlyWithPosix regExOptsChar) 0) . matchUntilFailure regExOptsChar {- E.g.: ("xabcdxabdxxadx" RegExOptsChar.=~ "a(b(c?))*d") :: [Text.Regex.Base.RegexLike.MatchText String] [array (0,2) [(0,("abcd",(1,4))),(1,("bc",(2,2))),(2,("c",(3,1)))],array (0,2) [(0,("abd",(6,3))),(1,("b",(7,1))),(2,("",(8,0)))],array (0,1) [(0,("ad",(11,2))),(1,("",(-1,0)))]] -} -- matchAllText :: RegExOptsChar -> ExtendedRegExChar.InputData -> [RegexLike.MatchText ExtendedRegExChar.InputData] matchAllText regExOptsChar = map (toZeroIndexedArray . exciseNonCapturingTopLevelAlternatives regExOptsChar . toMatchDataSpanList (complyStrictlyWithPosix regExOptsChar) 0) . matchUntilFailure regExOptsChar {- E.g.: ("xabcdx" RegExOptsChar.=~ "a(b(c?))*d") :: Text.Regex.Base.RegexLike.MatchArray array (0,2) [(0,(1,4)),(1,(2,2)),(2,(3,1))] -} -- matchOnce :: RegExOptsChar -> ExtendedRegExChar.InputData -> Maybe RegexLike.MatchArray matchOnce regExOptsChar inputData = ( toZeroIndexedArray . map snd {-span-} . exciseNonCapturingTopLevelAlternatives regExOptsChar . toMatchDataSpanList ( complyStrictlyWithPosix regExOptsChar ) ( RegExDot.RegEx.externalMatchLength $ RegExDot.Result.getPreMatch extendedRegExResult ) ) `fmap` RegExDot.Result.getMatchList extendedRegExResult where extendedRegExResult :: RegExDot.RegEx.Result Char extendedRegExResult = inputData ExtendedRegExChar.+~ regExOptsChar -- CAVEAT: . {- E.g.: ("xabcdx" RegExOptsChar.=~ "a(b(c?))*d") :: (String, Text.Regex.Base.RegexLike.MatchText String, String) ("x",array (0,2) [(0,("abcd",(1,4))),(1,("bc",(2,2))),(2,("c",(3,1)))],"x") -} -- matchOnceText :: RegExOptsChar -> ExtendedRegExChar.InputData -> Maybe (ExtendedRegExChar.InputData, MatchText ExtendedRegExChar.InputData, ExtendedRegExChar.InputData) matchOnceText regExOptsChar inputData = case inputData ExtendedRegExChar.+~ regExOptsChar of -- CAVEAT: . (maybeBowAnchorResult, Just matchList, maybeSternAnchorResult) -> Just ( RegExDot.RegEx.extractDataFromMatch' maybeBowAnchorResult, toZeroIndexedArray . exciseNonCapturingTopLevelAlternatives regExOptsChar $ toMatchDataSpanList ( complyStrictlyWithPosix regExOptsChar ) ( RegExDot.RegEx.externalMatchLength maybeBowAnchorResult ) matchList, RegExDot.RegEx.extractDataFromMatch' maybeSternAnchorResult ) _ -> Nothing {- E.g.: ("xabcdx" RegExOptsChar.=~ "a(b(c?))*d") :: Bool True -} -- matchTest :: RegExOptsChar -> ExtendedRegExChar.InputData -> Bool matchTest = flip (ExtendedRegExChar.=~) -- CAVEAT: . {- | * Match-operator. * The polymorphic return-type is resolved by the caller's "RegexLike.RegexContext". -} (=~) :: RegexLike.RegexContext RegExOptsChar ExtendedRegExChar.InputData target => ExtendedRegExChar.InputData -- ^ The input data. -> String -- ^ The string from which to read the regex-specification. -> target -- ^ The polymorphic return-type. inputData =~ s = (RegexLike.makeRegex s :: RegExOptsChar) `RegexLike.match` inputData