{- Copyright (C) 2010-2015 Dr. Alistair Ward This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . -} {- | [@AUTHOR@] Dr. Alistair Ward [@DESCRIPTION@] Permits transformation of 'RegEx.MatchList', to facilitate standardisation. -} module RegExDot.DataSpanTree( -- * Types -- ** Type-synonyms -- DataSpanTree, -- DataSpanTreeList, -- * Functions extractCaptureGroups, flattenTreeList, toTreeList ) where import qualified Data.Foldable import qualified RegExDot.ConsumptionBounds as ConsumptionBounds import qualified RegExDot.DataSpan as DataSpan import qualified RegExDot.RegEx as RegEx import qualified RegExDot.Tree as Tree -- | Defines a similar 'Tree.Tree' to 'RegEx.Match', but with different 'Tree.Leaf's. type DataSpanTree a = Tree.Tree (DataSpan.DataSpan a) -- | Defines a similar structure to 'RegEx.MatchList'. type DataSpanTreeList a = [DataSpanTree a] -- | Converts a 'RegEx.MatchList' into a 'DataSpanTreeList', by transforming the 'Tree.Leaf's. toTreeList :: RegEx.MatchList a -> DataSpanTreeList a toTreeList = map toTree where toTree :: RegEx.Match a -> DataSpanTree a toTree = fmap toDataSpan where toDataSpan :: RegEx.MatchedData a -> DataSpan.DataSpan a toDataSpan (_, inputDataOffset, inputData) = (inputData, (inputDataOffset, length inputData)) -- | Condenses a 'DataSpanTreeList's into a list of 'DataSpan.DataSpan's, using 'DataSpan.join'. flattenTreeList :: ConsumptionBounds.DataLength -- ^ The offset into the input-data at which a match occurred. -> DataSpanTreeList a -- ^ The tree to flatten. -> [DataSpan.DataSpan a] flattenTreeList _ [] = [] flattenTreeList offset (tree : treeList) = flattenedTree : flattenTreeList (DataSpan.after flattenedTree) treeList where -- flattenedTree :: DataSpan.DataSpan a flattenedTree = DataSpan.join offset $ Data.Foldable.toList tree {- | * POSIX describes the contents of /capture-groups/, as summarised in . * 'RegEx.Result', is a complete description of the match between 'RegEx.InputData' & RegEx.ExtendedRegEx'; this function extracts a POSIX-conformant list from it. * The major differences are, that: Only data from parenthesized sub-expressions ('RegEx.Alternatives') is captured. Only the /last/ repetition of a repeated sub-expression is returned. . The data captured within each parenthesized sub-expression, is summarised as a single 'DataSpan.DataSpan'. POSIX specifies a 'Span.Span'-offset of @-1@, for sub-expressions which match zero times; cf sub-expressions which consume nothing, once. . @ ("ace" Text.Regex.Posix.=~ "a(b)*c(d)?e") :: Text.Regex.Base.RegexLike.MatchArray array (0,2) [(0,(0,3)),(1,(-1,0)),(2,(-1,0))] ("ace" Text.Regex.Posix.=~ "a(b*)c(d?)e") :: Text.Regex.Base.RegexLike.MatchArray array (0,2) [(0,(0,3)),(1,(1,0)),(2,(2,0))] @ I consider this a poor convention, resulting from the focus of POSIX on C, which makes subsequent calculation from the list of 'DataSpan's difficult & error-prone. -} extractCaptureGroups :: Bool -- ^ Whether to strictly comply with /POSIX/. -> DataSpanTreeList a -- ^ The tree-structure from which to extract the capture-groups. -> [DataSpan.DataSpan a] extractCaptureGroups complyStrictlyWithPosix = extractCaptureGroups' 0 where extractCaptureGroups' :: ConsumptionBounds.DataLength -> DataSpanTreeList a -> [DataSpan.DataSpan a] extractCaptureGroups' _ [] = [] extractCaptureGroups' offset (tree : treeList) = let -- recurseHorizontallyFrom :: ConsumptionBounds.DataLength -> [DataSpan.DataSpan a] recurseHorizontallyFrom = (`extractCaptureGroups'` treeList) in case tree of Tree.Leaf dataSpan -> recurseHorizontallyFrom $ DataSpan.after dataSpan Tree.Node [] -> DataSpan.empty (if complyStrictlyWithPosix then -1 else offset) : recurseHorizontallyFrom offset -- POSIX specifies an Span-offset of -1, for sub-expressions which match 0 times; cf sub-expressions which consumes nothing, once. Tree.Node treeLists -> joinedFlattenedTreeList : (extractCaptureGroups' offset lastMatch {-recurse vertically-} ++ recurseHorizontallyFrom (DataSpan.after joinedFlattenedTreeList)) where -- lastMatch :: DataSpanTreeList a lastMatch = last treeLists -- . -- joinedFlattenedTreeList :: DataSpan.DataSpan a joinedFlattenedTreeList = DataSpan.join offset $ flattenTreeList offset lastMatch