{-# LANGUAGE BangPatterns #-} module Regex.Internal.List ( list , manyList , someList , manyListMin , someListMin , charIgnoreCase , oneOfChar , stringIgnoreCase , manyStringOf , someStringOf , manyStringOfMin , someStringOfMin , naturalDec , integerDec , naturalHex , integerHex , wordRangeDec , intRangeDec , wordRangeHex , intRangeHex , wordDecN , wordHexN , toMatch , withMatch , reParse , parse , parseSure , find , findAll , splitOn , replace , replaceAll ) where import Control.Applicative import Data.Char import Data.Maybe (fromMaybe) import Numeric.Natural import Data.CharSet (CharSet) import qualified Data.CharSet as CS import Regex.Internal.Parser (Parser) import qualified Regex.Internal.Parser as P import Regex.Internal.Regex (RE(..), Greediness(..), Strictness(..)) import qualified Regex.Internal.Regex as R import qualified Regex.Internal.Num as RNum import qualified Regex.Internal.Generated.CaseFold as CF ------------------------ -- REs and combinators ------------------------ -- | Parse the given list. list :: Eq c => [c] -> RE c [c] list xs = xs <$ foldr ((*>) . R.single) (pure ()) xs -- | Parse any list. Biased towards matching more. manyList :: RE c [c] manyList = many R.anySingle -- | Parse any non-empty list. Biased towards matching more. someList :: RE c [c] someList = some R.anySingle -- | Parse any list. Minimal, i.e. biased towards matching less. manyListMin :: RE c [c] manyListMin = R.manyMin R.anySingle -- | Parse any non-empty @String@. Minimal, i.e. biased towards matching less. someListMin :: RE c [c] someListMin = R.someMin R.anySingle ----------- -- String ----------- -- | Parse the given @Char@, ignoring case. -- -- Comparisons are performed after applying -- [simple case folding](https://www.unicode.org/reports/tr44/#Simple_Case_Folding) -- as described by the Unicode standard. charIgnoreCase :: Char -> RE Char Char charIgnoreCase c = R.satisfy $ (c'==) . CF.caseFoldSimple where !c' = CF.caseFoldSimple c -- See Note [Why simple case fold] in Regex.Internal.Text -- | Parse a @Char@ if it is a member of the @CharSet@. oneOfChar :: CharSet -> RE Char Char oneOfChar !cs = R.satisfy (`CS.member` cs) -- | Parse the given @String@, ignoring case. -- -- Comparisons are performed after applying -- [simple case folding](https://www.unicode.org/reports/tr44/#Simple_Case_Folding) -- as described by the Unicode standard. stringIgnoreCase :: String -> RE Char String stringIgnoreCase = foldr (R.liftA2' (:) . charIgnoreCase) (pure []) -- See Note [Why simple case fold] in Regex.Internal.Text -- | Parse any @String@ containing members of the @CharSet@. -- Biased towards matching more. manyStringOf :: CharSet -> RE Char String manyStringOf !cs = many (R.satisfy (`CS.member` cs)) -- | Parse any non-empty @String@ containing members of the @CharSet@. -- Biased towards matching more. someStringOf :: CharSet -> RE Char String someStringOf !cs = some (R.satisfy (`CS.member` cs)) -- | Parse any @String@ containing members of the @CharSet@. -- Minimal, i.e. biased towards matching less. manyStringOfMin :: CharSet -> RE Char String manyStringOfMin !cs = R.manyMin (R.satisfy (`CS.member` cs)) -- | Parse any non-empty @String@ containing members of the @CharSet@. -- Minimal, i.e. biased towards matching less. someStringOfMin :: CharSet -> RE Char String someStringOfMin !cs = R.someMin (R.satisfy (`CS.member` cs)) ----------------- -- Numeric REs ----------------- -- | Parse a decimal @Natural@. -- Leading zeros are not accepted. Biased towards matching more. naturalDec :: RE Char Natural naturalDec = RNum.mkNaturalDec digitRange -- | Parse a decimal @Integer@. Parse an optional sign, @\'-\'@ or @\'+\'@, -- followed by the given @RE@, followed by the absolute value of the integer. -- Leading zeros are not accepted. Biased towards matching more. integerDec :: RE Char a -> RE Char Integer integerDec sep = RNum.mkSignedInteger minus plus (sep *> naturalDec) -- | Parse a hexadecimal @Natural@. Both uppercase @\'A\'..\'F\'@ and lowercase -- @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. naturalHex :: RE Char Natural naturalHex = RNum.mkNaturalHex hexDigitRange -- | Parse a hexadecimal @Integer@. Parse an optional sign, @\'-\'@ or @\'+\'@, -- followed by the given @RE@, followed by the absolute value of the integer. -- Both uppercase @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. integerHex :: RE Char a -> RE Char Integer integerHex sep = RNum.mkSignedInteger minus plus (sep *> naturalHex) -- | Parse a decimal @Word@ in the range @[low..high]@. -- Leading zeros are not accepted. Biased towards matching more. wordRangeDec :: (Word, Word) -> RE Char Word wordRangeDec lh = RNum.mkWordRangeDec digitRange lh -- | Parse a decimal @Int@ in the range @[low..high]@. Parse an optional sign, -- @\'-\'@ or @\'+\'@, followed by the given @RE@, followed by the absolute -- value of the integer. -- Leading zeros are not accepted. Biased towards matching more. intRangeDec :: RE Char a -> (Int, Int) -> RE Char Int intRangeDec sep lh = RNum.mkSignedIntRange minus plus ((sep *>) . wordRangeDec) lh -- | Parse a hexadecimal @Word@ in the range @[low..high]@. Both uppercase -- @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. wordRangeHex :: (Word, Word) -> RE Char Word wordRangeHex lh = RNum.mkWordRangeHex hexDigitRange lh -- | Parse a hexadecimal @Int@ in the range @[low..high]@. Parse an optional -- sign, @\'-\'@ or @\'+\'@, followed by the given @RE@, followed by the -- absolute value of the integer. -- Both uppercase @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are accepted. -- Leading zeros are not accepted. Biased towards matching more. intRangeHex :: RE Char a -> (Int, Int) -> RE Char Int intRangeHex sep lh = RNum.mkSignedIntRange minus plus ((sep *>) . wordRangeHex) lh -- | Parse a @Word@ of exactly n decimal digits, including any leading zeros. -- Will not parse values that do not fit in a @Word@. -- Biased towards matching more. wordDecN :: Int -> RE Char Word wordDecN n = RNum.mkWordDecN digitRange n -- | Parse a @Word@ of exactly n hexadecimal digits, including any leading -- zeros. Both uppercase @\'A\'..\'F\'@ and lowercase @\'a\'..\'f\'@ are -- accepted. Will not parse values that do not fit in a @Word@. -- Biased towards matching more. wordHexN :: Int -> RE Char Word wordHexN n = RNum.mkWordHexN hexDigitRange n minus, plus :: RE Char () minus = R.token $ \c -> if c == '-' then Just () else Nothing plus = R.token $ \c -> if c == '+' then Just () else Nothing -- l and h must be in [0..9] digitRange :: Word -> Word -> RE Char Word digitRange !l !h = R.token $ \c -> let d = fromIntegral (ord c - ord '0') in if l <= d && d <= h then Just d else Nothing -- l and h must be in [0..15] hexDigitRange :: Word -> Word -> RE Char Word hexDigitRange !l !h = R.token $ \c -> let dec = fromIntegral (ord c - ord '0') hexl = fromIntegral (ord c - ord 'a') hexu = fromIntegral (ord c - ord 'A') in do d <- case () of _ | dec <= 9 -> Just dec | hexl <= 5 -> Just $! 10 + hexl | hexu <= 5 -> Just $! 10 + hexu | otherwise -> Nothing if l <= d && d <= h then Just d else Nothing ---------------- -- Match stuff ---------------- -- | Rebuild the @RE@ such that the result is the matched section of the list -- instead. toMatch :: RE c a -> RE c [c] toMatch = fmap dToL . toMatch_ toMatch_ :: RE c b -> RE c (DList c) toMatch_ re = case re of RToken t -> RToken (\c -> singletonD c <$ t c) RFmap _ _ re1 -> toMatch_ re1 RFmap_ _ re1 -> toMatch_ re1 RPure _ -> RPure mempty RLiftA2 _ _ re1 re2 -> RLiftA2 Strict (<>) (toMatch_ re1) (toMatch_ re2) REmpty -> REmpty RAlt re1 re2 -> RAlt (toMatch_ re1) (toMatch_ re2) RMany _ _ _ _ re1 -> RFold Strict Greedy (<>) mempty (toMatch_ re1) RFold _ gr _ _ re1 -> RFold Strict gr (<>) mempty (toMatch_ re1) data WithMatch c a = WM !(DList c) a instance Functor (WithMatch c) where fmap f (WM t x) = WM t (f x) fmapWM' :: (a -> b) -> WithMatch c a -> WithMatch c b fmapWM' f (WM t x) = WM t $! f x instance Applicative (WithMatch c) where pure = WM mempty liftA2 f (WM t1 x) (WM t2 y) = WM (t1 <> t2) (f x y) liftA2WM' :: (a1 -> a2 -> b) -> WithMatch c a1 -> WithMatch c a2 -> WithMatch c b liftA2WM' f (WM t1 x) (WM t2 y) = WM (t1 <> t2) $! f x y -- | Rebuild the @RE@ to include the matched section of the list alongside the -- result. withMatch :: RE c a -> RE c ([c], a) withMatch = R.fmap' (\(WM cs x) -> (dToL cs, x)) . go where go :: RE c b -> RE c (WithMatch c b) go re = case re of RToken t -> RToken (\c -> WM (singletonD c) <$> t c) RFmap st f re1 -> let g = case st of Strict -> fmapWM' f NonStrict -> fmap f in RFmap Strict g (go re1) RFmap_ b re1 -> RFmap Strict (flip WM b) (toMatch_ re1) RPure b -> RPure (pure b) RLiftA2 st f re1 re2 -> let g = case st of Strict -> liftA2WM' f NonStrict -> liftA2 f in RLiftA2 Strict g (go re1) (go re2) REmpty -> REmpty RAlt re1 re2 -> RAlt (go re1) (go re2) RMany f1 f2 f z re1 -> RMany (fmapWM' f1) (fmapWM' f2) (liftA2WM' f) (pure z) (go re1) RFold st gr f z re1 -> let g = case st of Strict -> liftA2WM' f NonStrict -> liftA2 f in RFold Strict gr g (pure z) (go re1) ---------- -- Parse ---------- -- | \(O(mn \log m)\). Parse a list with a @RE@. -- -- Uses 'Regex.List.compile', see the note there. -- -- If parsing multiple lists using the same @RE@, it is wasteful to compile -- the @RE@ every time. So, prefer to -- -- * Compile once with 'Regex.List.compile' or 'Regex.List.compileBounded' and -- use the compiled 'Parser' with 'parse' as many times as required. -- * Alternately, partially apply this function to a @RE@ and use the function -- as many times as required. reParse :: RE c a -> [c] -> Maybe a reParse re = let !p = P.compile re in parse p {-# INLINE reParse #-} -- | \(O(mn \log m)\). Parse a list with a @Parser@. parse :: Parser c a -> [c] -> Maybe a parse = P.parseFoldr foldr {-# INLINE parse #-} -- | \(O(mn \log m)\). Parse a list with a @Parser@. Calls 'error' on -- parse failure. -- -- For use with parsers that are known to never fail. parseSure :: Parser c a -> [c] -> a parseSure p = fromMaybe parseSureError . parse p {-# INLINE parseSure #-} parseSureError :: a parseSureError = errorWithoutStackTrace "Regex.List.parseSure: parse failed; if parsing can fail use 'parse' instead" reParseSure :: RE c a -> [c] -> a reParseSure re = let !p = P.compile re in parseSure p {-# INLINE reParseSure #-} -- | \(O(mn \log m)\). Find the first occurence of the given @RE@ in a list. -- -- ==== __Examples__ -- -- >>> find (list "meow") "homeowner" -- Just "meow" -- -- To test whether a list is present in another list, like above, prefer -- @Data.List.'Data.List.isInfixOf'@. -- -- >>> find (stringIgnoreCase "haskell") "Look I'm Haskelling!" -- Just "Haskell" -- >>> find (list "backtracking") "parser-regex" -- Nothing -- find :: RE c a -> [c] -> Maybe a find = reParse . R.toFind {-# INLINE find #-} -- | \(O(mn \log m)\). Find all non-overlapping occurences of the given @RE@ in -- the list. -- -- ==== __Examples__ -- -- >>> findAll (list "ana") "banananana" -- ["ana","ana"] -- -- @ -- data Roll = Roll -- Natural -- ^ Rolls -- Natural -- ^ Faces on the die -- deriving Show -- -- roll :: RE Char Roll -- roll = Roll \<$> ('naturalDec' \<|> pure 1) \<* 'R.single' \'d\' \<*> naturalDec -- @ -- -- >>> findAll roll "3d6, d10, 2d10" -- [Roll 3 6,Roll 1 10,Roll 2 10] -- findAll :: RE c a -> [c] -> [a] findAll = reParseSure . R.toFindMany {-# INLINE findAll #-} -- | \(O(mn \log m)\). Split a list at occurences of the given @RE@. -- -- ==== __Examples__ -- -- >>> splitOn (single ' ') "Glasses are really versatile" -- ["Glasses","are","really","versatile"] -- -- In cases like above, prefer using 'words' or 'lines' instead, if -- applicable. -- -- >>> splitOn (single ' ' *> oneOfChar "+-=" *> single ' ') "3 - 1 + 1/2 - 2 = 0" -- ["3","1","1/2","2","0"] -- -- If the list starts or ends with a delimiter, the result will contain -- empty lists at those positions. -- -- >>> splitOn (single 'a') "ayaya" -- ["","y","y",""] -- splitOn :: RE c a -> [c] -> [[c]] splitOn = reParseSure . toSplitOn {-# INLINE splitOn #-} toSplitOn :: RE c a -> RE c [[c]] toSplitOn re = manyListMin `R.sepBy` re -- | \(O(mn \log m)\). Replace the first match of the given @RE@ with its -- result. If there is no match, the result is @Nothing@. -- -- ==== __Examples__ -- -- >>> replace ("world" <$ list "Haskell") "Hello, Haskell!" -- Just "Hello, world!" -- -- >>> replace ("," <$ some (single '.')) "one...two...ten" -- Just "one,two...ten" -- replace :: RE c [c] -> [c] -> Maybe [c] replace = reParse . toReplace {-# INLINE replace #-} toReplace :: RE c [c] -> RE c [c] toReplace re = liftA2 f manyListMin re <*> manyList where f a b c = concat [a,b,c] -- | \(O(mn \log m)\). Replace non-overlapping matches of the given @RE@ with -- their results. -- -- ==== __Examples__ -- -- >>> replaceAll (" and " <$ list ", ") "red, blue, green" -- "red and blue and green" -- -- >>> replaceAll ("Fruit" <$ list "Time" <|> "banana" <$ list "arrow") "Time flies like an arrow" -- "Fruit flies like a banana" -- -- @ -- sep = 'oneOfChar' "-./" -- digits n = 'replicateM' n (oneOfChar 'Data.CharSet.digit') -- toYmd d m y = concat [y, \"-\", m, \"-\", d] -- date = toYmd \<$> digits 2 \<* sep -- \<*> digits 2 \<* sep -- \<*> digits 4 -- @ -- >>> replaceAll date "01/01/1970, 01-04-1990, 03.07.2011" -- "1970-01-01, 1990-04-01, 2011-07-03" -- replaceAll :: RE c [c] -> [c] -> [c] replaceAll = reParseSure . toReplaceMany {-# INLINE replaceAll #-} toReplaceMany :: RE c [c] -> RE c [c] toReplaceMany re = concat <$> many (re <|> R.token (Just . (:[]))) --------------------- -- Difference lists --------------------- newtype DList a = DList { unDList :: [a] -> [a] } instance Semigroup (DList a) where xs <> ys = DList (unDList xs . unDList ys) instance Monoid (DList a) where mempty = DList id singletonD :: a -> DList a singletonD = DList . (:) dToL :: DList a -> [a] dToL = ($ []) . unDList ---------- -- Notes ---------- -- Note [Token for Regex.List] -- ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- -- Regex.Text uses a token TextToken, but Regex.List doesn't, why? -- -- TextToken is used for efficient slicing, but here a DList is used for that -- purpose. This has the effect that combinators like manyText and friends -- don't need to allocate a linear amount of memory, since slicing is free, but -- manyList and friends do. We could use a token type for list like -- -- data Take a = Take !Int ![a] -- -- to refer to the input list and save memory. -- -- This is not done because -- * It increases complexity. Currently this module offers the simplest possible -- application of RE, which is nice to have. -- * If the list does not already exist in memory, Take would keep the entire -- list alive in memory instead of the just the slice it needs. -- * The current implementation is a good consumer, which can fuse with a good -- producer of the input list. -- -- In the end it is about the two distinct use cases of lists in Haskell: -- * As a structure in memory, the Take token would be the better choice -- * As a stream of elements, the current implementation is the better choice