-- SPDX-FileCopyrightText: Copyright (c) 2025 Objectionary.com
-- SPDX-License-Identifier: MIT

{- | Tests for the Regexp module that provides regular expression
matching and replacement using PCRE.
-}
module RegexpSpec where

import Data.ByteString.Builder qualified as BB
import Data.ByteString.Char8 qualified as B
import Data.ByteString.Lazy qualified as BL
import Regexp qualified as R
import Test.Hspec (Spec, anyException, describe, it, shouldBe, shouldReturn, shouldThrow)

-- | Encode a 'String' literal as UTF-8 bytes.
--
-- 'B.pack' from "Data.ByteString.Char8" keeps only the low 8 bits of every
-- 'Char', so a non-ASCII literal packed with it degrades into unrelated
-- single bytes. Every test that is meant to exercise multi-byte text goes
-- through this helper instead; pure ASCII literals still use 'B.pack'.
utf8 :: String -> B.ByteString
utf8 = BL.toStrict . BB.toLazyByteString . BB.stringUtf8

-- | Hspec suite for the functions exercised from "Regexp": @compile@,
-- @match@, @extractGroups@, @substituteGroups@, @replaceFirst@ and
-- @replaceAll@.
spec :: Spec
spec = do
  -- Pattern compilation; each success case is confirmed with a follow-up match.
  describe "compile" $ do
    it "compiles a valid pattern" $ do
      _ <- R.compile (B.pack "foo")
      matched <- R.match (B.pack "foo") (B.pack "foobar")
      matched `shouldBe` True
    it "throws on invalid pattern" $
      R.compile (B.pack "[invalid") `shouldThrow` anyException
    it "compiles pattern with groups" $ do
      _ <- R.compile (B.pack "(a)(b)(c)")
      matched <- R.match (B.pack "(a)(b)(c)") (B.pack "abc")
      matched `shouldBe` True
    it "compiles pattern with unicode" $ do
      _ <- R.compile (utf8 "кирилиця")
      matched <- R.match (utf8 "кирилиця") (utf8 "текст кирилиця тут")
      matched `shouldBe` True
    it "compiles empty pattern" $ do
      _ <- R.compile B.empty
      matched <- R.match B.empty (B.pack "anything")
      matched `shouldBe` True

  -- Boolean matching: first argument is the pattern, second is the input.
  describe "match" $ do
    it "returns true when pattern matches" $
      R.match (B.pack "hello") (B.pack "hello world") `shouldReturn` True
    it "returns false when pattern does not match" $
      R.match (B.pack "goodbye") (B.pack "hello world") `shouldReturn` False
    it "returns true for partial match" $
      R.match (B.pack "wor") (B.pack "hello world") `shouldReturn` True
    it "returns true for match at start" $
      R.match (B.pack "^hello") (B.pack "hello world") `shouldReturn` True
    it "returns false for anchored pattern not at start" $
      R.match (B.pack "^world") (B.pack "hello world") `shouldReturn` False
    it "returns true for match at end" $
      R.match (B.pack "world$") (B.pack "hello world") `shouldReturn` True
    it "returns true with empty input and empty pattern" $
      R.match B.empty B.empty `shouldReturn` True
    it "returns true with non-empty input and empty pattern" $
      R.match B.empty (B.pack "text") `shouldReturn` True
    it "returns false with empty input and non-empty pattern" $
      R.match (B.pack "text") B.empty `shouldReturn` False
    it "handles special regex characters" $
      R.match (B.pack "a\\.b") (B.pack "a.b") `shouldReturn` True
    it "handles character class" $
      R.match (B.pack "[0-9]+") (B.pack "abc123def") `shouldReturn` True
    it "handles alternation" $
      R.match (B.pack "cat|dog") (B.pack "I have a dog") `shouldReturn` True
    it "handles unicode input" $
      R.match (utf8 "日本語") (utf8 "これは日本語です") `shouldReturn` True
    it "handles case sensitive match" $
      R.match (B.pack "Hello") (B.pack "hello") `shouldReturn` False

  -- Group extraction: the expected lists start with the whole match,
  -- followed by the capture groups in order.
  describe "extractGroups" $ do
    it "extracts groups from pattern with capturing groups" $ do
      regex <- R.compile (B.pack "(\\w+)@(\\w+)")
      groups <- R.extractGroups regex (B.pack "user@domain")
      groups `shouldBe` [B.pack "user@domain", B.pack "user", B.pack "domain"]
    it "returns empty list when no match" $ do
      regex <- R.compile (B.pack "(foo)")
      groups <- R.extractGroups regex (B.pack "bar")
      groups `shouldBe` []
    it "extracts nested groups" $ do
      regex <- R.compile (B.pack "((a)(b))")
      groups <- R.extractGroups regex (B.pack "ab")
      groups `shouldBe` [B.pack "ab", B.pack "ab", B.pack "a", B.pack "b"]
    it "handles optional group that did not match" $ do
      regex <- R.compile (B.pack "(a)(b)?")
      groups <- R.extractGroups regex (B.pack "a")
      -- Only the arity is pinned here: whole match plus two group slots.
      length groups `shouldBe` 3
    it "extracts multiple groups" $ do
      regex <- R.compile (B.pack "(x)(y)(z)")
      groups <- R.extractGroups regex (B.pack "prefix xyz suffix")
      groups `shouldBe` [B.pack "xyz", B.pack "x", B.pack "y", B.pack "z"]
    it "handles pattern without groups" $ do
      regex <- R.compile (B.pack "test")
      groups <- R.extractGroups regex (B.pack "this is a test")
      groups `shouldBe` [B.pack "test"]

  -- Pure substitution of @$N@ references in a replacement template.
  describe "substituteGroups" $ do
    it "substitutes group zero" $
      R.substituteGroups (B.pack "[$0]") [B.pack "match"]
        `shouldBe` B.pack "[match]"
    it "substitutes multiple groups" $
      R.substituteGroups (B.pack "$1-$2") [B.pack "full", B.pack "a", B.pack "b"]
        `shouldBe` B.pack "a-b"
    it "keeps dollar sign when no digits follow" $
      R.substituteGroups (B.pack "$ test") [B.pack "x"]
        `shouldBe` B.pack "$ test"
    it "keeps original reference for out of bounds index" $
      R.substituteGroups (B.pack "$9") [B.pack "only"]
        `shouldBe` B.pack "$9"
    it "handles replacement without group references" $
      R.substituteGroups (B.pack "plain") [B.pack "x"]
        `shouldBe` B.pack "plain"
    it "handles empty replacement" $
      R.substituteGroups B.empty [B.pack "x"]
        `shouldBe` B.empty
    it "handles empty groups list with reference" $
      R.substituteGroups (B.pack "$0") []
        `shouldBe` B.pack "$0"
    it "handles multi-digit group reference" $
      R.substituteGroups (B.pack "$12") (replicate 13 (B.pack "x"))
        `shouldBe` B.pack "x"
    it "handles consecutive group references" $
      R.substituteGroups (B.pack "$0$1$2") [B.pack "a", B.pack "b", B.pack "c"]
        `shouldBe` B.pack "abc"
    it "handles unicode in replacement" $
      R.substituteGroups (utf8 "結果: $1") [B.pack "all", utf8 "データ"]
        `shouldBe` utf8 "結果: データ"
    it "handles dollar at end of string" $
      R.substituteGroups (B.pack "test$") [B.pack "x"]
        `shouldBe` B.pack "test$"
    it "handles double dollar" $
      R.substituteGroups (B.pack "$$1") [B.pack "x", B.pack "y"]
        `shouldBe` B.pack "$y"

  -- Replacement of the first match only.
  describe "replaceFirst" $ do
    it "replaces first occurrence" $ do
      regex <- R.compile (B.pack "cat")
      result <- R.replaceFirst regex (B.pack "dog") (B.pack "cat and cat")
      result `shouldBe` B.pack "dog and cat"
    it "returns input when no match" $ do
      regex <- R.compile (B.pack "xyz")
      result <- R.replaceFirst regex (B.pack "abc") (B.pack "hello world")
      result `shouldBe` B.pack "hello world"
    it "replaces with empty string" $ do
      regex <- R.compile (B.pack "remove")
      result <- R.replaceFirst regex B.empty (B.pack "please remove this")
      result `shouldBe` B.pack "please this"
    it "replaces at start of string" $ do
      regex <- R.compile (B.pack "^start")
      result <- R.replaceFirst regex (B.pack "begin") (B.pack "start here")
      result `shouldBe` B.pack "begin here"
    it "replaces at end of string" $ do
      regex <- R.compile (B.pack "end$")
      result <- R.replaceFirst regex (B.pack "finish") (B.pack "the end")
      result `shouldBe` B.pack "the finish"
    it "uses captured groups in replacement" $ do
      regex <- R.compile (B.pack "(\\w+)@(\\w+)")
      result <- R.replaceFirst regex (B.pack "[$1 AT $2]") (B.pack "email: test@example here")
      result `shouldBe` B.pack "email: [test AT example] here"
    it "handles unicode pattern and replacement" $ do
      regex <- R.compile (utf8 "古い")
      result <- R.replaceFirst regex (utf8 "新しい") (utf8 "これは古いです")
      result `shouldBe` utf8 "これは新しいです"
    it "handles empty input" $ do
      regex <- R.compile (B.pack "x")
      result <- R.replaceFirst regex (B.pack "y") B.empty
      result `shouldBe` B.empty
    it "replaces entire string when pattern matches all" $ do
      regex <- R.compile (B.pack "^.*$")
      result <- R.replaceFirst regex (B.pack "replaced") (B.pack "original")
      result `shouldBe` B.pack "replaced"

  -- Replacement of every match in the input.
  describe "replaceAll" $ do
    it "replaces all occurrences" $ do
      regex <- R.compile (B.pack "a")
      result <- R.replaceAll regex (B.pack "X") (B.pack "banana")
      result `shouldBe` B.pack "bXnXnX"
    it "returns input when no match" $ do
      regex <- R.compile (B.pack "xyz")
      result <- R.replaceAll regex (B.pack "abc") (B.pack "hello world")
      result `shouldBe` B.pack "hello world"
    it "replaces consecutive matches" $ do
      regex <- R.compile (B.pack "o")
      result <- R.replaceAll regex (B.pack "0") (B.pack "oooo")
      result `shouldBe` B.pack "0000"
    it "replaces with captured groups" $ do
      regex <- R.compile (B.pack "(\\d+)")
      result <- R.replaceAll regex (B.pack "[$1]") (B.pack "a1b2c3")
      result `shouldBe` B.pack "a[1]b[2]c[3]"
    it "handles empty replacement" $ do
      regex <- R.compile (B.pack "x")
      result <- R.replaceAll regex B.empty (B.pack "axbxcx")
      result `shouldBe` B.pack "abc"
    it "handles empty input" $ do
      regex <- R.compile (B.pack "x")
      result <- R.replaceAll regex (B.pack "y") B.empty
      result `shouldBe` B.empty
    it "handles unicode input and pattern" $ do
      regex <- R.compile (utf8 "кіт")
      result <- R.replaceAll regex (utf8 "пес") (utf8 "кіт і кіт")
      result `shouldBe` utf8 "пес і пес"
    it "replaces overlapping potential matches correctly" $ do
      regex <- R.compile (B.pack "aa")
      result <- R.replaceAll regex (B.pack "X") (B.pack "aaaa")
      result `shouldBe` B.pack "XX"
    it "handles single character replacement" $ do
      regex <- R.compile (B.pack ".")
      result <- R.replaceAll regex (B.pack "*") (B.pack "abc")
      result `shouldBe` B.pack "***"
    it "handles word boundary" $ do
      regex <- R.compile (B.pack "\\bword\\b")
      result <- R.replaceAll regex (B.pack "WORD") (B.pack "word in a word")
      result `shouldBe` B.pack "WORD in a WORD"