concrete RegexDemo { @type run () -> () } define RegexDemo { run () { ~ testMatch("","",true) ~ testMatch("","a",false) ~ testMatch(".","a",true) ~ testMatch(".","aa",false) ~ testMatch("a",".",false) ~ testMatch(".+","",false) ~ testMatch(".+","a",true) ~ testMatch(".+","ab",true) ~ testMatch(".*.","a",true) ~ testMatch(".*.","ab",true) ~ testMatch("(ab)*","",true) ~ testMatch("(ab)*","abab",true) ~ testMatch("(ab)*","a",false) ~ testMatch("(ab)*ac","abac",true) ~ testMatch("(ab)*ac","ac",true) ~ testMatch("(ab)*ac","aca",false) ~ testMatch("(ab)*ac","acab",false) ~ testMatch("(ab)*|ac","abab",true) ~ testMatch("(ab)*|ac","ac",true) ~ testMatch("(ab)*|ac","acac",false) ~ testMatch("(ab)*|ac","abac",false) ~ testMatch("a{2}","a",false) ~ testMatch("a{2}","aa",true) ~ testMatch("a{2}","aaa",false) ~ testMatch("a{2,3}","a",false) ~ testMatch("a{2,3}","aa",true) ~ testMatch("a{2,3}","aaa",true) ~ testMatch("a{2,3}","aaaa",false) ~ testMatch("(a+){4}","aaa",false) ~ testMatch("(a+){4}","aaaa",true) ~ testMatch("(a+){4}","aaaaa",true) ~ testMatch("(a+){4}","aaaaaa",true) ~ testMatch("[a-z?]","q",true) ~ testMatch("[a-z?]","-",false) ~ testMatch("[a-z?]","?",true) ~ testMatch("[a-z?]*","abcde?",true) ~ testMatch("[a-z?]*","ab-cde",false) // A few malicious regexes from https://en.wikipedia.org/wiki/ReDoS. // // The first three below don't finish, due to repeat branching. The // branching of "a+" is O(sqrt(2)^n) and "(a+)+" is therefore O(2^n). // // The first two could still be matched using non-branching repeats, since // there is a complete overlap between the tail of one repeat and the next // repeat, and repeats past 1 don't matter. The third could also be if the // first 30 were expanded at the beginning. // // Overall, the safer solution would be to avoid repeat branching, and have // a disclaimer about matching accuracy with repeated unbounded patterns // that overlap themselves. // ~ testMatch("(a+)+","aaaaaaaaaaaaaaaaaaaaaaaa!",false) // ~ testMatch("(a+)+","aaaaaaaaaaaaaaaaaaaaaaaa!",false) // ~ testMatch("([a-zA-Z]+)*","aaaaaaaaaaaaaaaaaaaaaaaa!",false) // ~ testMatch("(.*a){30}","aaaaaaaaaaaaaaaaaaaaaaaa!",false) ~ testMatch("(a|aa)+","aaaaaaaaaaaaaaaaaaaaaaaa!",false) // This causes a ton of branching, but it actually finishes. Given m // unbounded groups in a sequence, the branching is O(n^m). Since n^m // increases rapidly with n, most sequence branches at any given time will // not contain repeat branches within them. The exception is that each of // the m groups will be in separate sequence branches from the beginning, // *adding* O(sqrt(2)^n) branches. This means the overall complexity is // O(n^m)+O(sqrt(2)^n): O(n^m) for smaller n, O(sqrt(2)^n) for larger n. ~ testMatch("[ab]*[ac]*[ad]*[ae]*[af]*", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaad",true) } @type testMatch (String,String,Bool) -> () testMatch (pattern,data,expected) { MatcherTemplate template <- require(CharRegex$parse(pattern)) ~ LazyStream$new() .append("Trying: regex=\"") .append(pattern) .append("\" data=\"") .append(data) .append("\" -> ") .append(expected) .append("...\n") .writeTo(SimpleOutput$stderr()) if (CharRegex$match(template,data) != expected) { fail("Incorrect match result!") } } }