New Upstream Release - golang-github-dlclark-regexp2
Ready changes
Summary
Merged new upstream version: 1.9.0+ds1 (was: 1.7.0+ds1).
Diff
diff --git a/.travis.yml b/.travis.yml
index 2aa5ea1..a2da6be 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,7 @@
language: go
-
+arch:
+ - AMD64
+ - ppc64le
go:
- 1.9
- - tip
\ No newline at end of file
+ - tip
diff --git a/README.md b/README.md
index 4e4abb4..f3d1bd9 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,24 @@ Group 0 is embedded in the Match. Group 0 is an automatically-assigned group th
The __last__ capture is embedded in each group, so `g.String()` will return the same thing as `g.Capture.String()` and `g.Captures[len(g.Captures)-1].String()`.
+If you want to find multiple matches from a single input string you should use the `FindNextMatch` method. For example, to implement a function similar to `regexp.FindAllString`:
+
+```go
+func regexp2FindAllString(re *regexp2.Regexp, s string) []string {
+ var matches []string
+ m, _ := re.FindStringMatch(s)
+ for m != nil {
+ matches = append(matches, m.String())
+ m, _ = re.FindNextMatch(m)
+ }
+ return matches
+}
+```
+
+`FindNextMatch` is optmized so that it re-uses the underlying string/rune slice.
+
+The internals of `regexp2` always operate on `[]rune` so `Index` and `Length` data in a `Match` always reference a position in `rune`s rather than `byte`s (even if the input was given as a string). This is a dramatic difference between `regexp` and `regexp2`. It's advisable to use the provided `String()` methods to avoid having to work with indices.
+
## Compare `regexp` and `regexp2`
| Category | regexp | regexp2 |
| --- | --- | --- |
@@ -62,6 +80,8 @@ The default behavior of `regexp2` is to match the .NET regexp engine, however th
* add support for named ascii character classes (e.g. `[[:foo:]]`)
* add support for python-style capture groups (e.g. `(P<name>re)`)
* change singleline behavior for `$` to only match end of string (like RE2) (see [#24](https://github.com/dlclark/regexp2/issues/24))
+* change the character classes `\d` `\s` and `\w` to match the same characters as RE2. NOTE: if you also use the `ECMAScript` option then this will change the `\s` character class to match ECMAScript instead of RE2. ECMAScript allows more whitespace characters in `\s` than RE2 (but still fewer than the the default behavior).
+* allow character escape sequences to have defaults. For example, by default `\_` isn't a known character escape and will fail to compile, but in RE2 mode it will match the literal character `_`
```go
re := regexp2.MustCompile(`Your RE2-compatible pattern`, regexp2.RE2)
@@ -72,6 +92,70 @@ if isMatch, _ := re.MatchString(`Something to match`); isMatch {
This feature is a work in progress and I'm open to ideas for more things to put here (maybe more relaxed character escaping rules?).
+## Catastrophic Backtracking and Timeouts
+
+`regexp2` supports features that can lead to catastrophic backtracking.
+`Regexp.MatchTimeout` can be set to to limit the impact of such behavior; the
+match will fail with an error after approximately MatchTimeout. No timeout
+checks are done by default.
+
+Timeout checking is not free. The current timeout checking implementation starts
+a background worker that updates a clock value approximately once every 100
+milliseconds. The matching code compares this value against the precomputed
+deadline for the match. The performance impact is as follows.
+
+1. A match with a timeout runs almost as fast as a match without a timeout.
+2. If any live matches have a timeout, there will be a background CPU load
+ (`~0.15%` currently on a modern machine). This load will remain constant
+ regardless of the number of matches done including matches done in parallel.
+3. If no live matches are using a timeout, the background load will remain
+ until the longest deadline (match timeout + the time when the match started)
+ is reached. E.g., if you set a timeout of one minute the load will persist
+ for approximately a minute even if the match finishes quickly.
+
+See [PR #58](https://github.com/dlclark/regexp2/pull/58) for more details and
+alternatives considered.
+
+## Goroutine leak error
+If you're using a library during unit tests (e.g. https://github.com/uber-go/goleak) that validates all goroutines are exited then you'll likely get an error if you or any of your dependencies use regex's with a MatchTimeout.
+To remedy the problem you'll need to tell the unit test to wait until the backgroup timeout goroutine is exited.
+
+```go
+func TestSomething(t *testing.T) {
+ defer goleak.VerifyNone(t)
+ defer regexp2.StopTimeoutClock()
+
+ // ... test
+}
+
+//or
+
+func TestMain(m *testing.M) {
+ // setup
+ // ...
+
+ // run
+ m.Run()
+
+ //tear down
+ regexp2.StopTimeoutClock()
+ goleak.VerifyNone(t)
+}
+```
+
+This will add ~100ms runtime to each test (or TestMain). If that's too much time you can set the clock cycle rate of the timeout goroutine in an init function in a test file. `regexp2.SetTimeoutCheckPeriod` isn't threadsafe so it must be setup before starting any regex's with Timeouts.
+
+```go
+func init() {
+ //speed up testing by making the timeout clock 1ms
+ regexp2.SetTimeoutCheckPeriod(time.Millisecond)
+}
+```
+
+## ECMAScript compatibility mode
+In this mode the engine provides compatibility with the [regex engine](https://tc39.es/ecma262/multipage/text-processing.html#sec-regexp-regular-expression-objects) described in the ECMAScript specification.
+
+Additionally a Unicode mode is provided which allows parsing of `\u{CodePoint}` syntax that is only when both are provided.
## Library features that I'm still working on
- Regex split
diff --git a/debian/changelog b/debian/changelog
index 130404a..65e5b8d 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+golang-github-dlclark-regexp2 (1.9.0+ds1-1) UNRELEASED; urgency=low
+
+ * New upstream release.
+ * New upstream release.
+
+ -- Debian Janitor <janitor@jelmer.uk> Fri, 07 Apr 2023 19:28:31 -0000
+
golang-github-dlclark-regexp2 (1.4.0+ds1-1) unstable; urgency=medium
[ Debian Janitor (Jelmer Vernooij) ]
diff --git a/fastclock.go b/fastclock.go
new file mode 100644
index 0000000..caf2c9d
--- /dev/null
+++ b/fastclock.go
@@ -0,0 +1,129 @@
+package regexp2
+
+import (
+ "sync"
+ "sync/atomic"
+ "time"
+)
+
+// fasttime holds a time value (ticks since clock initialization)
+type fasttime int64
+
+// fastclock provides a fast clock implementation.
+//
+// A background goroutine periodically stores the current time
+// into an atomic variable.
+//
+// A deadline can be quickly checked for expiration by comparing
+// its value to the clock stored in the atomic variable.
+//
+// The goroutine automatically stops once clockEnd is reached.
+// (clockEnd covers the largest deadline seen so far + some
+// extra time). This ensures that if regexp2 with timeouts
+// stops being used we will stop background work.
+type fastclock struct {
+ // instances of atomicTime must be at the start of the struct (or at least 64-bit aligned)
+ // otherwise 32-bit architectures will panic
+
+ current atomicTime // Current time (approximate)
+ clockEnd atomicTime // When clock updater is supposed to stop (>= any existing deadline)
+
+ // current and clockEnd can be read via atomic loads.
+ // Reads and writes of other fields require mu to be held.
+ mu sync.Mutex
+ start time.Time // Time corresponding to fasttime(0)
+ running bool // Is a clock updater running?
+}
+
+var fast fastclock
+
+// reached returns true if current time is at or past t.
+func (t fasttime) reached() bool {
+ return fast.current.read() >= t
+}
+
+// makeDeadline returns a time that is approximately time.Now().Add(d)
+func makeDeadline(d time.Duration) fasttime {
+ // Increase the deadline since the clock we are reading may be
+ // just about to tick forwards.
+ end := fast.current.read() + durationToTicks(d+clockPeriod)
+
+ // Start or extend clock if necessary.
+ if end > fast.clockEnd.read() {
+ extendClock(end)
+ }
+ return end
+}
+
+// extendClock ensures that clock is live and will run until at least end.
+func extendClock(end fasttime) {
+ fast.mu.Lock()
+ defer fast.mu.Unlock()
+
+ if fast.start.IsZero() {
+ fast.start = time.Now()
+ }
+
+ // Extend the running time to cover end as well as a bit of slop.
+ if shutdown := end + durationToTicks(time.Second); shutdown > fast.clockEnd.read() {
+ fast.clockEnd.write(shutdown)
+ }
+
+ // Start clock if necessary
+ if !fast.running {
+ fast.running = true
+ go runClock()
+ }
+}
+
+// stop the timeout clock in the background
+// should only used for unit tests to abandon the background goroutine
+func stopClock() {
+ fast.mu.Lock()
+ if fast.running {
+ fast.clockEnd.write(fasttime(0))
+ }
+ fast.mu.Unlock()
+
+ // pause until not running
+ // get and release the lock
+ isRunning := true
+ for isRunning {
+ time.Sleep(clockPeriod / 2)
+ fast.mu.Lock()
+ isRunning = fast.running
+ fast.mu.Unlock()
+ }
+}
+
+func durationToTicks(d time.Duration) fasttime {
+ // Downscale nanoseconds to approximately a millisecond so that we can avoid
+ // overflow even if the caller passes in math.MaxInt64.
+ return fasttime(d) >> 20
+}
+
+const DefaultClockPeriod = 100 * time.Millisecond
+
+// clockPeriod is the approximate interval between updates of approximateClock.
+var clockPeriod = DefaultClockPeriod
+
+func runClock() {
+ fast.mu.Lock()
+ defer fast.mu.Unlock()
+
+ for fast.current.read() <= fast.clockEnd.read() {
+ // Unlock while sleeping.
+ fast.mu.Unlock()
+ time.Sleep(clockPeriod)
+ fast.mu.Lock()
+
+ newTime := durationToTicks(time.Since(fast.start))
+ fast.current.write(newTime)
+ }
+ fast.running = false
+}
+
+type atomicTime struct{ v int64 } // Should change to atomic.Int64 when we can use go 1.19
+
+func (t *atomicTime) read() fasttime { return fasttime(atomic.LoadInt64(&t.v)) }
+func (t *atomicTime) write(v fasttime) { atomic.StoreInt64(&t.v, int64(v)) }
diff --git a/fastclock_test.go b/fastclock_test.go
new file mode 100644
index 0000000..ccc149a
--- /dev/null
+++ b/fastclock_test.go
@@ -0,0 +1,58 @@
+package regexp2
+
+import (
+ "fmt"
+ "testing"
+ "time"
+)
+
+func init() {
+ //speed up testing by making the timeout clock 1ms instead of 100ms...
+ //bad for benchmark tests though
+ SetTimeoutCheckPeriod(time.Millisecond)
+}
+func TestDeadline(t *testing.T) {
+ for _, delay := range []time.Duration{
+ clockPeriod / 10,
+ clockPeriod,
+ clockPeriod * 5,
+ clockPeriod * 10,
+ } {
+ delay := delay // Make copy for parallel sub-test.
+ t.Run(fmt.Sprint(delay), func(t *testing.T) {
+ t.Parallel()
+ start := time.Now()
+ d := makeDeadline(delay)
+ if d.reached() {
+ t.Fatalf("deadline (%v) unexpectedly expired immediately", delay)
+ }
+ time.Sleep(delay / 2)
+ if d.reached() {
+ t.Fatalf("deadline (%v) expired too soon (after %v)", delay, time.Since(start))
+ }
+ time.Sleep(delay/2 + 2*clockPeriod) // Give clock time to tick
+ if !d.reached() {
+ t.Fatalf("deadline (%v) did not expire within %v", delay, time.Since(start))
+ }
+ })
+ }
+}
+
+func TestStopTimeoutClock(t *testing.T) {
+ // run a quick regex with a long timeout
+ // make sure the stop clock returns quickly
+ r := MustCompile(".", 0)
+ r.MatchTimeout = time.Second * 10
+
+ r.MatchString("a")
+ start := time.Now()
+ StopTimeoutClock()
+ stop := time.Now()
+
+ if want, got := clockPeriod*2, stop.Sub(start); want < got {
+ t.Errorf("Expected duration less than %v, got %v", want, got)
+ }
+ if want, got := false, fast.running; want != got {
+ t.Errorf("Expected isRunning to be %v, got %v", want, got)
+ }
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..9f7f391
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/dlclark/regexp2
+
+go 1.13
diff --git a/regexp.go b/regexp.go
index 7c7b01d..c8f9e6b 100644
--- a/regexp.go
+++ b/regexp.go
@@ -24,7 +24,11 @@ var DefaultMatchTimeout = time.Duration(math.MaxInt64)
// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
- //timeout when trying to find matches
+ // A match will time out if it takes (approximately) more than
+ // MatchTimeout. This is a safety check in case the match
+ // encounters catastrophic backtracking. The default value
+ // (DefaultMatchTimeout) causes all time out checking to be
+ // suppressed.
MatchTimeout time.Duration
// read-only after Compile
@@ -92,6 +96,19 @@ func Unescape(input string) (string, error) {
return syntax.Unescape(input)
}
+// SetTimeoutPeriod is a debug function that sets the frequency of the timeout goroutine's sleep cycle.
+// Defaults to 100ms. The only benefit of setting this lower is that the 1 background goroutine that manages
+// timeouts may exit slightly sooner after all the timeouts have expired. See Github issue #63
+func SetTimeoutCheckPeriod(d time.Duration) {
+ clockPeriod = d
+}
+
+// StopTimeoutClock should only be used in unit tests to prevent the timeout clock goroutine
+// from appearing like a leaking goroutine
+func StopTimeoutClock() {
+ stopClock()
+}
+
// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
return re.pattern
@@ -121,6 +138,7 @@ const (
Debug = 0x0080 // "d"
ECMAScript = 0x0100 // "e"
RE2 = 0x0200 // RE2 (regexp package) compatibility mode
+ Unicode = 0x0400 // "u"
)
func (re *Regexp) RightToLeft() bool {
diff --git a/regexp_performance_test.go b/regexp_performance_test.go
index 01a87d0..5d59087 100644
--- a/regexp_performance_test.go
+++ b/regexp_performance_test.go
@@ -3,6 +3,7 @@ package regexp2
import (
"strings"
"testing"
+ "time"
)
func BenchmarkLiteral(b *testing.B) {
@@ -58,6 +59,7 @@ func BenchmarkMatchClass_InRange(b *testing.B) {
/*
func BenchmarkReplaceAll(b *testing.B) {
+
x := "abcdefghijklmnopqrstuvwxyz"
b.StopTimer()
re := MustCompile("[cjrw]", 0)
@@ -65,6 +67,7 @@ func BenchmarkReplaceAll(b *testing.B) {
for i := 0; i < b.N; i++ {
re.ReplaceAllString(x, "")
}
+
}
*/
func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
@@ -305,3 +308,53 @@ func BenchmarkLeading(b *testing.B) {
}
}
}
+
+func BenchmarkShortSearch(b *testing.B) {
+ // set to default check period for benchmarking purposes
+ // unit tests tend to send this low
+ SetTimeoutCheckPeriod(DefaultClockPeriod)
+ type benchmark struct {
+ name string
+ parallel bool // Run in parallel?
+ timeout time.Duration
+ increase time.Duration // timeout increase per match
+ }
+ for _, mode := range []benchmark{
+ {"serial-no-timeout", false, DefaultMatchTimeout, 0},
+ {"serial-fixed-timeout", false, time.Second, 0},
+ {"serial-increasing-timeout", false, time.Second, time.Second},
+ {"parallel-no-timeout", true, DefaultMatchTimeout, 0},
+ {"parallel-fixed-timeout", true, time.Second, 0},
+ {"parallel-increasing-timeout", true, time.Second, time.Second},
+ } {
+ b.Run(mode.name, func(b *testing.B) {
+ t := makeText(100)
+ b.SetBytes(int64(len(t)))
+ matchOnce := func(r *Regexp) {
+ if m, err := r.MatchRunes(t); m {
+ b.Fatal("match!")
+ } else if err != nil {
+ b.Fatalf("Err %v", err)
+ }
+ }
+
+ if !mode.parallel {
+ r := MustCompile(easy0, 0)
+ r.MatchTimeout = mode.timeout
+ for i := 0; i < b.N; i++ {
+ matchOnce(r)
+ r.MatchTimeout += mode.increase
+ }
+ } else {
+ b.RunParallel(func(pb *testing.PB) {
+ r := MustCompile(easy0, 0)
+ r.MatchTimeout = mode.timeout
+ for pb.Next() {
+ matchOnce(r)
+ r.MatchTimeout += mode.increase
+ }
+ })
+ }
+ })
+ }
+}
diff --git a/regexp_re2_test.go b/regexp_re2_test.go
index f66aafa..9a0c40f 100644
--- a/regexp_re2_test.go
+++ b/regexp_re2_test.go
@@ -1,6 +1,8 @@
package regexp2
-import "testing"
+import (
+ "testing"
+)
func TestRE2CompatCapture(t *testing.T) {
r := MustCompile(`re(?P<a>2)`, RE2)
@@ -119,3 +121,103 @@ func TestRE2Dollar_Multiline(t *testing.T) {
t.Fatal("Expected match")
}
}
+
+func TestRE2ExtendedZero(t *testing.T) {
+ notZero := "߀" // \u07c0
+ r := MustCompile(`^\d$`, RE2)
+ if m, _ := r.MatchString(notZero); m {
+ t.Fatal("Expected no match")
+ }
+
+ r = MustCompile(`^\D$`, RE2)
+ if m, _ := r.MatchString(notZero); !m {
+ t.Fatal("Expected match")
+ }
+}
+
+func TestRegularExtendedZero(t *testing.T) {
+ notZero := "߀" // \u07c0
+
+ r := MustCompile(`^\d$`, 0)
+ if m, _ := r.MatchString(notZero); !m {
+ t.Fatal("Expected match")
+ }
+
+ r = MustCompile(`^\D$`, 0)
+ if m, _ := r.MatchString(notZero); m {
+ t.Fatal("Expected no match")
+ }
+}
+
+func TestRE2Word(t *testing.T) {
+ r := MustCompile(`\w`, RE2)
+ if m, _ := r.MatchString("å"); m {
+ t.Fatal("Expected no match")
+ }
+
+ r = MustCompile(`\W`, RE2)
+ if m, _ := r.MatchString("å"); !m {
+ t.Fatal("Expected match")
+ }
+
+}
+
+func TestRegularWord(t *testing.T) {
+ r := MustCompile(`\w`, 0)
+ if m, _ := r.MatchString("å"); !m {
+ t.Fatal("Expected match")
+ }
+ r = MustCompile(`\W`, 0)
+ if m, _ := r.MatchString("å"); m {
+ t.Fatal("Expected no match")
+ }
+}
+
+func TestRE2Space(t *testing.T) {
+ r := MustCompile(`\s`, RE2)
+ if m, _ := r.MatchString("\x0b"); m {
+ t.Fatal("Expected no match")
+ }
+ r = MustCompile(`\S`, RE2)
+ if m, _ := r.MatchString("\x0b"); !m {
+ t.Fatal("Expected match")
+ }
+}
+
+func TestRegularSpace(t *testing.T) {
+ r := MustCompile(`\s`, 0)
+ if m, _ := r.MatchString("\x0b"); !m {
+ t.Fatal("Expected match")
+ }
+ r = MustCompile(`\S`, 0)
+ if m, _ := r.MatchString("\x0b"); m {
+ t.Fatal("Expected no match")
+ }
+}
+
+func TestEscapeLiteralDefaults(t *testing.T) {
+ _, err := Compile(`a\_test`, 0)
+ if err == nil {
+ t.Fatal("Expected compile fail")
+ }
+
+ r := MustCompile(`a\_test`, RE2)
+ if m, _ := r.MatchString("a_test"); !m {
+ t.Fatal("Expected match")
+ }
+ if m, _ := r.MatchString("a\\_test"); m {
+ t.Fatal("Expected no match")
+ }
+}
+
+/*
+func TestRE2EndZ_Singleline(t *testing.T) {
+ // PCRE allows for \n after the $ and RE2 doesn't
+ r := MustCompile(`^ac$\Z`, RE2|Debug)
+ if m, _ := r.MatchString("ac"); m {
+ t.Fatal("Expected no match")
+ }
+ if m, _ := r.MatchString("ac\n"); m {
+ t.Fatal("Expected no match")
+ }
+}*/
diff --git a/regexp_test.go b/regexp_test.go
index 9a50315..b63db42 100644
--- a/regexp_test.go
+++ b/regexp_test.go
@@ -1,6 +1,7 @@
package regexp2
import (
+ "fmt"
"reflect"
"strings"
"testing"
@@ -11,14 +12,43 @@ import (
func TestBacktrack_CatastrophicTimeout(t *testing.T) {
r, err := Compile("(.+)*\\?", 0)
- r.MatchTimeout = time.Millisecond * 1
- t.Logf("code dump: %v", r.code.Dump())
- m, err := r.FindStringMatch("Do you think you found the problem string!")
- if err == nil {
- t.Errorf("expected timeout err")
+ if err != nil {
+ t.Fatal(err)
}
- if m != nil {
- t.Errorf("Expected no match")
+ t.Logf("code dump: %v", r.code.Dump())
+ const subject = "Do you think you found the problem string!"
+
+ const earlyAllowance = 10 * time.Millisecond
+ var lateAllowance = clockPeriod + 500*time.Millisecond // Large allowance in case machine is slow
+
+ for _, timeout := range []time.Duration{
+ -1 * time.Millisecond,
+ 0 * time.Millisecond,
+ 1 * time.Millisecond,
+ 10 * time.Millisecond,
+ 100 * time.Millisecond,
+ 500 * time.Millisecond,
+ 1000 * time.Millisecond,
+ } {
+ t.Run(fmt.Sprint(timeout), func(t *testing.T) {
+ r.MatchTimeout = timeout
+ start := time.Now()
+ m, err := r.FindStringMatch(subject)
+ elapsed := time.Since(start)
+ if err == nil {
+ t.Errorf("expected timeout err")
+ }
+ if m != nil {
+ t.Errorf("Expected no match")
+ }
+ t.Logf("timeed out after %v", elapsed)
+ if elapsed < timeout-earlyAllowance {
+ t.Errorf("Match timed out too quickly (%v instead of expected %v)", elapsed, timeout-earlyAllowance)
+ }
+ if elapsed > timeout+lateAllowance {
+ t.Errorf("Match timed out too late (%v instead of expected %v)", elapsed, timeout+lateAllowance)
+ }
+ })
}
}
@@ -756,6 +786,72 @@ func TestECMAInvalidEscape(t *testing.T) {
}
}
+func TestECMANamedGroup(t *testing.T) {
+ re := MustCompile(`\k`, ECMAScript)
+ if m, err := re.MatchString("k"); err != nil {
+ t.Fatal(err)
+ } else if !m {
+ t.Fatal("Expected match")
+ }
+
+ re = MustCompile(`\k'test'`, ECMAScript)
+ if m, err := re.MatchString(`k'test'`); err != nil {
+ t.Fatal(err)
+ } else if !m {
+ t.Fatal("Expected match")
+ }
+
+ re = MustCompile(`\k<test>`, ECMAScript)
+ if m, err := re.MatchString(`k<test>`); err != nil {
+ t.Fatal(err)
+ } else if !m {
+ t.Fatal("Expected match")
+ }
+
+ _, err := Compile(`(?<title>\w+), yes \k'title'`, ECMAScript)
+ if err == nil {
+ t.Fatal("Expected error")
+ }
+
+ re = MustCompile(`(?<title>\w+), yes \k<title>`, ECMAScript)
+ if m, err := re.MatchString("sir, yes sir"); err != nil {
+ t.Fatal(err)
+ } else if !m {
+ t.Fatal("Expected match")
+ }
+
+ re = MustCompile(`\k<title>, yes (?<title>\w+)`, ECMAScript)
+ if m, err := re.MatchString(", yes sir"); err != nil {
+ t.Fatal(err)
+ } else if !m {
+ t.Fatal("Expected match")
+ }
+
+ _, err = Compile(`\k<(?<name>)>`, ECMAScript)
+ if err == nil {
+ t.Fatal("Expected error")
+ }
+
+ MustCompile(`\k<(<name>)>`, ECMAScript)
+
+ _, err = Compile(`\k<(<name>)>`, 0)
+ if err == nil {
+ t.Fatal("Expected error")
+ }
+
+ re = MustCompile(`\'|\<?`, 0)
+ if m, err := re.MatchString("'"); err != nil {
+ t.Fatal(err)
+ } else if !m {
+ t.Fatal("Expected match")
+ }
+ if m, err := re.MatchString("<"); err != nil {
+ t.Fatal(err)
+ } else if !m {
+ t.Fatal("Expected match")
+ }
+}
+
func TestECMAInvalidEscapeCharClass(t *testing.T) {
re := MustCompile(`[\x0]`, ECMAScript)
if m, err := re.MatchString("x"); err != nil {
@@ -792,6 +888,20 @@ func TestECMAScriptXCurlyBraceEscape(t *testing.T) {
}
}
+func TestEcmaScriptUnicodeRange(t *testing.T) {
+ r, err := Compile(`([\u{001a}-\u{ffff}]+)`, ECMAScript|Unicode)
+ if err != nil {
+ panic(err)
+ }
+ m, err := r.FindStringMatch("qqqq")
+ if err != nil {
+ panic(err)
+ }
+ if m == nil {
+ t.Fatal("Expected non-nil, got nil")
+ }
+}
+
func TestNegateRange(t *testing.T) {
re := MustCompile(`[\D]`, 0)
if m, err := re.MatchString("A"); err != nil {
@@ -1093,27 +1203,23 @@ func TestEmptyCaptureLargeRepeat(t *testing.T) {
}
}
-func TestFuzzBytes(t *testing.T) {
+func TestFuzzBytes_NoCompile(t *testing.T) {
//some crash cases found from fuzzing
var testCases = []struct {
- r, s []byte
+ r []byte
}{
{
r: []byte{0x28, 0x28, 0x29, 0x5c, 0x37, 0x28, 0x3f, 0x28, 0x29, 0x29},
- s: []byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
},
{
r: []byte{0x28, 0x5c, 0x32, 0x28, 0x3f, 0x28, 0x30, 0x29, 0x29},
- s: []byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
},
{
r: []byte{0x28, 0x3f, 0x28, 0x29, 0x29, 0x5c, 0x31, 0x30, 0x28, 0x3f, 0x28, 0x30, 0x29},
- s: []byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
},
{
r: []byte{0x28, 0x29, 0x28, 0x28, 0x29, 0x5c, 0x37, 0x28, 0x3f, 0x28, 0x29, 0x29},
- s: []byte{0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30},
},
}
@@ -1129,3 +1235,64 @@ func TestFuzzBytes(t *testing.T) {
}
}
+
+func TestFuzzBytes_Match(t *testing.T) {
+
+ var testCases = []struct {
+ r, s []byte
+ }{
+ {
+ r: []byte{0x30, 0xbf, 0x30, 0x2a, 0x30, 0x30},
+ s: []byte{0xf0, 0xb0, 0x80, 0x91, 0xf7},
+ },
+ {
+ r: []byte{0x30, 0xaf, 0xf3, 0x30, 0x2a},
+ s: []byte{0xf3, 0x80, 0x80, 0x87, 0x80, 0x89},
+ },
+ }
+
+ for _, c := range testCases {
+ r := string(c.r)
+ t.Run(r, func(t *testing.T) {
+ re, err := Compile(r, 0)
+
+ if err != nil {
+ t.Fatal("should compile, but didn't")
+ }
+
+ re.MatchString(string(c.s))
+ })
+ }
+}
+
+func TestConcatAccidentalPatternCharge(t *testing.T) {
+ // originally this pattern would parse incorrectly
+ // specifically the closing group would concat the string literals
+ // together but the raw rune slice would blow over the original pattern
+ // so the final bit of pattern parsing would be wrong
+ // fixed in #49
+ r, err := Compile(`(?<=1234\.\*56).*(?=890)`, 0)
+
+ if err != nil {
+ panic(err)
+ }
+
+ m, err := r.FindStringMatch(`1234.*567890`)
+ if err != nil {
+ panic(err)
+ }
+ if m == nil {
+ t.Fatal("Expected non-nil, got nil")
+ }
+}
+
+func TestGoodReverseOrderMessage(t *testing.T) {
+ _, err := Compile(`[h-c]`, ECMAScript)
+ if err == nil {
+ t.Fatal("expected error")
+ }
+ expected := "error parsing regexp: [h-c] range in reverse order in `[h-c]`"
+ if err.Error() != expected {
+ t.Fatalf("expected %q got %q", expected, err.Error())
+ }
+}
diff --git a/runner.go b/runner.go
index 4d7f9b0..494dcef 100644
--- a/runner.go
+++ b/runner.go
@@ -58,10 +58,9 @@ type runner struct {
runmatch *Match // result object
- ignoreTimeout bool
- timeout time.Duration // timeout in milliseconds (needed for actual)
- timeoutChecksToSkip int
- timeoutAt time.Time
+ ignoreTimeout bool
+ timeout time.Duration // timeout in milliseconds (needed for actual)
+ deadline fasttime
operator syntax.InstOp
codepos int
@@ -1551,39 +1550,15 @@ func (r *runner) isECMABoundary(index, startpos, endpos int) bool {
(index < endpos && syntax.IsECMAWordChar(r.runtext[index]))
}
-// this seems like a comment to justify randomly picking 1000 :-P
-// We have determined this value in a series of experiments where x86 retail
-// builds (ono-lab-optimized) were run on different pattern/input pairs. Larger values
-// of TimeoutCheckFrequency did not tend to increase performance; smaller values
-// of TimeoutCheckFrequency tended to slow down the execution.
-const timeoutCheckFrequency int = 1000
-
func (r *runner) startTimeoutWatch() {
if r.ignoreTimeout {
return
}
-
- r.timeoutChecksToSkip = timeoutCheckFrequency
- r.timeoutAt = time.Now().Add(r.timeout)
+ r.deadline = makeDeadline(r.timeout)
}
func (r *runner) checkTimeout() error {
- if r.ignoreTimeout {
- return nil
- }
- r.timeoutChecksToSkip--
- if r.timeoutChecksToSkip != 0 {
- return nil
- }
-
- r.timeoutChecksToSkip = timeoutCheckFrequency
- return r.doCheckTimeout()
-}
-
-func (r *runner) doCheckTimeout() error {
- current := time.Now()
-
- if current.Before(r.timeoutAt) {
+ if r.ignoreTimeout || !r.deadline.reached() {
return nil
}
diff --git a/syntax/charclass.go b/syntax/charclass.go
index 53974d1..6881a0e 100644
--- a/syntax/charclass.go
+++ b/syntax/charclass.go
@@ -37,6 +37,8 @@ var (
ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
ecmaWord = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
ecmaDigit = []rune{0x0030, 0x003a}
+
+ re2Space = []rune{0x0009, 0x000b, 0x000c, 0x000e, 0x0020, 0x0021}
)
var (
@@ -56,6 +58,9 @@ var (
NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
DigitClass = getCharSetFromCategoryString(false, false, "Nd")
NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
+
+ RE2SpaceClass = getCharSetFromOldString(re2Space, false)
+ NotRE2SpaceClass = getCharSetFromOldString(re2Space, true)
)
var unicodeCategories = func() map[string]*unicode.RangeTable {
@@ -401,13 +406,19 @@ func (c *CharSet) addChar(ch rune) {
c.addRange(ch, ch)
}
-func (c *CharSet) addSpace(ecma, negate bool) {
+func (c *CharSet) addSpace(ecma, re2, negate bool) {
if ecma {
if negate {
c.addRanges(NotECMASpaceClass().ranges)
} else {
c.addRanges(ECMASpaceClass().ranges)
}
+ } else if re2 {
+ if negate {
+ c.addRanges(NotRE2SpaceClass().ranges)
+ } else {
+ c.addRanges(RE2SpaceClass().ranges)
+ }
} else {
c.addCategories(category{cat: spaceCategoryText, negate: negate})
}
@@ -563,7 +574,7 @@ func (c *CharSet) addNamedASCII(name string, negate bool) bool {
case "punct": //[!-/:-@[-`{-~]
rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
case "space":
- c.addSpace(true, negate)
+ c.addSpace(true, false, negate)
case "upper":
rs = []singleRange{singleRange{'A', 'Z'}}
case "word":
diff --git a/syntax/parser.go b/syntax/parser.go
index da14f98..9dc6e31 100644
--- a/syntax/parser.go
+++ b/syntax/parser.go
@@ -22,6 +22,7 @@ const (
Debug = 0x0080 // "d"
ECMAScript = 0x0100 // "e"
RE2 = 0x0200 // RE2 compat mode
+ Unicode = 0x0400 // "u"
)
func optionFromCode(ch rune) RegexOptions {
@@ -43,6 +44,8 @@ func optionFromCode(ch rune) RegexOptions {
return Debug
case 'e', 'E':
return ECMAScript
+ case 'u', 'U':
+ return Unicode
default:
return 0
}
@@ -104,7 +107,7 @@ const (
ErrBadClassInCharRange = "cannot include class \\%v in character range"
ErrUnterminatedBracket = "unterminated [] set"
ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class"
- ErrReversedCharRange = "[x-y] range in reverse order"
+ ErrReversedCharRange = "[%c-%c] range in reverse order"
)
func (e ErrorCode) String() string {
@@ -1121,14 +1124,14 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
case 'w':
p.moveRight(1)
- if p.useOptionE() {
+ if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
}
return newRegexNodeSet(ntSet, p.options, WordClass()), nil
case 'W':
p.moveRight(1)
- if p.useOptionE() {
+ if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
}
return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
@@ -1137,6 +1140,8 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
p.moveRight(1)
if p.useOptionE() {
return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
+ } else if p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, RE2SpaceClass()), nil
}
return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
@@ -1144,19 +1149,21 @@ func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
p.moveRight(1)
if p.useOptionE() {
return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
+ } else if p.useRE2() {
+ return newRegexNodeSet(ntSet, p.options, NotRE2SpaceClass()), nil
}
return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
case 'd':
p.moveRight(1)
- if p.useOptionE() {
+ if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
}
return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
case 'D':
p.moveRight(1)
- if p.useOptionE() {
+ if p.useOptionE() || p.useRE2() {
return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
}
return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
@@ -1186,19 +1193,24 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
return nil, p.getErr(ErrIllegalEndEscape)
}
angled := false
+ k := false
close := '\x00'
backpos := p.textpos()
ch := p.rightChar(0)
- // allow \k<foo> instead of \<foo>, which is now deprecated
+ // Allow \k<foo> instead of \<foo>, which is now deprecated.
- if ch == 'k' {
+ // According to ECMAScript specification, \k<name> is only parsed as a named group reference if
+ // there is at least one group name in the regexp.
+ // See https://www.ecma-international.org/ecma-262/#sec-isvalidregularexpressionliteral, step 7.
+ // Note, during the first (scanOnly) run we may not have all group names scanned, but that's ok.
+ if ch == 'k' && (!p.useOptionE() || len(p.capnames) > 0) {
if p.charsRight() >= 2 {
p.moveRight(1)
ch = p.moveRightGetChar()
- if ch == '<' || ch == '\'' {
+ if ch == '<' || (!p.useOptionE() && ch == '\'') { // No support for \k'name' in ECMAScript
angled = true
if ch == '\'' {
close = '\''
@@ -1213,8 +1225,9 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
}
ch = p.rightChar(0)
+ k = true
- } else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
+ } else if !p.useOptionE() && (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
angled = true
if ch == '\'' {
close = '\''
@@ -1257,14 +1270,23 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
return nil, p.getErr(ErrUndefinedBackRef, capnum)
}
- } else if angled && IsWordChar(ch) {
+ } else if angled {
capname := p.scanCapname()
- if p.charsRight() > 0 && p.moveRightGetChar() == close {
+ if capname != "" && p.charsRight() > 0 && p.moveRightGetChar() == close {
+
+ if scanOnly {
+ return nil, nil
+ }
+
if p.isCaptureName(capname) {
return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
}
return nil, p.getErr(ErrUndefinedNameRef, capname)
+ } else {
+ if k {
+ return nil, p.getErr(ErrMalformedNameRef)
+ }
}
}
@@ -1276,6 +1298,10 @@ func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
return nil, err
}
+ if scanOnly {
+ return nil, nil
+ }
+
if p.useOptionI() {
ch = unicode.ToLower(ch)
}
@@ -1443,7 +1469,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
- cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
+ cc.addDigit(p.useOptionE() || p.useRE2(), ch == 'D', p.patternRaw)
}
continue
@@ -1452,7 +1478,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
if inRange {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
- cc.addSpace(p.useOptionE(), ch == 'S')
+ cc.addSpace(p.useOptionE(), p.useRE2(), ch == 'S')
}
continue
@@ -1462,7 +1488,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
return nil, p.getErr(ErrBadClassInCharRange, ch)
}
- cc.addWord(p.useOptionE(), ch == 'W')
+ cc.addWord(p.useOptionE() || p.useRE2(), ch == 'W')
}
continue
@@ -1548,7 +1574,7 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
} else {
// a regular range, like a-z
if chPrev > ch {
- return nil, p.getErr(ErrReversedCharRange)
+ return nil, p.getErr(ErrReversedCharRange, chPrev, ch)
}
cc.addRange(chPrev, ch)
}
@@ -1672,7 +1698,13 @@ func (p *parser) scanCharEscape() (r rune, err error) {
r, err = p.scanHex(2)
}
case 'u':
- r, err = p.scanHex(4)
+ // ECMAscript suppot \u{HEX} only if `u` is also set
+ if p.useOptionE() && p.useOptionU() && p.charsRight() > 0 && p.rightChar(0) == '{' {
+ p.moveRight(1)
+ return p.scanHexUntilBrace()
+ } else {
+ r, err = p.scanHex(4)
+ }
case 'a':
return '\u0007', nil
case 'b':
@@ -1692,7 +1724,7 @@ func (p *parser) scanCharEscape() (r rune, err error) {
case 'c':
r, err = p.scanControl()
default:
- if !p.useOptionE() && IsWordChar(ch) {
+ if !p.useOptionE() && !p.useRE2() && IsWordChar(ch) {
return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
}
return ch, nil
@@ -1949,6 +1981,11 @@ func (p *parser) useRE2() bool {
return (p.options & RE2) != 0
}
+// True if U option enabling ECMAScript's Unicode behavior on.
+func (p *parser) useOptionU() bool {
+ return (p.options & Unicode) != 0
+}
+
// True if options stack is empty.
func (p *parser) emptyOptionsStack() bool {
return len(p.optionsStack) == 0
@@ -2044,7 +2081,8 @@ func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
}
if cch > 1 {
- str := p.pattern[pos : pos+cch]
+ str := make([]rune, cch)
+ copy(str, p.pattern[pos:pos+cch])
if p.useOptionI() && !isReplacement {
// We do the ToLower character by character for consistency. With surrogate chars, doing
diff --git a/syntax/prefix.go b/syntax/prefix.go
index 011ef0b..f671688 100644
--- a/syntax/prefix.go
+++ b/syntax/prefix.go
@@ -712,7 +712,7 @@ func (b *BmPrefix) Scan(text []rune, index, beglimit, endlimit int) int {
if chTest != b.pattern[match] {
advance = b.positive[match]
- if (chTest & 0xFF80) == 0 {
+ if chTest < 128 {
test2 = (match - startmatch) + b.negativeASCII[chTest]
} else if chTest < 0xffff && len(b.negativeUnicode) > 0 {
unicodeLookup = b.negativeUnicode[chTest>>8]