chore(deps): resolve dependency conflict

2026-06-18 06:17:40 +02:00 · 2020-05-11 20:12:28 -04:00
parent 1b17ab1914
commit 4eeec6c868
92 changed files with 1750 additions and 317 deletions
@@ -22,4 +22,6 @@ _testmain.go
 *.exe
 *.test
 *.prof
-*.out
+*.out
+
+.DS_Store
@@ -57,6 +57,21 @@ The __last__ capture is embedded in each group, so `g.String()` will return the
 | named ascii character class `[[:foo:]]`| yes | no |
 | conditionals `((expr)yes\|no)` | no | yes |

+## RE2 compatibility mode
+The default behavior of `regexp2` is to match the .NET regexp engine; however, the `RE2` option is provided to change the parsing to increase compatibility with RE2.  Using the `RE2` option when compiling a regexp will not take away any features, but will change the following behaviors:
+* add support for named ascii character classes (e.g. `[[:foo:]]`)
+* add support for Python-style capture groups (e.g. `(?P<name>re)`)
+
+```go
+re := regexp2.MustCompile(`Your RE2-compatible pattern`, regexp2.RE2)
+if isMatch, _ := re.MatchString(`Something to match`); isMatch {
+    //do something
+}
+```
+
+This feature is a work in progress and I'm open to ideas for more things to put here (maybe more relaxed character escaping rules?).
+
+
 ## Library features that I'm still working on
 - Regex split

@@ -120,6 +120,7 @@ const (
 	RightToLeft                          = 0x0040 // "r"
 	Debug                                = 0x0080 // "d"
 	ECMAScript                           = 0x0100 // "e"
+	RE2                                  = 0x0200 // RE2 (regexp package) compatibility mode
 )

 func (re *Regexp) RightToLeft() bool {
@@ -484,6 +484,29 @@ func (c *CharSet) addRanges(ranges []singleRange) {
 	c.canonicalize()
 }

+// addNegativeRanges merges the complement of ranges into the set: every rune
+// from 0 through utf8.MaxRune that is not covered by ranges is appended to
+// c.ranges, after which canonicalize is called. It does nothing when
+// c.anything is already set.
+//
+// ranges must be sorted by their first rune; overlapping entries are
+// tolerated.
+func (c *CharSet) addNegativeRanges(ranges []singleRange) {
+	if c.anything {
+		return
+	}
+
+	// next is the lowest rune not yet known to be covered by ranges.
+	var next rune
+
+	for _, r := range ranges {
+		if next < r.first {
+			// gap between the previously covered runes and this range
+			c.ranges = append(c.ranges, singleRange{next, r.first - 1})
+		}
+		// Only ever advance: a range nested inside an earlier, wider one
+		// must not pull the cursor back into already-excluded runes.
+		if r.last+1 > next {
+			next = r.last + 1
+		}
+	}
+
+	// <= rather than <, so that utf8.MaxRune itself is included when the
+	// incoming ranges stop at utf8.MaxRune-1.
+	if next <= utf8.MaxRune {
+		c.ranges = append(c.ranges, singleRange{next, utf8.MaxRune})
+	}
+
+	c.canonicalize()
+}
+
 func isValidUnicodeCat(catName string) bool {
 	_, ok := unicodeCategories[catName]
 	return ok
@@ -515,6 +538,53 @@ func (c *CharSet) addRange(chMin, chMax rune) {
 	c.canonicalize()
 }

+// addNamedASCII adds the named ASCII character class (the "foo" in
+// `[[:foo:]]`) to the set, or its complement when negate is true.
+//
+// The "digit", "space" and "word" classes are handed to addDigit, addSpace
+// and addWord respectively, with negate passed through; every other known
+// class is expressed as explicit ranges and added via addRanges or
+// addNegativeRanges.
+//
+// It returns false, leaving the set untouched, when name is not a recognized
+// class, and true otherwise.
+func (c *CharSet) addNamedASCII(name string, negate bool) bool {
+	var named []singleRange
+
+	switch name {
+	case "alnum":
+		named = []singleRange{{'0', '9'}, {'A', 'Z'}, {'a', 'z'}}
+	case "alpha":
+		named = []singleRange{{'A', 'Z'}, {'a', 'z'}}
+	case "ascii":
+		named = []singleRange{{0, 0x7f}}
+	case "blank":
+		named = []singleRange{{'\t', '\t'}, {' ', ' '}}
+	case "cntrl":
+		named = []singleRange{{0, 0x1f}, {0x7f, 0x7f}}
+	case "digit":
+		c.addDigit(false, negate, "")
+	case "graph":
+		named = []singleRange{{'!', '~'}}
+	case "lower":
+		named = []singleRange{{'a', 'z'}}
+	case "print":
+		named = []singleRange{{' ', '~'}}
+	case "punct": //[!-/:-@[-`{-~]
+		named = []singleRange{{'!', '/'}, {':', '@'}, {'[', '`'}, {'{', '~'}}
+	case "space":
+		c.addSpace(true, negate)
+	case "upper":
+		named = []singleRange{{'A', 'Z'}}
+	case "word":
+		c.addWord(true, negate)
+	case "xdigit":
+		named = []singleRange{{'0', '9'}, {'A', 'F'}, {'a', 'f'}}
+	default:
+		return false
+	}
+
+	switch {
+	case len(named) == 0:
+		// class was already applied by one of the helper calls above
+	case negate:
+		c.addNegativeRanges(named)
+	default:
+		c.addRanges(named)
+	}
+
+	return true
+}
+
 type singleRangeSorter []singleRange

 func (p singleRangeSorter) Len() int           { return len(p) }
@@ -21,6 +21,7 @@ const (
 	RightToLeft                          = 0x0040 // "r"
 	Debug                                = 0x0080 // "d"
 	ECMAScript                           = 0x0100 // "e"
+	RE2                                  = 0x0200 // RE2 compat mode
 )

 func optionFromCode(ch rune) RegexOptions {
@@ -310,7 +311,7 @@ func (p *parser) countCaptures() error {
 		switch ch {
 		case '\\':
 			if p.charsRight() > 0 {
-				p.moveRight(1)
+				p.scanBackslash(true)
 			}

 		case '#':
@@ -354,6 +355,14 @@ func (p *parser) countCaptures() error {
 								p.noteCaptureName(p.scanCapname(), pos)
 							}
 						}
+					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
+						// RE2-compat (?P<)
+						p.moveRight(2)
+						ch = p.rightChar(0)
+						if IsWordChar(ch) {
+							p.noteCaptureName(p.scanCapname(), pos)
+						}
+
 					} else {
 						// (?...

@@ -520,7 +529,7 @@ func (p *parser) scanRegex() (*regexNode, error) {
 			}

 		case '\\':
-			n, err := p.scanBackslash()
+			n, err := p.scanBackslash(false)
 			if err != nil {
 				return nil, err
 			}
@@ -1022,6 +1031,50 @@ func (p *parser) scanGroupOpen() (*regexNode, error) {
 				}
 			}

+		case 'P':
+			if p.useRE2() {
+				// support for P<name> syntax
+				if p.charsRight() < 3 {
+					goto BreakRecognize
+				}
+
+				ch = p.moveRightGetChar()
+				if ch != '<' {
+					goto BreakRecognize
+				}
+
+				ch = p.moveRightGetChar()
+				p.moveLeft()
+
+				if IsWordChar(ch) {
+					capnum := -1
+					capname := p.scanCapname()
+
+					if p.isCaptureName(capname) {
+						capnum = p.captureSlotFromName(capname)
+					}
+
+					// check if we have bogus character after the name
+					if p.charsRight() > 0 && p.rightChar(0) != '>' {
+						return nil, p.getErr(ErrInvalidGroupName)
+					}
+
+					// actually make the node
+
+					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
+						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
+					}
+					goto BreakRecognize
+
+				} else {
+					// bad group name - starts with something other than a word character and isn't a number
+					return nil, p.getErr(ErrInvalidGroupName)
+				}
+			}
+			// if we're not using RE2 compat mode then
+			// we just behave like normal
+			fallthrough
+
 		default:
 			p.moveLeft()

@@ -1055,7 +1108,7 @@ BreakRecognize:
 }

 // scans backslash specials and basics
-func (p *parser) scanBackslash() (*regexNode, error) {
+func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {

 	if p.charsRight() == 0 {
 		return nil, p.getErr(ErrIllegalEndEscape)
@@ -1123,12 +1176,12 @@ func (p *parser) scanBackslash() (*regexNode, error) {
 		return newRegexNodeSet(ntSet, p.options, cc), nil

 	default:
-		return p.scanBasicBackslash()
+		return p.scanBasicBackslash(scanOnly)
 	}
 }

 // Scans \-style backreferences and character escapes
-func (p *parser) scanBasicBackslash() (*regexNode, error) {
+func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
 	if p.charsRight() == 0 {
 		return nil, p.getErr(ErrIllegalEndEscape)
 	}
@@ -1184,15 +1237,19 @@ func (p *parser) scanBasicBackslash() (*regexNode, error) {
 		if p.charsRight() > 0 && p.moveRightGetChar() == close {
 			if p.isCaptureSlot(capnum) {
 				return newRegexNodeM(ntRef, p.options, capnum), nil
-			} else {
-				return nil, p.getErr(ErrUndefinedBackRef, capnum)
 			}
+			return nil, p.getErr(ErrUndefinedBackRef, capnum)
 		}
 	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
 		capnum, err := p.scanDecimal()
 		if err != nil {
 			return nil, err
 		}
+
+		if scanOnly {
+			return nil, nil
+		}
+
 		if p.useOptionE() || p.isCaptureSlot(capnum) {
 			return newRegexNodeM(ntRef, p.options, capnum), nil
 		}
@@ -1448,11 +1505,26 @@ func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
 				savePos := p.textpos()

 				p.moveRight(1)
-				p.scanCapname() // throwaway the name
+				negate := false
+				if p.charsRight() > 1 && p.rightChar(0) == '^' {
+					negate = true
+					p.moveRight(1)
+				}
+
+				nm := p.scanCapname() // snag the name
+				if !scanOnly && p.useRE2() {
+					// look up the name since these are valid for RE2
+					// add the group based on the name
+					if ok := cc.addNamedASCII(nm, negate); !ok {
+						return nil, p.getErr(ErrInvalidCharRange)
+					}
+				}
 				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
 					p.textto(savePos)
+				} else if p.useRE2() {
+					// move on
+					continue
 				}
-				// else lookup name (nyi)
 			}
 		}

@@ -1547,7 +1619,7 @@ func (p *parser) scanDecimal() (int, error) {

 // Returns true for options allowed only at the top level
 func isOnlyTopOption(option RegexOptions) bool {
-	return option == RightToLeft || option == ECMAScript
+	return option == RightToLeft || option == ECMAScript || option == RE2
 }

 // Scans cimsx-cimsx option string, stops at the first unrecognized char.
@@ -1861,6 +1933,11 @@ func (p *parser) useOptionE() bool {
 	return (p.options & ECMAScript) != 0
 }

+// useRE2 reports whether the RE2 option bit is set in the parser's current
+// options, i.e. whether RE2 compatibility parsing behavior applies.
+func (p *parser) useRE2() bool {
+	return p.options&RE2 != 0
+}
+
 // True if options stack is empty.
 func (p *parser) emptyOptionsStack() bool {
 	return len(p.optionsStack) == 0