blob: 4fa7d0e2f8330dca2cdf4388502c88459a05dafa [file] [log] [blame]
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package syntax
// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.
import (
"strconv"
"strings"
"unicode"
)
// A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
Op Op // operator
Flags Flags
Sub []*Regexp // subexpressions, if any
Sub0 [1]*Regexp // storage for short Sub
Rune []rune // matched runes, for OpLiteral, OpCharClass
Rune0 [2]rune // storage for short Rune
Min, Max int // min, max for OpRepeat
Cap int // capturing index, for OpCapture
Name string // capturing name, for OpCapture
}
//go:generate stringer -type Op -trimprefix Op
// An Op is a single regular expression operator.
type Op uint8
// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
const (
OpNoMatch Op = 1 + iota // matches no strings
OpEmptyMatch // matches empty string
OpLiteral // matches Runes sequence
OpCharClass // matches Runes interpreted as range pair list
OpAnyCharNotNL // matches any character except newline
OpAnyChar // matches any character
OpBeginLine // matches empty string at beginning of line
OpEndLine // matches empty string at end of line
OpBeginText // matches empty string at beginning of text
OpEndText // matches empty string at end of text
OpWordBoundary // matches word boundary `\b`
OpNoWordBoundary // matches word non-boundary `\B`
OpCapture // capturing subexpression with index Cap, optional name Name
OpStar // matches Sub[0] zero or more times
OpPlus // matches Sub[0] one or more times
OpQuest // matches Sub[0] zero or one times
OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
OpConcat // matches concatenation of Subs
OpAlternate // matches alternation of Subs
)
const opPseudo Op = 128 // where pseudo-ops start
// Equal reports whether x and y have identical structure.
func (x *Regexp) Equal(y *Regexp) bool {
if x == nil || y == nil {
return x == y
}
if x.Op != y.Op {
return false
}
switch x.Op {
case OpEndText:
// The parse flags remember whether this is \z or \Z.
if x.Flags&WasDollar != y.Flags&WasDollar {
return false
}
case OpLiteral, OpCharClass:
if len(x.Rune) != len(y.Rune) {
return false
}
for i, r := range x.Rune {
if r != y.Rune[i] {
return false
}
}
case OpAlternate, OpConcat:
if len(x.Sub) != len(y.Sub) {
return false
}
for i, sub := range x.Sub {
if !sub.Equal(y.Sub[i]) {
return false
}
}
case OpStar, OpPlus, OpQuest:
if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpRepeat:
if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
case OpCapture:
if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
return false
}
}
return true
}
// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
type printFlags uint8
const (
flagI printFlags = 1 << iota // (?i:
flagM // (?m:
flagS // (?s:
flagOff // )
flagPrec // (?: )
negShift = 5 // flagI<<negShift is (?-i:
)
// addSpan enables the flags f around start..last,
// by setting flags[start] = f and flags[last] = flagOff.
func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
if *flags == nil {
*flags = make(map[*Regexp]printFlags)
}
(*flags)[start] = f
(*flags)[last] |= flagOff // maybe start==last
}
// calcFlags calculates the flags to print around each subexpression in re,
// storing that information in (*flags)[sub] for each affected subexpression.
// The first time an entry needs to be written to *flags, calcFlags allocates the map.
// calcFlags also calculates the flags that must be active or can't be active
// around re and returns those flags.
func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
switch re.Op {
default:
return 0, 0
case OpLiteral:
// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
// according to whether (?i) is active.
// If literal is not fold-sensitive, return 0, 0.
for _, r := range re.Rune {
if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
if re.Flags&FoldCase != 0 {
return flagI, 0
} else {
return 0, flagI
}
}
}
return 0, 0
case OpCharClass:
// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
// If literal is not fold-sensitive, return 0, 0.
for i := 0; i < len(re.Rune); i += 2 {
lo := max(minFold, re.Rune[i])
hi := min(maxFold, re.Rune[i+1])
for r := lo; r <= hi; r++ {
for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
return 0, flagI
}
}
}
}
return 0, 0
case OpAnyCharNotNL: // (?-s).
return 0, flagS
case OpAnyChar: // (?s).
return flagS, 0
case OpBeginLine, OpEndLine: // (?m)^ (?m)$
return flagM, 0
case OpEndText:
if re.Flags&WasDollar != 0 { // (?-m)$
return 0, flagM
}
return 0, 0
case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
return calcFlags(re.Sub[0], flags)
case OpConcat, OpAlternate:
// Gather the must and cant for each subexpression.
// When we find a conflicting subexpression, insert the necessary
// flags around the previously identified span and start over.
var must, cant, allCant printFlags
start := 0
last := 0
did := false
for i, sub := range re.Sub {
subMust, subCant := calcFlags(sub, flags)
if must&subCant != 0 || subMust&cant != 0 {
if must != 0 {
addSpan(re.Sub[start], re.Sub[last], must, flags)
}
must = 0
cant = 0
start = i
did = true
}
must |= subMust
cant |= subCant
allCant |= subCant
if subMust != 0 {
last = i
}
if must == 0 && start == i {
start++
}
}
if !did {
// No conflicts: pass the accumulated must and cant upward.
return must, cant
}
if must != 0 {
// Conflicts found; need to finish final span.
addSpan(re.Sub[start], re.Sub[last], must, flags)
}
return 0, allCant
}
}
// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
f |= flags[re]
if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
// flagPrec is redundant with other flags being added and terminated
f &^= flagPrec
}
if f&^(flagOff|flagPrec) != 0 {
b.WriteString(`(?`)
if f&flagI != 0 {
b.WriteString(`i`)
}
if f&flagM != 0 {
b.WriteString(`m`)
}
if f&flagS != 0 {
b.WriteString(`s`)
}
if f&((flagM|flagS)<<negShift) != 0 {
b.WriteString(`-`)
if f&(flagM<<negShift) != 0 {
b.WriteString(`m`)
}
if f&(flagS<<negShift) != 0 {
b.WriteString(`s`)
}
}
b.WriteString(`:`)
}
if f&flagOff != 0 {
defer b.WriteString(`)`)
}
if f&flagPrec != 0 {
b.WriteString(`(?:`)
defer b.WriteString(`)`)
}
switch re.Op {
default:
b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
case OpNoMatch:
b.WriteString(`[^\x00-\x{10FFFF}]`)
case OpEmptyMatch:
b.WriteString(`(?:)`)
case OpLiteral:
for _, r := range re.Rune {
escape(b, r, false)
}
case OpCharClass:
if len(re.Rune)%2 != 0 {
b.WriteString(`[invalid char class]`)
break
}
b.WriteRune('[')
if len(re.Rune) == 0 {
b.WriteString(`^\x00-\x{10FFFF}`)
} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
// Contains 0 and MaxRune. Probably a negated class.
// Print the gaps.
b.WriteRune('^')
for i := 1; i < len(re.Rune)-1; i += 2 {
lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
escape(b, lo, lo == '-')
if lo != hi {
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
} else {
for i := 0; i < len(re.Rune); i += 2 {
lo, hi := re.Rune[i], re.Rune[i+1]
escape(b, lo, lo == '-')
if lo != hi {
if hi != lo+1 {
b.WriteRune('-')
}
escape(b, hi, hi == '-')
}
}
}
b.WriteRune(']')
case OpAnyCharNotNL, OpAnyChar:
b.WriteString(`.`)
case OpBeginLine:
b.WriteString(`^`)
case OpEndLine:
b.WriteString(`$`)
case OpBeginText:
b.WriteString(`\A`)
case OpEndText:
if re.Flags&WasDollar != 0 {
b.WriteString(`$`)
} else {
b.WriteString(`\z`)
}
case OpWordBoundary:
b.WriteString(`\b`)
case OpNoWordBoundary:
b.WriteString(`\B`)
case OpCapture:
if re.Name != "" {
b.WriteString(`(?P<`)
b.WriteString(re.Name)
b.WriteRune('>')
} else {
b.WriteRune('(')
}
if re.Sub[0].Op != OpEmptyMatch {
writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
}
b.WriteRune(')')
case OpStar, OpPlus, OpQuest, OpRepeat:
p := printFlags(0)
sub := re.Sub[0]
if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
p = flagPrec
}
writeRegexp(b, sub, p, flags)
switch re.Op {
case OpStar:
b.WriteRune('*')
case OpPlus:
b.WriteRune('+')
case OpQuest:
b.WriteRune('?')
case OpRepeat:
b.WriteRune('{')
b.WriteString(strconv.Itoa(re.Min))
if re.Max != re.Min {
b.WriteRune(',')
if re.Max >= 0 {
b.WriteString(strconv.Itoa(re.Max))
}
}
b.WriteRune('}')
}
if re.Flags&NonGreedy != 0 {
b.WriteRune('?')
}
case OpConcat:
for _, sub := range re.Sub {
p := printFlags(0)
if sub.Op == OpAlternate {
p = flagPrec
}
writeRegexp(b, sub, p, flags)
}
case OpAlternate:
for i, sub := range re.Sub {
if i > 0 {
b.WriteRune('|')
}
writeRegexp(b, sub, 0, flags)
}
}
}
func (re *Regexp) String() string {
var b strings.Builder
var flags map[*Regexp]printFlags
must, cant := calcFlags(re, &flags)
must |= (cant &^ flagI) << negShift
if must != 0 {
must |= flagOff
}
writeRegexp(&b, re, must, flags)
return b.String()
}
const meta = `\.+*?()|[]{}^$`
func escape(b *strings.Builder, r rune, force bool) {
if unicode.IsPrint(r) {
if strings.ContainsRune(meta, r) || force {
b.WriteRune('\\')
}
b.WriteRune(r)
return
}
switch r {
case '\a':
b.WriteString(`\a`)
case '\f':
b.WriteString(`\f`)
case '\n':
b.WriteString(`\n`)
case '\r':
b.WriteString(`\r`)
case '\t':
b.WriteString(`\t`)
case '\v':
b.WriteString(`\v`)
default:
if r < 0x100 {
b.WriteString(`\x`)
s := strconv.FormatInt(int64(r), 16)
if len(s) == 1 {
b.WriteRune('0')
}
b.WriteString(s)
break
}
b.WriteString(`\x{`)
b.WriteString(strconv.FormatInt(int64(r), 16))
b.WriteString(`}`)
}
}
// MaxCap walks the regexp to find the maximum capture index.
func (re *Regexp) MaxCap() int {
m := 0
if re.Op == OpCapture {
m = re.Cap
}
for _, sub := range re.Sub {
if n := sub.MaxCap(); m < n {
m = n
}
}
return m
}
// CapNames walks the regexp to find the names of capturing groups.
func (re *Regexp) CapNames() []string {
names := make([]string, re.MaxCap()+1)
re.capNames(names)
return names
}
func (re *Regexp) capNames(names []string) {
if re.Op == OpCapture {
names[re.Cap] = re.Name
}
for _, sub := range re.Sub {
sub.capNames(names)
}
}