From 8b3a5d153b7b255bafd1a82d61505088356d0458 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 28 Sep 2022 11:18:51 -0400 Subject: [PATCH] regexp: limit size of parsed regexps Set a 128 MB limit on the amount of space used by []syntax.Inst in the compiled form corresponding to a given regexp. Also set a 128 MB limit on the rune storage in the *syntax.Regexp tree itself. Thanks to Adam Korczynski (ADA Logics) and OSS-Fuzz for reporting this issue. Fixes CVE-2022-41715. Updates #55949. Fixes #55950. Change-Id: Ia656baed81564436368cf950e1c5409752f28e1b Reviewed-on: https://team-review.git.corp.google.com/c/golang/go-private/+/1592136 TryBot-Result: Security TryBots Reviewed-by: Damien Neil Run-TryBot: Roland Shoemaker Reviewed-by: Julie Qiu Reviewed-on: https://go-review.googlesource.com/c/go/+/438501 Run-TryBot: Carlos Amedee Reviewed-by: Carlos Amedee Reviewed-by: Dmitri Shuralyov TryBot-Result: Gopher Robot Reviewed-by: Dmitri Shuralyov --- src/regexp/syntax/parse.go | 222 +++++++++++++++++++++++++++++++- src/regexp/syntax/parse_test.go | 11 +- 2 files changed, 224 insertions(+), 9 deletions(-) diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go index 7b40309..67254d6 100644 --- a/src/regexp/syntax/parse.go +++ b/src/regexp/syntax/parse.go @@ -43,6 +43,7 @@ const ( ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" ErrUnexpectedParen ErrorCode = "unexpected )" + ErrNestingDepth ErrorCode = "expression nests too deeply" ) func (e ErrorCode) String() string { @@ -76,13 +77,63 @@ const ( opVerticalBar ) +// maxHeight is the maximum height of a regexp parse tree. +// It is somewhat arbitrarily chosen, but the idea is to be large enough +// that no one will actually hit in real use but at the same time small enough +// that recursion on the Regexp tree will not hit the 1GB Go stack limit. +// The maximum amount of stack for a single recursive frame is probably +// closer to 1kB, so this could potentially be raised, but it seems unlikely +// that people have regexps nested even this deeply. +// We ran a test on Google's C++ code base and turned up only +// a single use case with depth > 100; it had depth 128. +// Using depth 1000 should be plenty of margin. +// As an optimization, we don't even bother calculating heights +// until we've allocated at least maxHeight Regexp structures. +const maxHeight = 1000 + +// maxSize is the maximum size of a compiled regexp in Insts. +// It too is somewhat arbitrarily chosen, but the idea is to be large enough +// to allow significant regexps while at the same time small enough that +// the compiled form will not take up too much memory. +// 128 MB is enough for a 3.3 million Inst structures, which roughly +// corresponds to a 3.3 MB regexp. +const ( + maxSize = 128 << 20 / instSize + instSize = 5 * 8 // byte, 2 uint32, slice is 5 64-bit words +) + +// maxRunes is the maximum number of runes allowed in a regexp tree +// counting the runes in all the nodes. +// Ignoring character classes p.numRunes is always less than the length of the regexp. +// Character classes can make it much larger: each \pL adds 1292 runes. +// 128 MB is enough for 32M runes, which is over 26k \pL instances. +// Note that repetitions do not make copies of the rune slices, +// so \pL{1000} is only one rune slice, not 1000. +// We could keep a cache of character classes we've seen, +// so that all the \pL we see use the same rune list, +// but that doesn't remove the problem entirely: +// consider something like [\pL01234][\pL01235][\pL01236]...[\pL^&*()]. +// And because the Rune slice is exposed directly in the Regexp, +// there is not an opportunity to change the representation to allow +// partial sharing between different character classes. +// So the limit is the best we can do. +const ( + maxRunes = 128 << 20 / runeSize + runeSize = 4 // rune is int32 +) + type parser struct { flags Flags // parse mode flags stack []*Regexp // stack of parsed expressions free *Regexp numCap int // number of capturing groups seen wholeRegexp string - tmpClass []rune // temporary char class work space + tmpClass []rune // temporary char class work space + numRegexp int // number of regexps allocated + numRunes int // number of runes in char classes + repeats int64 // product of all repetitions seen + height map[*Regexp]int // regexp height, for height limit check + size map[*Regexp]int64 // regexp compiled size, for size limit check } func (p *parser) newRegexp(op Op) *Regexp { @@ -92,20 +143,155 @@ func (p *parser) newRegexp(op Op) *Regexp { *re = Regexp{} } else { re = new(Regexp) + p.numRegexp++ } re.Op = op return re } func (p *parser) reuse(re *Regexp) { + if p.height != nil { + delete(p.height, re) + } re.Sub0[0] = p.free p.free = re } +func (p *parser) checkLimits(re *Regexp) { + if p.numRunes > maxRunes { + panic(ErrInternalError) + } + p.checkSize(re) + p.checkHeight(re) +} + +func (p *parser) checkSize(re *Regexp) { + if p.size == nil { + // We haven't started tracking size yet. + // Do a relatively cheap check to see if we need to start. + // Maintain the product of all the repeats we've seen + // and don't track if the total number of regexp nodes + // we've seen times the repeat product is in budget. + if p.repeats == 0 { + p.repeats = 1 + } + if re.Op == OpRepeat { + n := re.Max + if n == -1 { + n = re.Min + } + if n <= 0 { + n = 1 + } + if int64(n) > maxSize/p.repeats { + p.repeats = maxSize + } else { + p.repeats *= int64(n) + } + } + if int64(p.numRegexp) < maxSize/p.repeats { + return + } + + // We need to start tracking size. + // Make the map and belatedly populate it + // with info about everything we've constructed so far. + p.size = make(map[*Regexp]int64) + for _, re := range p.stack { + p.checkSize(re) + } + } + + if p.calcSize(re, true) > maxSize { + panic(ErrInternalError) + } +} + +func (p *parser) calcSize(re *Regexp, force bool) int64 { + if !force { + if size, ok := p.size[re]; ok { + return size + } + } + + var size int64 + switch re.Op { + case OpLiteral: + size = int64(len(re.Rune)) + case OpCapture, OpStar: + // star can be 1+ or 2+; assume 2 pessimistically + size = 2 + p.calcSize(re.Sub[0], false) + case OpPlus, OpQuest: + size = 1 + p.calcSize(re.Sub[0], false) + case OpConcat: + for _, sub := range re.Sub { + size += p.calcSize(sub, false) + } + case OpAlternate: + for _, sub := range re.Sub { + size += p.calcSize(sub, false) + } + if len(re.Sub) > 1 { + size += int64(len(re.Sub)) - 1 + } + case OpRepeat: + sub := p.calcSize(re.Sub[0], false) + if re.Max == -1 { + if re.Min == 0 { + size = 2 + sub // x* + } else { + size = 1 + int64(re.Min)*sub // xxx+ + } + break + } + // x{2,5} = xx(x(x(x)?)?)? + size = int64(re.Max)*sub + int64(re.Max-re.Min) + } + + if size < 1 { + size = 1 + } + p.size[re] = size + return size +} + +func (p *parser) checkHeight(re *Regexp) { + if p.numRegexp < maxHeight { + return + } + if p.height == nil { + p.height = make(map[*Regexp]int) + for _, re := range p.stack { + p.checkHeight(re) + } + } + if p.calcHeight(re, true) > maxHeight { + panic(ErrNestingDepth) + } +} + +func (p *parser) calcHeight(re *Regexp, force bool) int { + if !force { + if h, ok := p.height[re]; ok { + return h + } + } + h := 1 + for _, sub := range re.Sub { + hsub := p.calcHeight(sub, false) + if h < 1+hsub { + h = 1 + hsub + } + } + p.height[re] = h + return h +} + // Parse stack manipulation. // push pushes the regexp re onto the parse stack and returns the regexp. func (p *parser) push(re *Regexp) *Regexp { + p.numRunes += len(re.Rune) if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] { // Single rune. if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) { @@ -137,6 +323,7 @@ func (p *parser) push(re *Regexp) *Regexp { } p.stack = append(p.stack, re) + p.checkLimits(re) return re } @@ -246,6 +433,7 @@ func (p *parser) repeat(op Op, min, max int, before, after, lastRepeat string) ( re.Sub = re.Sub0[:1] re.Sub[0] = sub p.stack[n-1] = re + p.checkLimits(re) if op == OpRepeat && (min >= 2 || max >= 2) && !repeatIsValid(re, 1000) { return "", &Error{ErrInvalidRepeatSize, before[:len(before)-len(after)]} @@ -390,12 +578,16 @@ func (p *parser) collapse(subs []*Regexp, op Op) *Regexp { // frees (passes to p.reuse) any removed *Regexps. // // For example, -// ABC|ABD|AEF|BCX|BCY +// +// ABC|ABD|AEF|BCX|BCY +// // simplifies by literal prefix extraction to -// A(B(C|D)|EF)|BC(X|Y) +// +// A(B(C|D)|EF)|BC(X|Y) +// // which simplifies by character class introduction to -// A(B[CD]|EF)|BC[XY] // +// A(B[CD]|EF)|BC[XY] func (p *parser) factor(sub []*Regexp) []*Regexp { if len(sub) < 2 { return sub @@ -449,6 +641,7 @@ func (p *parser) factor(sub []*Regexp) []*Regexp { for j := start; j < i; j++ { sub[j] = p.removeLeadingString(sub[j], len(str)) + p.checkLimits(sub[j]) } suffix := p.collapse(sub[start:i], OpAlternate) // recurse @@ -506,6 +699,7 @@ func (p *parser) factor(sub []*Regexp) []*Regexp { for j := start; j < i; j++ { reuse := j != start // prefix came from sub[start] sub[j] = p.removeLeadingRegexp(sub[j], reuse) + p.checkLimits(sub[j]) } suffix := p.collapse(sub[start:i], OpAlternate) // recurse @@ -693,6 +887,23 @@ func literalRegexp(s string, flags Flags) *Regexp { // Flags, and returns a regular expression parse tree. The syntax is // described in the top-level comment. func Parse(s string, flags Flags) (*Regexp, error) { + return parse(s, flags) +} + +func parse(s string, flags Flags) (_ *Regexp, err error) { + defer func() { + switch r := recover(); r { + default: + panic(r) + case nil: + // ok + case ErrInternalError: // too big + err = &Error{Code: ErrInternalError, Expr: s} + case ErrNestingDepth: + err = &Error{Code: ErrNestingDepth, Expr: s} + } + }() + if flags&Literal != 0 { // Trivial parser for literal string. if err := checkUTF8(s); err != nil { @@ -704,7 +915,6 @@ func Parse(s string, flags Flags) (*Regexp, error) { // Otherwise, must do real work. var ( p parser - err error c rune op Op lastRepeat string @@ -1733,7 +1943,7 @@ func appendClass(r []rune, x []rune) []rune { return r } -// appendFolded returns the result of appending the case folding of the class x to the class r. +// appendFoldedClass returns the result of appending the case folding of the class x to the class r. func appendFoldedClass(r []rune, x []rune) []rune { for i := 0; i < len(x); i += 2 { r = appendFoldedRange(r, x[i], x[i+1]) diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go index 5581ba1..6044da6 100644 --- a/src/regexp/syntax/parse_test.go +++ b/src/regexp/syntax/parse_test.go @@ -479,10 +479,15 @@ var invalidRegexps = []string{ `(?P<>a)`, `[a-Z]`, `(?i)[a-Z]`, - `a{100000}`, - `a{100000,}`, - "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", `\Q\E*`, + `a{100000}`, // too much repetition + `a{100000,}`, // too much repetition + "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})", // too much repetition + strings.Repeat("(", 1000) + strings.Repeat(")", 1000), // too deep + strings.Repeat("(?:", 1000) + strings.Repeat(")*", 1000), // too deep + "(" + strings.Repeat("(xx?)", 1000) + "){1000}", // too long + strings.Repeat("(xx?){1000}", 1000), // too long + strings.Repeat(`\pL`, 27000), // too many runes } var onlyPerl = []string{ -- 2.33.0