mirror of
https://github.com/cheat/cheat.git
synced 2024-11-22 14:01:36 +01:00
501 lines
11 KiB
Go
501 lines
11 KiB
Go
package syntax
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
)
|
|
|
|
func Write(tree *RegexTree) (*Code, error) {
|
|
w := writer{
|
|
intStack: make([]int, 0, 32),
|
|
emitted: make([]int, 2),
|
|
stringhash: make(map[string]int),
|
|
sethash: make(map[string]int),
|
|
}
|
|
|
|
code, err := w.codeFromTree(tree)
|
|
|
|
if tree.options&Debug > 0 && code != nil {
|
|
os.Stdout.WriteString(code.Dump())
|
|
os.Stdout.WriteString("\n")
|
|
}
|
|
|
|
return code, err
|
|
}
|
|
|
|
type writer struct {
|
|
emitted []int
|
|
|
|
intStack []int
|
|
curpos int
|
|
stringhash map[string]int
|
|
stringtable [][]rune
|
|
sethash map[string]int
|
|
settable []*CharSet
|
|
counting bool
|
|
count int
|
|
trackcount int
|
|
caps map[int]int
|
|
}
|
|
|
|
const (
|
|
beforeChild nodeType = 64
|
|
afterChild = 128
|
|
//MaxPrefixSize is the largest number of runes we'll use for a BoyerMoyer prefix
|
|
MaxPrefixSize = 50
|
|
)
|
|
|
|
// The top level RegexCode generator. It does a depth-first walk
|
|
// through the tree and calls EmitFragment to emits code before
|
|
// and after each child of an interior node, and at each leaf.
|
|
//
|
|
// It runs two passes, first to count the size of the generated
|
|
// code, and second to generate the code.
|
|
//
|
|
// We should time it against the alternative, which is
|
|
// to just generate the code and grow the array as we go.
|
|
func (w *writer) codeFromTree(tree *RegexTree) (*Code, error) {
|
|
var (
|
|
curNode *regexNode
|
|
curChild int
|
|
capsize int
|
|
)
|
|
// construct sparse capnum mapping if some numbers are unused
|
|
|
|
if tree.capnumlist == nil || tree.captop == len(tree.capnumlist) {
|
|
capsize = tree.captop
|
|
w.caps = nil
|
|
} else {
|
|
capsize = len(tree.capnumlist)
|
|
w.caps = tree.caps
|
|
for i := 0; i < len(tree.capnumlist); i++ {
|
|
w.caps[tree.capnumlist[i]] = i
|
|
}
|
|
}
|
|
|
|
w.counting = true
|
|
|
|
for {
|
|
if !w.counting {
|
|
w.emitted = make([]int, w.count)
|
|
}
|
|
|
|
curNode = tree.root
|
|
curChild = 0
|
|
|
|
w.emit1(Lazybranch, 0)
|
|
|
|
for {
|
|
if len(curNode.children) == 0 {
|
|
w.emitFragment(curNode.t, curNode, 0)
|
|
} else if curChild < len(curNode.children) {
|
|
w.emitFragment(curNode.t|beforeChild, curNode, curChild)
|
|
|
|
curNode = curNode.children[curChild]
|
|
|
|
w.pushInt(curChild)
|
|
curChild = 0
|
|
continue
|
|
}
|
|
|
|
if w.emptyStack() {
|
|
break
|
|
}
|
|
|
|
curChild = w.popInt()
|
|
curNode = curNode.next
|
|
|
|
w.emitFragment(curNode.t|afterChild, curNode, curChild)
|
|
curChild++
|
|
}
|
|
|
|
w.patchJump(0, w.curPos())
|
|
w.emit(Stop)
|
|
|
|
if !w.counting {
|
|
break
|
|
}
|
|
|
|
w.counting = false
|
|
}
|
|
|
|
fcPrefix := getFirstCharsPrefix(tree)
|
|
prefix := getPrefix(tree)
|
|
rtl := (tree.options & RightToLeft) != 0
|
|
|
|
var bmPrefix *BmPrefix
|
|
//TODO: benchmark string prefixes
|
|
if prefix != nil && len(prefix.PrefixStr) > 0 && MaxPrefixSize > 0 {
|
|
if len(prefix.PrefixStr) > MaxPrefixSize {
|
|
// limit prefix changes to 10k
|
|
prefix.PrefixStr = prefix.PrefixStr[:MaxPrefixSize]
|
|
}
|
|
bmPrefix = newBmPrefix(prefix.PrefixStr, prefix.CaseInsensitive, rtl)
|
|
} else {
|
|
bmPrefix = nil
|
|
}
|
|
|
|
return &Code{
|
|
Codes: w.emitted,
|
|
Strings: w.stringtable,
|
|
Sets: w.settable,
|
|
TrackCount: w.trackcount,
|
|
Caps: w.caps,
|
|
Capsize: capsize,
|
|
FcPrefix: fcPrefix,
|
|
BmPrefix: bmPrefix,
|
|
Anchors: getAnchors(tree),
|
|
RightToLeft: rtl,
|
|
}, nil
|
|
}
|
|
|
|
// The main RegexCode generator. It does a depth-first walk
|
|
// through the tree and calls EmitFragment to emits code before
|
|
// and after each child of an interior node, and at each leaf.
|
|
func (w *writer) emitFragment(nodetype nodeType, node *regexNode, curIndex int) error {
|
|
bits := InstOp(0)
|
|
|
|
if nodetype <= ntRef {
|
|
if (node.options & RightToLeft) != 0 {
|
|
bits |= Rtl
|
|
}
|
|
if (node.options & IgnoreCase) != 0 {
|
|
bits |= Ci
|
|
}
|
|
}
|
|
ntBits := nodeType(bits)
|
|
|
|
switch nodetype {
|
|
case ntConcatenate | beforeChild, ntConcatenate | afterChild, ntEmpty:
|
|
break
|
|
|
|
case ntAlternate | beforeChild:
|
|
if curIndex < len(node.children)-1 {
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
}
|
|
|
|
case ntAlternate | afterChild:
|
|
if curIndex < len(node.children)-1 {
|
|
lbPos := w.popInt()
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
w.patchJump(lbPos, w.curPos())
|
|
} else {
|
|
for i := 0; i < curIndex; i++ {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
}
|
|
break
|
|
|
|
case ntTestref | beforeChild:
|
|
if curIndex == 0 {
|
|
w.emit(Setjump)
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
w.emit1(Testref, w.mapCapnum(node.m))
|
|
w.emit(Forejump)
|
|
}
|
|
|
|
case ntTestref | afterChild:
|
|
if curIndex == 0 {
|
|
branchpos := w.popInt()
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
w.patchJump(branchpos, w.curPos())
|
|
w.emit(Forejump)
|
|
if len(node.children) <= 1 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
} else if curIndex == 1 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
|
|
case ntTestgroup | beforeChild:
|
|
if curIndex == 0 {
|
|
w.emit(Setjump)
|
|
w.emit(Setmark)
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
}
|
|
|
|
case ntTestgroup | afterChild:
|
|
if curIndex == 0 {
|
|
w.emit(Getmark)
|
|
w.emit(Forejump)
|
|
} else if curIndex == 1 {
|
|
Branchpos := w.popInt()
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
w.patchJump(Branchpos, w.curPos())
|
|
w.emit(Getmark)
|
|
w.emit(Forejump)
|
|
if len(node.children) <= 2 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
} else if curIndex == 2 {
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
}
|
|
|
|
case ntLoop | beforeChild, ntLazyloop | beforeChild:
|
|
|
|
if node.n < math.MaxInt32 || node.m > 1 {
|
|
if node.m == 0 {
|
|
w.emit1(Nullcount, 0)
|
|
} else {
|
|
w.emit1(Setcount, 1-node.m)
|
|
}
|
|
} else if node.m == 0 {
|
|
w.emit(Nullmark)
|
|
} else {
|
|
w.emit(Setmark)
|
|
}
|
|
|
|
if node.m == 0 {
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Goto, 0)
|
|
}
|
|
w.pushInt(w.curPos())
|
|
|
|
case ntLoop | afterChild, ntLazyloop | afterChild:
|
|
|
|
startJumpPos := w.curPos()
|
|
lazy := (nodetype - (ntLoop | afterChild))
|
|
|
|
if node.n < math.MaxInt32 || node.m > 1 {
|
|
if node.n == math.MaxInt32 {
|
|
w.emit2(InstOp(Branchcount+lazy), w.popInt(), math.MaxInt32)
|
|
} else {
|
|
w.emit2(InstOp(Branchcount+lazy), w.popInt(), node.n-node.m)
|
|
}
|
|
} else {
|
|
w.emit1(InstOp(Branchmark+lazy), w.popInt())
|
|
}
|
|
|
|
if node.m == 0 {
|
|
w.patchJump(w.popInt(), startJumpPos)
|
|
}
|
|
|
|
case ntGroup | beforeChild, ntGroup | afterChild:
|
|
|
|
case ntCapture | beforeChild:
|
|
w.emit(Setmark)
|
|
|
|
case ntCapture | afterChild:
|
|
w.emit2(Capturemark, w.mapCapnum(node.m), w.mapCapnum(node.n))
|
|
|
|
case ntRequire | beforeChild:
|
|
// NOTE: the following line causes lookahead/lookbehind to be
|
|
// NON-BACKTRACKING. It can be commented out with (*)
|
|
w.emit(Setjump)
|
|
|
|
w.emit(Setmark)
|
|
|
|
case ntRequire | afterChild:
|
|
w.emit(Getmark)
|
|
|
|
// NOTE: the following line causes lookahead/lookbehind to be
|
|
// NON-BACKTRACKING. It can be commented out with (*)
|
|
w.emit(Forejump)
|
|
|
|
case ntPrevent | beforeChild:
|
|
w.emit(Setjump)
|
|
w.pushInt(w.curPos())
|
|
w.emit1(Lazybranch, 0)
|
|
|
|
case ntPrevent | afterChild:
|
|
w.emit(Backjump)
|
|
w.patchJump(w.popInt(), w.curPos())
|
|
w.emit(Forejump)
|
|
|
|
case ntGreedy | beforeChild:
|
|
w.emit(Setjump)
|
|
|
|
case ntGreedy | afterChild:
|
|
w.emit(Forejump)
|
|
|
|
case ntOne, ntNotone:
|
|
w.emit1(InstOp(node.t|ntBits), int(node.ch))
|
|
|
|
case ntNotoneloop, ntNotonelazy, ntOneloop, ntOnelazy:
|
|
if node.m > 0 {
|
|
if node.t == ntOneloop || node.t == ntOnelazy {
|
|
w.emit2(Onerep|bits, int(node.ch), node.m)
|
|
} else {
|
|
w.emit2(Notonerep|bits, int(node.ch), node.m)
|
|
}
|
|
}
|
|
if node.n > node.m {
|
|
if node.n == math.MaxInt32 {
|
|
w.emit2(InstOp(node.t|ntBits), int(node.ch), math.MaxInt32)
|
|
} else {
|
|
w.emit2(InstOp(node.t|ntBits), int(node.ch), node.n-node.m)
|
|
}
|
|
}
|
|
|
|
case ntSetloop, ntSetlazy:
|
|
if node.m > 0 {
|
|
w.emit2(Setrep|bits, w.setCode(node.set), node.m)
|
|
}
|
|
if node.n > node.m {
|
|
if node.n == math.MaxInt32 {
|
|
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), math.MaxInt32)
|
|
} else {
|
|
w.emit2(InstOp(node.t|ntBits), w.setCode(node.set), node.n-node.m)
|
|
}
|
|
}
|
|
|
|
case ntMulti:
|
|
w.emit1(InstOp(node.t|ntBits), w.stringCode(node.str))
|
|
|
|
case ntSet:
|
|
w.emit1(InstOp(node.t|ntBits), w.setCode(node.set))
|
|
|
|
case ntRef:
|
|
w.emit1(InstOp(node.t|ntBits), w.mapCapnum(node.m))
|
|
|
|
case ntNothing, ntBol, ntEol, ntBoundary, ntNonboundary, ntECMABoundary, ntNonECMABoundary, ntBeginning, ntStart, ntEndZ, ntEnd:
|
|
w.emit(InstOp(node.t))
|
|
|
|
default:
|
|
return fmt.Errorf("unexpected opcode in regular expression generation: %v", nodetype)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// To avoid recursion, we use a simple integer stack.
|
|
// This is the push.
|
|
func (w *writer) pushInt(i int) {
|
|
w.intStack = append(w.intStack, i)
|
|
}
|
|
|
|
// Returns true if the stack is empty.
|
|
func (w *writer) emptyStack() bool {
|
|
return len(w.intStack) == 0
|
|
}
|
|
|
|
// This is the pop.
|
|
func (w *writer) popInt() int {
|
|
//get our item
|
|
idx := len(w.intStack) - 1
|
|
i := w.intStack[idx]
|
|
//trim our slice
|
|
w.intStack = w.intStack[:idx]
|
|
return i
|
|
}
|
|
|
|
// Returns the current position in the emitted code.
|
|
func (w *writer) curPos() int {
|
|
return w.curpos
|
|
}
|
|
|
|
// Fixes up a jump instruction at the specified offset
|
|
// so that it jumps to the specified jumpDest.
|
|
func (w *writer) patchJump(offset, jumpDest int) {
|
|
w.emitted[offset+1] = jumpDest
|
|
}
|
|
|
|
// Returns an index in the set table for a charset
|
|
// uses a map to eliminate duplicates.
|
|
func (w *writer) setCode(set *CharSet) int {
|
|
if w.counting {
|
|
return 0
|
|
}
|
|
|
|
buf := &bytes.Buffer{}
|
|
|
|
set.mapHashFill(buf)
|
|
hash := buf.String()
|
|
i, ok := w.sethash[hash]
|
|
if !ok {
|
|
i = len(w.sethash)
|
|
w.sethash[hash] = i
|
|
w.settable = append(w.settable, set)
|
|
}
|
|
return i
|
|
}
|
|
|
|
// Returns an index in the string table for a string.
|
|
// uses a map to eliminate duplicates.
|
|
func (w *writer) stringCode(str []rune) int {
|
|
if w.counting {
|
|
return 0
|
|
}
|
|
|
|
hash := string(str)
|
|
i, ok := w.stringhash[hash]
|
|
if !ok {
|
|
i = len(w.stringhash)
|
|
w.stringhash[hash] = i
|
|
w.stringtable = append(w.stringtable, str)
|
|
}
|
|
|
|
return i
|
|
}
|
|
|
|
// When generating code on a regex that uses a sparse set
|
|
// of capture slots, we hash them to a dense set of indices
|
|
// for an array of capture slots. Instead of doing the hash
|
|
// at match time, it's done at compile time, here.
|
|
func (w *writer) mapCapnum(capnum int) int {
|
|
if capnum == -1 {
|
|
return -1
|
|
}
|
|
|
|
if w.caps != nil {
|
|
return w.caps[capnum]
|
|
}
|
|
|
|
return capnum
|
|
}
|
|
|
|
// Emits a zero-argument operation. Note that the emit
|
|
// functions all run in two modes: they can emit code, or
|
|
// they can just count the size of the code.
|
|
func (w *writer) emit(op InstOp) {
|
|
if w.counting {
|
|
w.count++
|
|
if opcodeBacktracks(op) {
|
|
w.trackcount++
|
|
}
|
|
return
|
|
}
|
|
w.emitted[w.curpos] = int(op)
|
|
w.curpos++
|
|
}
|
|
|
|
// Emits a one-argument operation.
|
|
func (w *writer) emit1(op InstOp, opd1 int) {
|
|
if w.counting {
|
|
w.count += 2
|
|
if opcodeBacktracks(op) {
|
|
w.trackcount++
|
|
}
|
|
return
|
|
}
|
|
w.emitted[w.curpos] = int(op)
|
|
w.curpos++
|
|
w.emitted[w.curpos] = opd1
|
|
w.curpos++
|
|
}
|
|
|
|
// Emits a two-argument operation.
|
|
func (w *writer) emit2(op InstOp, opd1, opd2 int) {
|
|
if w.counting {
|
|
w.count += 3
|
|
if opcodeBacktracks(op) {
|
|
w.trackcount++
|
|
}
|
|
return
|
|
}
|
|
w.emitted[w.curpos] = int(op)
|
|
w.curpos++
|
|
w.emitted[w.curpos] = opd1
|
|
w.curpos++
|
|
w.emitted[w.curpos] = opd2
|
|
w.curpos++
|
|
}
|