120 lines
3.3 KiB
Go
120 lines
3.3 KiB
Go
package parser
|
|
|
|
import (
|
|
"bufio"
|
|
"io"
|
|
"strings"
|
|
"unicode/utf8"
|
|
|
|
"github.com/go-errors/errors"
|
|
"github.com/spf13/viper"
|
|
"github.com/supabase/cli/pkg/cast"
|
|
)
|
|
|
|
// startBufSize is the initial size, in bytes, of the buffer that Split hands
// to its scanner. Equal to `startBufSize` from `bufio/scan.go`.
const startBufSize = 4096
|
|
|
|
// MaxScannerCapacity is the fallback maximum token size, in bytes, that Split
// gives its scanner when the SCANNER_BUFFER_SIZE setting resolves to 0.
// The bufio default of 64 * 1024 is not enough for certain lines
// containing e.g. geographical data. 256K ought to be enough for anybody...
var MaxScannerCapacity = 256 * 1024
|
|
|
|
// tokenizer carries the scanning state that ScanToken needs between calls.
//
// State transition table for tokenizer:
//
// Ready -> Ready (default)
// Ready -> Error (on invalid syntax)
// Ready -> Done (on ;, emit token)
// Ready -> Done (on EOF, emit token)
//
// Ready -> Comment (on --)
// Comment -> Comment (default)
// Comment -> Ready (on \n)
//
// Ready -> Block (on /*)
// Block -> Block (on /*, +-depth)
// Block -> Ready (on */, depth 0)
//
// Ready -> Quote (on ')
// Quote -> Quote (on '', default)
// Quote -> Ready (on ')
//
// Ready -> Dollar (on $tag$)
// Dollar -> Dollar (default)
// Dollar -> Ready (on $tag$)
//
// Ready -> Escape (on \)
// Escape -> Ready (on next)
type tokenizer struct {
	// state is the current FSM state; ScanToken treats a nil result from
	// state.Next as "token complete" and resets it to a fresh ReadyState.
	state State
	// last is the byte offset into the scanner's buffer where ScanToken
	// resumes after it has asked for more data; reset to 0 on every emit.
	last int
}
|
|
|
|
func (t *tokenizer) ScanToken(data []byte, atEOF bool) (advance int, token []byte, err error) {
|
|
// If we requested more data, resume from last position.
|
|
for width := 1; t.last < len(data); t.last += width {
|
|
r, width := utf8.DecodeRune(data[t.last:])
|
|
end := t.last + width
|
|
t.state = t.state.Next(r, data[:end])
|
|
// Emit token
|
|
if t.state == nil {
|
|
t.last = 0
|
|
t.state = &ReadyState{}
|
|
return end, data[:end], nil
|
|
}
|
|
}
|
|
if !atEOF || len(data) == 0 {
|
|
// Request more data or end the stream
|
|
return 0, nil, nil
|
|
}
|
|
// We're at EOF. If we have a final, non-terminated token, return it.
|
|
return len(data), data, nil
|
|
}
|
|
|
|
// Use bufio.Scanner to split a PostgreSQL string into multiple statements.
|
|
//
|
|
// The core problem is to figure out whether the current ; separator is inside
|
|
// an escaped string literal. PostgreSQL has multiple ways of opening a string
|
|
// literal, $$, ', --, /*, etc. We use a FSM to guarantee these states are
|
|
// entered exclusively. If not in one of the above escape states, the next ;
|
|
// token can be parsed as statement separator.
|
|
//
|
|
// Each statement is split as it is, without removing comments or white spaces.
|
|
func Split(sql io.Reader, transform ...func(string) string) (stats []string, err error) {
|
|
t := tokenizer{state: &ReadyState{}}
|
|
scanner := bufio.NewScanner(sql)
|
|
|
|
// Increase scanner capacity to support very long lines containing e.g. geodata
|
|
buf := make([]byte, startBufSize)
|
|
maxbuf := cast.UintToInt(viper.GetSizeInBytes("SCANNER_BUFFER_SIZE"))
|
|
if maxbuf == 0 {
|
|
maxbuf = MaxScannerCapacity
|
|
}
|
|
scanner.Buffer(buf, maxbuf)
|
|
scanner.Split(t.ScanToken)
|
|
|
|
var token string
|
|
for scanner.Scan() {
|
|
token = scanner.Text()
|
|
trim := token
|
|
for _, apply := range transform {
|
|
trim = apply(trim)
|
|
}
|
|
if len(trim) > 0 {
|
|
stats = append(stats, trim)
|
|
}
|
|
}
|
|
err = scanner.Err()
|
|
if err != nil {
|
|
err = errors.Errorf("%w\nAfter statement %d: %s", err, len(stats), token)
|
|
}
|
|
if errors.Is(err, bufio.ErrTooLong) {
|
|
err = errors.Errorf("%w\nTry setting SUPABASE_SCANNER_BUFFER_SIZE=5MB (current size is %dKB)", err, maxbuf>>10)
|
|
}
|
|
return stats, err
|
|
}
|
|
|
|
func SplitAndTrim(sql io.Reader) (stats []string, err error) {
|
|
return Split(sql, func(token string) string {
|
|
return strings.TrimRight(token, ";")
|
|
}, strings.TrimSpace)
|
|
}
|