package parser

import (
	"bufio"
	"io"
	"strings"
	"unicode/utf8"

	"github.com/go-errors/errors"
	"github.com/spf13/viper"
	"github.com/supabase/cli/pkg/cast"
)

// Equal to `startBufSize` from `bufio/scan.go`
const startBufSize = 4096

// MaxScannerCapacity overrides bufio.MaxScanTokenSize, which defaults to
// 64 * 1024 and is not enough for certain lines containing e.g. geographical
// data. 256K ought to be enough for anybody...
var MaxScannerCapacity = 256 * 1024

// State transition table for the tokenizer:
//
//	Ready -> Ready (default)
//	Ready -> Error (on invalid syntax)
//	Ready -> Done (on ;, emit token)
//	Ready -> Done (on EOF, emit token)
//
//	Ready -> Comment (on --)
//	Comment -> Comment (default)
//	Comment -> Ready (on \n)
//
//	Ready -> Block (on /*)
//	Block -> Block (on /*, +-depth)
//	Block -> Ready (on */, depth 0)
//
//	Ready -> Quote (on ')
//	Quote -> Quote (on '', default)
//	Quote -> Ready (on ')
//
//	Ready -> Dollar (on $tag$)
//	Dollar -> Dollar (default)
//	Dollar -> Ready (on $tag$)
//
//	Ready -> Escape (on \)
//	Escape -> Ready (on next)
type tokenizer struct {
	state State
	last  int
}

func (t *tokenizer) ScanToken(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// If we requested more data, resume from the last position.
	for width := 0; t.last < len(data); t.last += width {
		// Assign to the loop variable (no :=) so that multi-byte runes
		// advance the cursor by their full width, not by a stale value.
		var r rune
		r, width = utf8.DecodeRune(data[t.last:])
		end := t.last + width
		t.state = t.state.Next(r, data[:end])
		// Emit token
		if t.state == nil {
			t.last = 0
			t.state = &ReadyState{}
			return end, data[:end], nil
		}
	}
	if !atEOF || len(data) == 0 {
		// Request more data or end the stream
		return 0, nil, nil
	}
	// We're at EOF. If we have a final, non-terminated token, return it.
	return len(data), data, nil
}

// Split uses bufio.Scanner to split a PostgreSQL string into multiple statements.
//
// The core problem is to figure out whether the current ; separator is inside
// an escaped string literal. PostgreSQL has multiple ways of opening string
// literals and comments: $$, ', --, /*, etc. We use an FSM to guarantee these
// states are entered exclusively. If not in one of the above escape states,
// the next ; token can be parsed as a statement separator.
//
// Each statement is returned as-is, without removing comments or whitespace.
func Split(sql io.Reader, transform ...func(string) string) (stats []string, err error) {
	t := tokenizer{state: &ReadyState{}}
	scanner := bufio.NewScanner(sql)
	// Increase scanner capacity to support very long lines containing e.g. geodata
	buf := make([]byte, startBufSize)
	maxbuf := cast.UintToInt(viper.GetSizeInBytes("SCANNER_BUFFER_SIZE"))
	if maxbuf == 0 {
		maxbuf = MaxScannerCapacity
	}
	scanner.Buffer(buf, maxbuf)
	scanner.Split(t.ScanToken)
	var token string
	for scanner.Scan() {
		token = scanner.Text()
		trim := token
		for _, apply := range transform {
			trim = apply(trim)
		}
		if len(trim) > 0 {
			stats = append(stats, trim)
		}
	}
	err = scanner.Err()
	if err != nil {
		err = errors.Errorf("%w\nAfter statement %d: %s", err, len(stats), token)
	}
	if errors.Is(err, bufio.ErrTooLong) {
		err = errors.Errorf("%w\nTry setting SUPABASE_SCANNER_BUFFER_SIZE=5MB (current size is %dKB)", err, maxbuf>>10)
	}
	return stats, err
}

// SplitAndTrim splits statements, then strips the trailing semicolon and
// surrounding whitespace from each one.
func SplitAndTrim(sql io.Reader) (stats []string, err error) {
	return Split(sql, func(token string) string {
		return strings.TrimRight(token, ";")
	}, strings.TrimSpace)
}
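
// Example usage (a sketch, not part of the upstream file): per the state table
// above, a semicolon inside a '...' literal keeps the tokenizer in the Quote
// state, so only top-level separators split the input. SplitAndTrim then
// strips the trailing ; and whitespace from each statement:
//
//	stmts, err := SplitAndTrim(strings.NewReader("SELECT 1; SELECT 'a;b';"))
//	// err == nil
//	// stmts == []string{"SELECT 1", "SELECT 'a;b'"}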
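
// The scanner cap can also be raised at runtime through viper, which Split
// reads with GetSizeInBytes before falling back to MaxScannerCapacity. A
// minimal sketch (the SUPABASE_ env prefix binding is assumed to be wired up
// elsewhere in the CLI, and sqlFile is a hypothetical io.Reader):
//
//	viper.Set("SCANNER_BUFFER_SIZE", "5mb")
//	stmts, err := Split(sqlFile)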