sqllexer

package module
v0.0.1-dev

Published: Sep 1, 2023 License: Apache-2.0 Imports: 3 Imported by: 0

README

go-sql-lexer

This repository contains a hand-written SQL lexer that tokenizes SQL queries, with a focus on obfuscation and normalization. The lexer is written in Go with no external dependencies. Note: This is NOT a SQL parser; it only tokenizes SQL queries.

Features

  • 🚀 Fast and lightweight tokenization (not regex based)
  • 🔒 Obfuscates sensitive data (e.g. numbers, strings, specific literals like dollar quoted strings in Postgres, etc.)
  • 📖 Even works with truncated queries
  • 🌐 UTF-8 support
  • 🔧 Normalizes obfuscated queries

Installation

go get github.com/DataDog/go-sql-lexer

Usage

Tokenize
import "github.com/DataDog/go-sql-lexer"

func main() {
    query := "SELECT * FROM users WHERE id = 1"
    lexer := NewSQLLexer(query)
    tokens := lexer.ScanAll()
    for _, token := range tokens {
        fmt.Println(token)
    }
}
Obfuscate
import (
    "fmt"

    "github.com/DataDog/go-sql-lexer"
)

func main() {
    query := "SELECT * FROM users WHERE id = 1"
    obfuscator := sqllexer.NewSQLObfuscator(&sqllexer.SQLObfuscatorConfig{})
    obfuscated := obfuscator.Obfuscate(query)
    // "SELECT * FROM users WHERE id = ?"
    fmt.Println(obfuscated)
}
Normalize

Note: By design, normalization works on obfuscated queries.

import (
    "fmt"

    "github.com/DataDog/go-sql-lexer"
)

func main() {
    query := "SELECT * FROM users WHERE id in (?, ?)"
    normalizer := sqllexer.NewSQLNormalizer(&sqllexer.SQLNormalizerConfig{
        CollectComments: true,
        CollectCommands: true,
        CollectTables:   true,
        KeepSQLAlias:    false,
    })
    normalized, normalizedInfo, err := normalizer.Normalize(query)
    if err != nil {
        panic(err)
    }
    // "SELECT * FROM users WHERE id in (?)"
    fmt.Println(normalized)
    fmt.Println(normalizedInfo.Tables)
}
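
Since normalization is designed to run on obfuscated queries, the two steps are typically chained. Below is a minimal sketch of that pipeline; the exact placeholder formatting in the comments is illustrative, not guaranteed output.
import (
    "fmt"

    "github.com/DataDog/go-sql-lexer"
)

func main() {
    query := "SELECT * FROM users WHERE id in (1, 2, 3)"

    // Obfuscate literal values first...
    obfuscator := sqllexer.NewSQLObfuscator(&sqllexer.SQLObfuscatorConfig{})
    obfuscated := obfuscator.Obfuscate(query)

    // ...then normalize the obfuscated query and collect table metadata.
    normalizer := sqllexer.NewSQLNormalizer(&sqllexer.SQLNormalizerConfig{
        CollectTables: true,
    })
    normalized, info, err := normalizer.Normalize(obfuscated)
    if err != nil {
        panic(err)
    }
    fmt.Println(normalized)  // e.g. "SELECT * FROM users WHERE id in ( ? )"
    fmt.Println(info.Tables) // e.g. [users]
}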

Testing

go test -v ./...

Benchmarks

go test -bench=. -benchmem ./...

License

Apache License, v2.0

Documentation

Constants

const (
	ArrayPlaceholder   = "( ? )"
	BracketPlaceholder = "[ ? ]"
)

const (
	StringPlaceholder = "?"
	NumberPlaceholder = "?"
)

const (
	// DBMSSQLServer is a MS SQL Server
	DBMSSQLServer = "mssql"
	// DBMSPostgres is a PostgreSQL Server
	DBMSPostgres = "postgresql"
	// DBMSMySQL is a MySQL Server
	DBMSMySQL = "mysql"
	// DBMSOracle is an Oracle Server
	DBMSOracle = "oracle"
)

Variables

var Commands = map[string]bool{
	"SELECT":   true,
	"INSERT":   true,
	"UPDATE":   true,
	"DELETE":   true,
	"CREATE":   true,
	"ALTER":    true,
	"DROP":     true,
	"JOIN":     true,
	"GRANT":    true,
	"REVOKE":   true,
	"COMMIT":   true,
	"BEGIN":    true,
	"TRUNCATE": true,
	"MERGE":    true,
}

Functions

func DiscardSQLAlias

func DiscardSQLAlias(input string) string

DiscardSQLAlias removes any SQL alias from the input string and returns the modified string. It uses a regular expression to match the alias pattern and replaces it with an empty string. Matching is case-insensitive and follows the pattern "AS <alias_name>". The input string is not modified in place.
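
A short sketch of calling the helper directly; the spacing of the returned string is illustrative only:

	input := "SELECT u.id AS user_id FROM users u"
	// The "AS user_id" alias is stripped from the returned copy.
	fmt.Println(sqllexer.DiscardSQLAlias(input))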

Types

type NormalizedInfo

type NormalizedInfo struct {
	Tables   []string
	Comments []string
	Commands []string
}

type SQLLexer

type SQLLexer struct {
	// contains filtered or unexported fields
}

SQL Lexer inspired by Rob Pike's talk on Lexical Scanning in Go

func NewSQLLexer

func NewSQLLexer(input string) *SQLLexer

func (*SQLLexer) Scan

func (s *SQLLexer) Scan() Token

Scan scans the next token and returns it.

func (*SQLLexer) ScanAll

func (s *SQLLexer) ScanAll() []Token

ScanAll scans the entire input string and returns a slice of tokens.

func (*SQLLexer) ScanAllTokens

func (s *SQLLexer) ScanAllTokens() <-chan *Token

ScanAllTokens scans the entire input string and returns a channel of tokens. Use this if you want to process the tokens as they are scanned.
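
A minimal sketch of consuming the channel, assuming the package is imported as sqllexer:

	lexer := sqllexer.NewSQLLexer("SELECT * FROM users WHERE id = 1")
	for token := range lexer.ScanAllTokens() {
		// Handle each token as soon as it is scanned,
		// without buffering the full slice in memory.
		fmt.Println(token.Type, token.Value)
	}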

type SQLNormalizer

type SQLNormalizer struct {
	// contains filtered or unexported fields
}

func NewSQLNormalizer

func NewSQLNormalizer(config *SQLNormalizerConfig) *SQLNormalizer

func (*SQLNormalizer) Normalize

func (n *SQLNormalizer) Normalize(input string) (string, *NormalizedInfo, error)

Normalize takes an input SQL string and returns a normalized SQL string, a NormalizedInfo struct, and an error. The normalizer collapses the input SQL into a compact format, groups obfuscated values into a single placeholder, and collects metadata such as table names, comments, and commands.

type SQLNormalizerConfig

type SQLNormalizerConfig struct {
	DBMS string `json:"dbms"`

	// CollectTables specifies whether the normalizer should also extract the table names that a query addresses
	CollectTables bool `json:"collect_tables"`

	// CollectCommands specifies whether the normalizer should extract and return commands as SQL metadata
	CollectCommands bool `json:"collect_commands"`

	// CollectComments specifies whether the normalizer should extract and return comments as SQL metadata
	CollectComments bool `json:"collect_comments"`

	// KeepSQLAlias specifies whether SQL aliases ("AS <alias>") should be kept; when false, aliases are discarded
	KeepSQLAlias bool `json:"keep_sql_alias"`
}
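
For example, a configuration targeting Postgres that collects all metadata could look like the following sketch (field values are illustrative):

	normalizer := sqllexer.NewSQLNormalizer(&sqllexer.SQLNormalizerConfig{
		DBMS:            sqllexer.DBMSPostgres, // one of the DBMS* constants above
		CollectTables:   true,
		CollectCommands: true,
		CollectComments: true,
		KeepSQLAlias:    false, // aliases ("AS ...") are discarded
	})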

type SQLObfuscator

type SQLObfuscator struct {
	// contains filtered or unexported fields
}

func NewSQLObfuscator

func NewSQLObfuscator(config *SQLObfuscatorConfig) *SQLObfuscator

func (*SQLObfuscator) Obfuscate

func (o *SQLObfuscator) Obfuscate(input string) string

Obfuscate takes an input SQL string and returns an obfuscated SQL string. The obfuscator replaces all literal values with a single placeholder.

type SQLObfuscatorConfig

type SQLObfuscatorConfig struct {
	ReplaceDigits    bool
	DollarQuotedFunc bool
}
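
A sketch of constructing an obfuscator with both options enabled; precisely how each flag changes the output is an assumption noted in the comments rather than documented behavior:

	obfuscator := sqllexer.NewSQLObfuscator(&sqllexer.SQLObfuscatorConfig{
		ReplaceDigits:    true, // assumed: also obfuscate digits embedded in identifiers
		DollarQuotedFunc: true, // assumed: treat dollar-quoted function bodies specially
	})
	fmt.Println(obfuscator.Obfuscate("SELECT * FROM users WHERE id = 1"))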

type Token

type Token struct {
	Type  TokenType
	Value string
}

Token represents a SQL token with its type and value.

type TokenType

type TokenType int
const (
	ERROR TokenType = iota
	EOF
	WS                     // whitespace
	STRING                 // string literal
	INCOMPLETE_STRING      // illegal string literal so that we can obfuscate it, e.g. 'abc
	NUMBER                 // number literal
	IDENT                  // identifier
	OPERATOR               // operator
	WILDCARD               // wildcard *
	COMMENT                // comment
	MULTILINE_COMMENT      // multiline comment
	PUNCTUATION            // punctuation
	DOLLAR_QUOTED_FUNCTION // dollar quoted function
	DOLLAR_QUOTED_STRING   // dollar quoted string
	NUMBERED_PARAMETER     // numbered parameter
	UNKNOWN                // unknown token
)
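
Token types can be used to filter the scanned output, for example to drop whitespace and single-line comments (a minimal sketch):

	lexer := sqllexer.NewSQLLexer("SELECT id FROM users -- fetch ids")
	for _, token := range lexer.ScanAll() {
		if token.Type == sqllexer.WS || token.Type == sqllexer.COMMENT {
			continue // skip whitespace and single-line comments
		}
		fmt.Println(token.Value)
	}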
