state

package
v1.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 1, 2026 License: MIT Imports: 15 Imported by: 0

Documentation

Overview

Package state provides state management for the crawler.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ComputeContentHash

func ComputeContentHash(content string) string

ComputeContentHash computes an MD5 hash of content.

Types

type BoltStore

type BoltStore struct {
	// contains filtered or unexported fields
}

BoltStore implements Store using BoltDB.

func NewBoltStore

func NewBoltStore(path string) (*BoltStore, error)

NewBoltStore creates a new BoltDB-backed state store.

func (*BoltStore) Close

func (s *BoltStore) Close() error

Close closes the database.

func (*BoltStore) Load

func (s *BoltStore) Load() (*CrawlerState, error)

Load loads the crawler state.

func (*BoltStore) Save

func (s *BoltStore) Save(state *CrawlerState) error

Save saves the crawler state.

type CrawlError

type CrawlError struct {
	URL       string
	Error     string
	Timestamp time.Time
}

CrawlError represents a crawl error.

type CrawlStats

type CrawlStats struct {
	URLsDiscovered     int
	PagesCrawled       int
	FormsFound         int
	APIEndpoints       int
	WebSocketEndpoints int
	ErrorCount         int
	Duration           time.Duration
	BytesTransferred   int64
}

CrawlStats contains statistics about the crawl.

type CrawlerState

type CrawlerState struct {
	Target      string              `json:"target"`
	StartedAt   time.Time           `json:"started_at"`
	UpdatedAt   time.Time           `json:"updated_at"`
	Stats       CrawlStats          `json:"stats"`
	Config      json.RawMessage     `json:"config"`
	QueueURLs   []string            `json:"queue_urls"`
	VisitedURLs []string            `json:"visited_urls"`
	Endpoints   []Endpoint          `json:"endpoints"`
	Forms       []Form              `json:"forms"`
	WebSockets  []WebSocketEndpoint `json:"websockets"`
	Errors      []CrawlError        `json:"errors"`
}

CrawlerState represents the complete state of a crawler session.

type Deduplicator

type Deduplicator struct {
	// contains filtered or unexported fields
}

Deduplicator handles URL deduplication using a Bloom filter.

func NewDeduplicator

func NewDeduplicator(estimatedItems int) *Deduplicator

NewDeduplicator creates a new deduplicator.

func (*Deduplicator) Add

func (d *Deduplicator) Add(url string)

Add adds a URL to the deduplicator.

func (*Deduplicator) AddBatch

func (d *Deduplicator) AddBatch(urls []string)

AddBatch adds multiple URLs at once.

func (*Deduplicator) Count

func (d *Deduplicator) Count() int

Count returns the number of unique URLs seen.

func (*Deduplicator) FalsePositiveRate

func (d *Deduplicator) FalsePositiveRate() float64

FalsePositiveRate returns the current estimated false positive rate.

func (*Deduplicator) GetAll

func (d *Deduplicator) GetAll() []string

GetAll returns all URLs in the deduplicator.

func (*Deduplicator) HasSeen

func (d *Deduplicator) HasSeen(url string) bool

HasSeen checks if a URL has been seen before.

func (*Deduplicator) Merge

func (d *Deduplicator) Merge(other *Deduplicator)

Merge merges another deduplicator into this one.

func (*Deduplicator) Reset

func (d *Deduplicator) Reset()

Reset resets the deduplicator.

type Endpoint

type Endpoint struct {
	URL            string
	Method         string
	Source         string
	Depth          int
	Parameters     []Parameter
	Headers        map[string]string
	DiscoveredFrom string
	StatusCode     int
	ContentType    string
	ResponseSize   int64
	Timestamp      time.Time
}

Endpoint represents a discovered endpoint.

type FileStore

type FileStore struct {
	// contains filtered or unexported fields
}

FileStore implements Store using JSON files.

func NewFileStore

func NewFileStore(path string, compressed bool) *FileStore

NewFileStore creates a new file-based state store.

func (*FileStore) Close

func (s *FileStore) Close() error

Close is a no-op for FileStore.

func (*FileStore) Load

func (s *FileStore) Load() (*CrawlerState, error)

Load loads the crawler state from a file.

func (*FileStore) Save

func (s *FileStore) Save(state *CrawlerState) error

Save saves the crawler state to a file.

type Form

type Form struct {
	URL       string
	Action    string
	Method    string
	Enctype   string
	Inputs    []FormInput
	HasCSRF   bool
	Depth     int
	Timestamp time.Time
}

Form represents an HTML form.

type FormInput

type FormInput struct {
	Name        string
	Type        string
	Value       string
	Required    bool
	Placeholder string
	Pattern     string
	MaxLength   int
	MinLength   int
}

FormInput represents a form input field.

type HashAwareDeduplicator

type HashAwareDeduplicator struct {
	// contains filtered or unexported fields
}

HashAwareDeduplicator provides URL deduplication that properly handles hash-based SPAs.

func NewHashAwareDeduplicator

func NewHashAwareDeduplicator(maxSize int) *HashAwareDeduplicator

NewHashAwareDeduplicator creates a new hash-aware deduplicator.

func (*HashAwareDeduplicator) AddBatch

func (d *HashAwareDeduplicator) AddBatch(urls []string)

AddBatch adds multiple URLs as visited.

func (*HashAwareDeduplicator) ExtractRoutingFragment

func (d *HashAwareDeduplicator) ExtractRoutingFragment(rawURL string) string

ExtractRoutingFragment extracts the routing-relevant part of a fragment.

func (*HashAwareDeduplicator) GetAll

func (d *HashAwareDeduplicator) GetAll() []string

GetAll returns all visited URLs.

func (*HashAwareDeduplicator) GetContentHash

func (d *HashAwareDeduplicator) GetContentHash(rawURL string) (string, bool)

GetContentHash returns the content hash for a URL if available.

func (*HashAwareDeduplicator) HasDuplicateContent

func (d *HashAwareDeduplicator) HasDuplicateContent(rawURL string, contentHash string) (bool, string)

HasDuplicateContent checks if a URL has the same content as another visited URL.

func (*HashAwareDeduplicator) HasVisited

func (d *HashAwareDeduplicator) HasVisited(rawURL string) bool

HasVisited checks if a URL has been visited.

func (*HashAwareDeduplicator) MarkVisited

func (d *HashAwareDeduplicator) MarkVisited(rawURL string)

MarkVisited marks a URL as visited.

func (*HashAwareDeduplicator) NormalizeURL

func (d *HashAwareDeduplicator) NormalizeURL(rawURL string) string

NormalizeURL normalizes a URL for deduplication.

func (*HashAwareDeduplicator) SetContentHash

func (d *HashAwareDeduplicator) SetContentHash(rawURL string, contentHash string)

SetContentHash sets the content hash for a URL.

func (*HashAwareDeduplicator) ShouldSkipFragment

func (d *HashAwareDeduplicator) ShouldSkipFragment(fragment string) bool

ShouldSkipFragment returns true if this fragment should be skipped.

func (*HashAwareDeduplicator) Stats

func (d *HashAwareDeduplicator) Stats() map[string]int

Stats returns deduplicator statistics.

type Manager

type Manager struct {
	// contains filtered or unexported fields
}

Manager handles crawler state.

func NewManager

func NewManager(store Store, estimatedURLs int) *Manager

NewManager creates a new state manager.

func (*Manager) AddAPIEndpoint

func (m *Manager) AddAPIEndpoint()

AddAPIEndpoint increments the API endpoint counter.

func (*Manager) AddBytes

func (m *Manager) AddBytes(n int64)

AddBytes adds to the bytes transferred counter.

func (*Manager) AddDiscoveredURL

func (m *Manager) AddDiscoveredURL()

AddDiscoveredURL increments the discovered URL counter.

func (*Manager) AddError

func (m *Manager) AddError()

AddError increments the error counter.

func (*Manager) AddForm

func (m *Manager) AddForm()

AddForm increments the form counter.

func (*Manager) AddWebSocket

func (m *Manager) AddWebSocket()

AddWebSocket increments the WebSocket endpoint counter.

func (*Manager) GetDeduplicator

func (m *Manager) GetDeduplicator() *Deduplicator

GetDeduplicator returns the deduplicator.

func (*Manager) GetHashDeduplicator

func (m *Manager) GetHashDeduplicator() *HashAwareDeduplicator

GetHashDeduplicator returns the hash-aware deduplicator.

func (*Manager) GetSoftErrors

func (m *Manager) GetSoftErrors() map[string]string

GetSoftErrors returns all soft error URLs.

func (*Manager) GetStats

func (m *Manager) GetStats() CrawlStats

GetStats returns the current statistics.

func (*Manager) HasDuplicateContent

func (m *Manager) HasDuplicateContent(url, contentHash string) (bool, string)

HasDuplicateContent checks if the content hash has been seen before. Returns true if duplicate, along with the URL that had the same content.

func (*Manager) HasVisited

func (m *Manager) HasVisited(url string) bool

HasVisited checks if a URL has been visited.

func (*Manager) IsSoftError

func (m *Manager) IsSoftError(url string) (bool, string)

IsSoftError checks if a URL was marked as a soft 404.

func (*Manager) Load

func (m *Manager) Load() (*CrawlerState, error)

Load loads the state from storage.

func (*Manager) MarkSoftError

func (m *Manager) MarkSoftError(url, errorMsg string)

MarkSoftError records a URL as a soft 404.

func (*Manager) MarkVisited

func (m *Manager) MarkVisited(url string) bool

MarkVisited marks a URL as visited.

func (*Manager) NormalizeURL

func (m *Manager) NormalizeURL(url string) string

NormalizeURL normalizes a URL for deduplication (handles hash-based SPAs).

func (*Manager) Reset

func (m *Manager) Reset()

Reset resets the state manager.

func (*Manager) Save

func (m *Manager) Save(state *CrawlerState) error

Save saves the current state.

func (*Manager) SetContentHash

func (m *Manager) SetContentHash(url, contentHash string)

SetContentHash stores the content hash for a URL.

func (*Manager) SetTarget

func (m *Manager) SetTarget(target string)

SetTarget sets the crawl target.

func (*Manager) ShouldSkipFragment

func (m *Manager) ShouldSkipFragment(fragment string) bool

ShouldSkipFragment checks if a hash fragment should be skipped (UI state).

func (*Manager) Start

func (m *Manager) Start(target string)

Start initializes the state for a new crawl.

type MemoryStore

type MemoryStore struct {
	// contains filtered or unexported fields
}

MemoryStore implements Store using in-memory storage.

func NewMemoryStore

func NewMemoryStore() *MemoryStore

NewMemoryStore creates a new in-memory state store.

func (*MemoryStore) Close

func (s *MemoryStore) Close() error

Close is a no-op for MemoryStore.

func (*MemoryStore) Load

func (s *MemoryStore) Load() (*CrawlerState, error)

Load returns the stored state.

func (*MemoryStore) Save

func (s *MemoryStore) Save(state *CrawlerState) error

Save saves the state in memory.

type Parameter

type Parameter struct {
	Name     string
	Type     string
	Example  string
	Required bool
}

Parameter represents a request parameter.

type Store

type Store interface {
	Save(state *CrawlerState) error
	Load() (*CrawlerState, error)
	Close() error
}

Store defines the interface for state storage.

type WebSocketEndpoint

type WebSocketEndpoint struct {
	URL            string
	DiscoveredFrom string
	SampleMessages []WebSocketMsg
	Protocols      []string
	Timestamp      time.Time
}

WebSocketEndpoint represents a discovered WebSocket endpoint.

type WebSocketMsg

type WebSocketMsg struct {
	Direction string
	Type      string
	Data      string
	Timestamp time.Time
}

WebSocketMsg represents a WebSocket message.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL