Documentation
¶
Overview ¶
Package state provides state management for the crawler.
Index ¶
- func ComputeContentHash(content string) string
- type BoltStore
- type CrawlError
- type CrawlStats
- type CrawlerState
- type Deduplicator
- func (d *Deduplicator) Add(url string)
- func (d *Deduplicator) AddBatch(urls []string)
- func (d *Deduplicator) Count() int
- func (d *Deduplicator) FalsePositiveRate() float64
- func (d *Deduplicator) GetAll() []string
- func (d *Deduplicator) HasSeen(url string) bool
- func (d *Deduplicator) Merge(other *Deduplicator)
- func (d *Deduplicator) Reset()
- type Endpoint
- type FileStore
- type Form
- type FormInput
- type HashAwareDeduplicator
- func (d *HashAwareDeduplicator) AddBatch(urls []string)
- func (d *HashAwareDeduplicator) ExtractRoutingFragment(rawURL string) string
- func (d *HashAwareDeduplicator) GetAll() []string
- func (d *HashAwareDeduplicator) GetContentHash(rawURL string) (string, bool)
- func (d *HashAwareDeduplicator) HasDuplicateContent(rawURL string, contentHash string) (bool, string)
- func (d *HashAwareDeduplicator) HasVisited(rawURL string) bool
- func (d *HashAwareDeduplicator) MarkVisited(rawURL string)
- func (d *HashAwareDeduplicator) NormalizeURL(rawURL string) string
- func (d *HashAwareDeduplicator) SetContentHash(rawURL string, contentHash string)
- func (d *HashAwareDeduplicator) ShouldSkipFragment(fragment string) bool
- func (d *HashAwareDeduplicator) Stats() map[string]int
- type Manager
- func (m *Manager) AddAPIEndpoint()
- func (m *Manager) AddBytes(n int64)
- func (m *Manager) AddDiscoveredURL()
- func (m *Manager) AddError()
- func (m *Manager) AddForm()
- func (m *Manager) AddWebSocket()
- func (m *Manager) GetDeduplicator() *Deduplicator
- func (m *Manager) GetHashDeduplicator() *HashAwareDeduplicator
- func (m *Manager) GetSoftErrors() map[string]string
- func (m *Manager) GetStats() CrawlStats
- func (m *Manager) HasDuplicateContent(url, contentHash string) (bool, string)
- func (m *Manager) HasVisited(url string) bool
- func (m *Manager) IsSoftError(url string) (bool, string)
- func (m *Manager) Load() (*CrawlerState, error)
- func (m *Manager) MarkSoftError(url, errorMsg string)
- func (m *Manager) MarkVisited(url string) bool
- func (m *Manager) NormalizeURL(url string) string
- func (m *Manager) Reset()
- func (m *Manager) Save(state *CrawlerState) error
- func (m *Manager) SetContentHash(url, contentHash string)
- func (m *Manager) SetTarget(target string)
- func (m *Manager) ShouldSkipFragment(fragment string) bool
- func (m *Manager) Start(target string)
- type MemoryStore
- type Parameter
- type Store
- type WebSocketEndpoint
- type WebSocketMsg
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ComputeContentHash ¶
ComputeContentHash computes an MD5 hash of content.
Types ¶
type BoltStore ¶
type BoltStore struct {
// contains filtered or unexported fields
}
BoltStore implements Store using BoltDB.
func NewBoltStore ¶
NewBoltStore creates a new BoltDB-backed state store.
func (*BoltStore) Load ¶
func (s *BoltStore) Load() (*CrawlerState, error)
Load loads the crawler state.
func (*BoltStore) Save ¶
func (s *BoltStore) Save(state *CrawlerState) error
Save saves the crawler state.
type CrawlError ¶
CrawlError represents a crawl error.
type CrawlStats ¶
type CrawlStats struct {
URLsDiscovered int
PagesCrawled int
FormsFound int
APIEndpoints int
WebSocketEndpoints int
ErrorCount int
Duration time.Duration
BytesTransferred int64
}
CrawlStats contains statistics about the crawl.
type CrawlerState ¶
type CrawlerState struct {
Target string `json:"target"`
StartedAt time.Time `json:"started_at"`
UpdatedAt time.Time `json:"updated_at"`
Stats CrawlStats `json:"stats"`
Config json.RawMessage `json:"config"`
QueueURLs []string `json:"queue_urls"`
VisitedURLs []string `json:"visited_urls"`
Endpoints []Endpoint `json:"endpoints"`
Forms []Form `json:"forms"`
WebSockets []WebSocketEndpoint `json:"websockets"`
Errors []CrawlError `json:"errors"`
}
CrawlerState represents the complete state of a crawler session.
type Deduplicator ¶
type Deduplicator struct {
// contains filtered or unexported fields
}
Deduplicator handles URL deduplication using a Bloom filter.
func NewDeduplicator ¶
func NewDeduplicator(estimatedItems int) *Deduplicator
NewDeduplicator creates a new deduplicator.
func (*Deduplicator) Add ¶
func (d *Deduplicator) Add(url string)
Add adds a URL to the deduplicator.
func (*Deduplicator) AddBatch ¶
func (d *Deduplicator) AddBatch(urls []string)
AddBatch adds multiple URLs at once.
func (*Deduplicator) Count ¶
func (d *Deduplicator) Count() int
Count returns the number of unique URLs seen.
func (*Deduplicator) FalsePositiveRate ¶
func (d *Deduplicator) FalsePositiveRate() float64
FalsePositiveRate returns the current estimated false positive rate.
func (*Deduplicator) GetAll ¶
func (d *Deduplicator) GetAll() []string
GetAll returns all URLs in the deduplicator.
func (*Deduplicator) HasSeen ¶
func (d *Deduplicator) HasSeen(url string) bool
HasSeen checks if a URL has been seen before.
func (*Deduplicator) Merge ¶
func (d *Deduplicator) Merge(other *Deduplicator)
Merge merges another deduplicator into this one.
type Endpoint ¶
type Endpoint struct {
URL string
Method string
Source string
Depth int
Parameters []Parameter
Headers map[string]string
DiscoveredFrom string
StatusCode int
ContentType string
ResponseSize int64
Timestamp time.Time
}
Endpoint represents a discovered endpoint.
type FileStore ¶
type FileStore struct {
// contains filtered or unexported fields
}
FileStore implements Store using JSON files.
func NewFileStore ¶
NewFileStore creates a new file-based state store.
func (*FileStore) Load ¶
func (s *FileStore) Load() (*CrawlerState, error)
Load loads the crawler state from a file.
func (*FileStore) Save ¶
func (s *FileStore) Save(state *CrawlerState) error
Save saves the crawler state to a file.
type Form ¶
type Form struct {
URL string
Action string
Method string
Enctype string
Inputs []FormInput
HasCSRF bool
Depth int
Timestamp time.Time
}
Form represents an HTML form.
type FormInput ¶
type FormInput struct {
Name string
Type string
Value string
Required bool
Placeholder string
Pattern string
MaxLength int
MinLength int
}
FormInput represents a form input field.
type HashAwareDeduplicator ¶
type HashAwareDeduplicator struct {
// contains filtered or unexported fields
}
HashAwareDeduplicator provides URL deduplication that properly handles hash-based SPAs (single-page applications that use URL fragments for routing).
func NewHashAwareDeduplicator ¶
func NewHashAwareDeduplicator(maxSize int) *HashAwareDeduplicator
NewHashAwareDeduplicator creates a new hash-aware deduplicator.
func (*HashAwareDeduplicator) AddBatch ¶
func (d *HashAwareDeduplicator) AddBatch(urls []string)
AddBatch adds multiple URLs as visited.
func (*HashAwareDeduplicator) ExtractRoutingFragment ¶
func (d *HashAwareDeduplicator) ExtractRoutingFragment(rawURL string) string
ExtractRoutingFragment extracts the routing-relevant part of a fragment.
func (*HashAwareDeduplicator) GetAll ¶
func (d *HashAwareDeduplicator) GetAll() []string
GetAll returns all visited URLs.
func (*HashAwareDeduplicator) GetContentHash ¶
func (d *HashAwareDeduplicator) GetContentHash(rawURL string) (string, bool)
GetContentHash returns the content hash for a URL if available.
func (*HashAwareDeduplicator) HasDuplicateContent ¶
func (d *HashAwareDeduplicator) HasDuplicateContent(rawURL string, contentHash string) (bool, string)
HasDuplicateContent checks if a URL has the same content as another visited URL.
func (*HashAwareDeduplicator) HasVisited ¶
func (d *HashAwareDeduplicator) HasVisited(rawURL string) bool
HasVisited checks if a URL has been visited.
func (*HashAwareDeduplicator) MarkVisited ¶
func (d *HashAwareDeduplicator) MarkVisited(rawURL string)
MarkVisited marks a URL as visited.
func (*HashAwareDeduplicator) NormalizeURL ¶
func (d *HashAwareDeduplicator) NormalizeURL(rawURL string) string
NormalizeURL normalizes a URL for deduplication.
func (*HashAwareDeduplicator) SetContentHash ¶
func (d *HashAwareDeduplicator) SetContentHash(rawURL string, contentHash string)
SetContentHash sets the content hash for a URL.
func (*HashAwareDeduplicator) ShouldSkipFragment ¶
func (d *HashAwareDeduplicator) ShouldSkipFragment(fragment string) bool
ShouldSkipFragment returns true if this fragment should be skipped (i.e. it represents UI state rather than a route).
func (*HashAwareDeduplicator) Stats ¶
func (d *HashAwareDeduplicator) Stats() map[string]int
Stats returns deduplicator statistics.
type Manager ¶
type Manager struct {
// contains filtered or unexported fields
}
Manager handles crawler state.
func NewManager ¶
NewManager creates a new state manager.
func (*Manager) AddAPIEndpoint ¶
func (m *Manager) AddAPIEndpoint()
AddAPIEndpoint increments the API endpoint counter.
func (*Manager) AddDiscoveredURL ¶
func (m *Manager) AddDiscoveredURL()
AddDiscoveredURL increments the discovered URL counter.
func (*Manager) AddWebSocket ¶
func (m *Manager) AddWebSocket()
AddWebSocket increments the WebSocket endpoint counter.
func (*Manager) GetDeduplicator ¶
func (m *Manager) GetDeduplicator() *Deduplicator
GetDeduplicator returns the deduplicator.
func (*Manager) GetHashDeduplicator ¶
func (m *Manager) GetHashDeduplicator() *HashAwareDeduplicator
GetHashDeduplicator returns the hash-aware deduplicator.
func (*Manager) GetSoftErrors ¶
GetSoftErrors returns all soft error URLs.
func (*Manager) GetStats ¶
func (m *Manager) GetStats() CrawlStats
GetStats returns the current statistics.
func (*Manager) HasDuplicateContent ¶
HasDuplicateContent checks if the content hash has been seen before. It returns true if the content is a duplicate, along with the URL that previously had the same content.
func (*Manager) HasVisited ¶
HasVisited checks if a URL has been visited.
func (*Manager) IsSoftError ¶
IsSoftError checks if a URL was marked as a soft 404.
func (*Manager) Load ¶
func (m *Manager) Load() (*CrawlerState, error)
Load loads the state from storage.
func (*Manager) MarkSoftError ¶
MarkSoftError records a URL as a soft 404.
func (*Manager) MarkVisited ¶
MarkVisited marks a URL as visited.
func (*Manager) NormalizeURL ¶
NormalizeURL normalizes a URL for deduplication (handles hash-based SPAs).
func (*Manager) Save ¶
func (m *Manager) Save(state *CrawlerState) error
Save saves the current state.
func (*Manager) SetContentHash ¶
SetContentHash stores the content hash for a URL.
func (*Manager) ShouldSkipFragment ¶
ShouldSkipFragment checks if a hash fragment should be skipped (UI state).
type MemoryStore ¶
type MemoryStore struct {
// contains filtered or unexported fields
}
MemoryStore implements Store using in-memory storage.
func NewMemoryStore ¶
func NewMemoryStore() *MemoryStore
NewMemoryStore creates a new in-memory state store.
func (*MemoryStore) Load ¶
func (s *MemoryStore) Load() (*CrawlerState, error)
Load returns the stored state.
func (*MemoryStore) Save ¶
func (s *MemoryStore) Save(state *CrawlerState) error
Save saves the state in memory.
type Store ¶
type Store interface {
Save(state *CrawlerState) error
Load() (*CrawlerState, error)
Close() error
}
Store defines the interface for state storage.
type WebSocketEndpoint ¶
type WebSocketEndpoint struct {
URL string
DiscoveredFrom string
SampleMessages []WebSocketMsg
Protocols []string
Timestamp time.Time
}
WebSocketEndpoint represents a discovered WebSocket endpoint.