Documentation
¶
Index ¶
Constants ¶
View Source
const (
	// MaxFeedSize limits response body size to 10MB
	MaxFeedSize = 10 * 1024 * 1024

	// DefaultTimeout for HTTP requests
	DefaultTimeout = 30 * time.Second

	// MaxRedirects prevents redirect loops
	MaxRedirects = 5

	// UserAgent identifies the bot
	UserAgent = "RoguePlanet/0.1 (+https://github.com/adewale/rogue_planet)"
)
Variables ¶
Functions ¶
func ValidateURL ¶
ValidateURL checks if a URL is safe to fetch (SSRF prevention)
Types ¶
type Crawler ¶
type Crawler struct {
// contains filtered or unexported fields
}
Crawler handles HTTP fetching with proper conditional request support
func NewForTesting ¶
func NewForTesting() *Crawler
NewForTesting creates a Crawler that allows local URLs (for testing only)
func NewWithUserAgent ¶
NewWithUserAgent creates a Crawler with a custom user agent
func (*Crawler) Fetch ¶
func (c *Crawler) Fetch(ctx context.Context, feedURL string, cache FeedCache) (*FeedResponse, error)
Fetch fetches a feed with conditional request support
func (*Crawler) FetchWithRetry ¶
func (c *Crawler) FetchWithRetry(ctx context.Context, feedURL string, cache FeedCache, maxRetries int) (*FeedResponse, error)
FetchWithRetry attempts to fetch with exponential backoff
type FeedCache ¶
type FeedCache struct {
URL string
ETag string // Stored exactly as received, including quotes
LastModified string // Stored exactly as received
LastFetched time.Time
}
FeedCache stores HTTP caching headers for conditional requests
type FeedResponse ¶
type FeedResponse struct {
Body []byte
StatusCode int
NotModified bool // True if 304 Not Modified was returned
NewCache FeedCache // Updated cache headers for storage
FinalURL string // URL after redirects (for 301 permanent redirects)
FetchTime time.Time
}
FeedResponse contains the fetched feed data and metadata
Click to show internal directories.
Click to hide internal directories.