http_crawler

package
v0.0.0-...-d9bf8a9
Published: Dec 5, 2022 License: MIT Imports: 10 Imported by: 2

Documentation

Index

Constants

const (
	ATOM       = "application/atom+xml"
	CSS        = "text/css"
	CSV        = "text/csv"
	DOCX       = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
	GIF        = "image/gif"
	HTML       = "text/html"
	ICO        = "image/x-icon"
	ICS        = "text/calendar"
	JAVASCRIPT = "application/javascript"
	JPEG       = "image/jpeg"
	JSON       = "application/json"
	ODP        = "application/vnd.oasis.opendocument.presentation"
	ODS        = "application/vnd.oasis.opendocument.spreadsheet"
	ODT        = "application/vnd.oasis.opendocument.text"
	PDF        = "application/pdf"
	PNG        = "image/png"
	XLS        = "application/vnd.ms-excel"
	XLSX       = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
)
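
A short illustrative sketch of matching a raw Content-Type header against these constants. The header parsing uses the standard library's mime.ParseMediaType; the import path for this package is assumed and should be replaced with the module's real path.

package main

import (
	"fmt"
	"mime"

	http_crawler "example.com/http_crawler" // import path assumed
)

func main() {
	// A raw Content-Type header usually carries parameters such as charset.
	header := "text/html; charset=utf-8"

	// ParseMediaType strips the parameters, leaving the bare media type.
	mediaType, _, err := mime.ParseMediaType(header)
	if err != nil {
		panic(err)
	}

	// Compare the bare media type against the package's constants.
	if mediaType == http_crawler.HTML {
		fmt.Println("HTML document")
	}
}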

Variables

var (
	ErrCannotCrawlURL  = errors.New("Cannot crawl URLs that don't live under the provided root URLs")
	ErrNotFound        = errors.New("404 Not Found")
	ErrRetryRequest5XX = errors.New("Retry request: 5XX HTTP Response returned")
	ErrRetryRequest429 = errors.New("Retry request: 429 HTTP Response returned (back off)")
)
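
These sentinel errors separate retryable failures (429 and 5XX) from permanent ones. A sketch of branching on them with errors.Is; the logging and retry policy here are illustrative only, and the import path is assumed.

package main

import (
	"errors"
	"log"

	http_crawler "example.com/http_crawler" // import path assumed
)

// handleCrawlError branches on the package's sentinel errors.
func handleCrawlError(err error) {
	switch {
	case errors.Is(err, http_crawler.ErrNotFound):
		log.Println("page not found, skipping")
	case errors.Is(err, http_crawler.ErrRetryRequest429):
		log.Println("rate limited, backing off before retrying")
	case errors.Is(err, http_crawler.ErrRetryRequest5XX):
		log.Println("server error, retrying")
	case errors.Is(err, http_crawler.ErrCannotCrawlURL):
		log.Println("URL is outside the configured root URLs")
	case err != nil:
		log.Println("unexpected error:", err)
	}
}

func main() {
	handleCrawlError(http_crawler.ErrRetryRequest429)
}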

Functions

func HostOnly

func HostOnly(hostport string) (string, error)

HostOnly parses out the host and removes the port (and separating colon) if present.
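
A minimal sketch of calling HostOnly, assuming it behaves as documented (strips the port and separating colon from a host:port string); the import path is assumed.

package main

import (
	"fmt"

	http_crawler "example.com/http_crawler" // import path assumed
)

func main() {
	host, err := http_crawler.HostOnly("docs.example.com:8080")
	if err != nil {
		panic(err)
	}
	fmt.Println(host) // expected output per the documented behaviour: docs.example.com
}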

func IsAllowedHost

func IsAllowedHost(needle string, allowedHosts []*url.URL) bool
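
IsAllowedHost has no doc comment; from its signature it appears to report whether needle matches the host of one of the allowed URLs. A hedged sketch, with the import path and the exact matching behaviour assumed:

package main

import (
	"fmt"
	"net/url"

	http_crawler "example.com/http_crawler" // import path assumed
)

func main() {
	// Build the allow-list from a parsed root URL.
	root, err := url.Parse("https://docs.example.com")
	if err != nil {
		panic(err)
	}
	allowed := []*url.URL{root}

	// Presumably true when the host is in the allow-list, false otherwise.
	fmt.Println(http_crawler.IsAllowedHost("docs.example.com", allowed))
	fmt.Println(http_crawler.IsAllowedHost("other.example.net", allowed))
}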

func Retry5XXStatusCodes

func Retry5XXStatusCodes() []int
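
Retry5XXStatusCodes returns a slice of status codes, presumably the 5XX codes that should trigger a retry. A sketch of checking a response code against that list; both the semantics and the import path are assumptions:

package main

import (
	"fmt"

	http_crawler "example.com/http_crawler" // import path assumed
)

// shouldRetry reports whether statusCode appears in the package's
// list of retryable 5XX status codes.
func shouldRetry(statusCode int) bool {
	for _, code := range http_crawler.Retry5XXStatusCodes() {
		if code == statusCode {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(shouldRetry(503)) // presumably true
	fmt.Println(shouldRetry(200)) // false
}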

Types

type BasicAuth

type BasicAuth struct {
	Username string
	Password string
}

type Crawler

type Crawler struct {
	RootURLs []*url.URL
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(rootURLs []*url.URL, versionNumber string, rateLimitToken string, basicAuth *BasicAuth) *Crawler

func (*Crawler) Crawl

func (c *Crawler) Crawl(crawlURL *url.URL) (*CrawlerResponse, error)
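
A sketch of wiring the pieces together: construct a Crawler with NewCrawler and fetch a single page with Crawl. The URLs, version number, and rate-limit token are placeholders, the import path is assumed, and passing credentials via BasicAuth (or nil for none) is inferred from the constructor signature.

package main

import (
	"errors"
	"fmt"
	"net/url"

	http_crawler "example.com/http_crawler" // import path assumed
)

func main() {
	root, err := url.Parse("https://docs.example.com")
	if err != nil {
		panic(err)
	}

	// Optional HTTP Basic Auth credentials (placeholders).
	auth := &http_crawler.BasicAuth{Username: "user", Password: "secret"}

	crawler := http_crawler.NewCrawler(
		[]*url.URL{root}, // root URLs the crawler must stay under
		"1.0.0",          // placeholder version number
		"",               // placeholder rate-limit token
		auth,             // or nil if no credentials are needed (assumed)
	)

	page, err := url.Parse("https://docs.example.com/index.html")
	if err != nil {
		panic(err)
	}

	resp, err := crawler.Crawl(page)
	switch {
	case errors.Is(err, http_crawler.ErrRetryRequest429),
		errors.Is(err, http_crawler.ErrRetryRequest5XX):
		fmt.Println("transient failure, retry later:", err)
	case err != nil:
		fmt.Println("crawl failed:", err)
	default:
		fmt.Printf("fetched %s (%s), %d bytes\n", resp.URL, resp.ContentType, len(resp.Body))
	}
}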

type CrawlerResponse

type CrawlerResponse struct {
	Body        []byte
	ContentType string
	URL         *url.URL
}

func (*CrawlerResponse) AcceptedContentType

func (c *CrawlerResponse) AcceptedContentType() bool

func (*CrawlerResponse) ParseContentType

func (c *CrawlerResponse) ParseContentType() (string, error)
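
A sketch of inspecting a CrawlerResponse: ParseContentType presumably normalises the raw Content-Type header, and AcceptedContentType presumably reports whether the response is one of the supported types listed in the constants above. The hand-built response and the import path are assumptions.

package main

import (
	"fmt"
	"net/url"

	http_crawler "example.com/http_crawler" // import path assumed
)

func main() {
	// A hand-built response stands in for the result of Crawler.Crawl.
	u, err := url.Parse("https://docs.example.com/index.html")
	if err != nil {
		panic(err)
	}
	resp := &http_crawler.CrawlerResponse{
		Body:        []byte("<html></html>"),
		ContentType: "text/html; charset=utf-8",
		URL:         u,
	}

	if !resp.AcceptedContentType() {
		fmt.Println("skipping unsupported content type")
		return
	}

	mediaType, err := resp.ParseContentType()
	if err != nil {
		panic(err)
	}
	if mediaType == http_crawler.HTML {
		fmt.Println("got an HTML page from", resp.URL)
	}
}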
