crawler

package
v1.0.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 31, 2023 License: MIT Imports: 19 Imported by: 2

Documentation

Index

Constants

View Source
const (
	// CSSUri url from url()
	CSSUri urlContext = 1 + iota
	// HTMLTagA url from <a href=""></a>
	HTMLTagA
	// HTMLTagForm url from <form action="" />
	HTMLTagForm
	// HTMLTagImg url from <img src="" />
	HTMLTagImg
	// HTMLTagLinkStylesheet url from <link rel="stylesheet" href="" />
	HTMLTagLinkStylesheet
	// HTMLTagScript url from <script src="" />
	HTMLTagScript
	// HTTP3xxLocation url from HTTP response code 3xx
	HTTP3xxLocation
)

Variables

This section is empty.

Functions

func LongestCommonPrefix

func LongestCommonPrefix(path1 string, path2 string) string

LongestCommonPrefix returns the common path elements between two paths.

func ReduceURL

func ReduceURL(base *neturl.URL, url *neturl.URL) string

ReduceURL returns the relative version of url from base.

Types

type Crawler

type Crawler interface {
	GetClientTimeout() time.Duration
	SetAutoDownloadDepth(uint64)
	GetAutoDownloadDepth() uint64
	SetNoCrossHost(bool)
	GetNoCrossHost() bool
	AddRequestHeader(string, string)
	SetRequestHeader(string, string)
	GetRequestHeaderValues(string) []string
	SetWorkerCount(uint64) error
	GetWorkerCount() uint64

	SetURLRewriter(func(*url.URL))
	SetOnURLShouldQueue(func(*url.URL) bool)
	SetOnURLShouldDownload(func(*url.URL) bool)
	SetOnDownload(func(*url.URL))
	SetOnDownloaded(func(*Downloaded))

	GetEnqueuedCount() uint64
	GetDownloadedCount() uint64
	GetLinkFoundCount() uint64
	HasStarted() bool
	HasStopped() bool
	IsRunning() bool
	IsBusy() bool

	Start()
	Stop()
	Enqueue(QueueItem)
	Download(QueueItem) *Downloaded
	Downloaded() (*Downloaded, bool)
	DownloadedNotBlocking() *Downloaded
	// contains filtered or unexported methods
}

Crawler represents an object that can process download requests.

func New

func New(client *http.Client, logger *logrus.Logger) Crawler

New returns a new crawler instance

type Downloaded

type Downloaded struct {
	Input *Input

	BaseURL         *url.URL
	Body            string
	Error           error
	LinksAssets     map[string]Link
	LinksDiscovered map[string]Link
	StatusCode      int
	// contains filtered or unexported fields
}

Downloaded represents processed data after downloading

func Download

func Download(input *Input) *Downloaded

Download returns parsed data after downloading the specified url.

func (*Downloaded) AddHeader

func (d *Downloaded) AddHeader(key string, value string)

AddHeader adds a new header

func (*Downloaded) GetAssetURLs

func (d *Downloaded) GetAssetURLs() []*neturl.URL

GetAssetURLs returns resolved asset urls

func (*Downloaded) GetDiscoveredURLs

func (d *Downloaded) GetDiscoveredURLs() []*neturl.URL

GetDiscoveredURLs returns resolved discovered link urls

func (*Downloaded) GetHeaderKeys

func (d *Downloaded) GetHeaderKeys() []string

GetHeaderKeys returns all header keys

func (*Downloaded) GetHeaderValues

func (d *Downloaded) GetHeaderValues(key string) []string

GetHeaderValues returns values of the specified header key

func (*Downloaded) ProcessURL

func (d *Downloaded) ProcessURL(context urlContext, url string) (string, error)

ProcessURL validates url and returns rewritten string representation

func (*Downloaded) Reduce

func (d *Downloaded) Reduce(url *neturl.URL) string

Reduce returns the relative version of url from .Input.URL.

type Input

type Input struct {
	Client      *http.Client
	Header      http.Header
	NoCrossHost bool
	Rewriter    *func(*url.URL)
	URL         *url.URL
}

Input represents a download request ready to be processed

type Link struct {
	Context urlContext
	URL     *url.URL
}

Link represents an extracted link from download result

type QueueItem

type QueueItem struct {
	URL           *url.URL
	Depth         uint64
	ForceDownload bool
}

QueueItem represents a download request in the queue

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL