spy

package module
v0.0.0-...-8fffa5a Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 15, 2017 License: Apache-2.0 Imports: 32 Imported by: 0

README

spy

Documentation

Index

Constants

View Source
const (
	MIMEJSON              = "application/json"
	MIMEHTML              = "text/html"
	MIMEXML               = "application/xml"
	MIMEXMLText           = "text/xml"
	MIMEPlain             = "text/plain"
	MIMEPOSTForm          = "application/x-www-form-urlencoded"
	MIMEMultipartPOSTForm = "multipart/form-data"
	MIMEPROTOBUF          = "application/x-protobuf"
)
View Source
const ItemPipelines = "ItemPipelines"

Variables

View Source
var (
	ErrSpiderClosed  = errors.New("spider closed")
	ErrItemDropped   = errors.New("item dropped")
	ErrIgnoreRequest = errors.New("request ignored")
)
View Source
var IgnoredExtensions = []string{

	"mng", "pct", "bmp", "gif ", "jpg", "jpeg", "png", "pst", "psp", "tif",
	"tiff", "ai", "drw", "dxf", "eps", "ps", "svg",

	"mp3", "wma", "ogg", "wav", "ra", "aac", "mid", "au", "aiff",

	"3gp", "asf", "asx", "avi", "mov", "mp4", "mpg", "qt", "rm", "swf", "wmv",
	"m4a",

	"xls", "xlsx", "ppt", "pptx", "pps", "doc", "docx", "odt", "ods", "odg",
	"odp",

	"css", "pdf", "exe", "bin", "rss", "zip", "rar",
}

Common file extensions that are not followed if they occur in links.

Functions

This section is empty.

Types

type Closer

type Closer interface {
	Close(spider ISpider)
}

type Config

type Config struct {
	*viper.Viper

	ConcurrentRequests          int
	ConcurrentRequestsPerDomain int
	ConcurrentRequestsPerIp     int
	RandomizeFetchDelay         bool
	FetchDelay                  float64
}

type CrawlSpider

type CrawlSpider struct {
	*Spider
}

func (*CrawlSpider) Parse

func (s *CrawlSpider) Parse(response *Response) (*SpiderResult, error)

type Crawler

type Crawler struct {
	*Config
	*logrus.Logger
	*Stats

	Concurrency int

	Spider    ISpider
	Scheduler IScheduler
	Fetcher   IFetcher
	*SpiderMiddlewareManager
	*ItemPipelineManager

	*tunny.WorkPool
	*concurrency.Worker
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(spider ISpider, scheduler IScheduler) *Crawler

func (*Crawler) Pause

func (c *Crawler) Pause()

func (*Crawler) Resume

func (c *Crawler) Resume()

func (*Crawler) Start

func (c *Crawler) Start()

func (*Crawler) Stop

func (c *Crawler) Stop()

type DupeFilter

type DupeFilter interface {
	Opener
	Closer
	SeenRequest(request *Request) bool
}

type Event

type Event string
const (
	CrawlerStarted     Event = "CrawlerStarted"
	CrawlerStopped     Event = "CrawlerStopped"
	SpiderOpened       Event = "SpiderOpened"
	SpiderIdle         Event = "SpiderIdle"
	SpiderClosed       Event = "SpiderClosed"
	SpiderError        Event = "SpiderError"
	RequestScheduled   Event = "RequestScheduled"
	RequestDropped     Event = "RequestDropped"
	ResponseReceived   Event = "ResponseReceived"
	ResponseDownloaded Event = "ResponseDownloaded"
	ItemScraped        Event = "ItemScraped"
	ItemDropped        Event = "ItemDropped"
)

func (Event) Pub

func (e Event) Pub(args ...interface{})

func (Event) Sub

func (e Event) Sub(fn interface{})

func (Event) SubAsync

func (e Event) SubAsync(fn interface{})

func (Event) Unsub

func (e Event) Unsub(fn interface{})

type Fetcher

type Fetcher struct {
	TotalConcurrency  int
	DomainConcurrency int
	IpConcurrency     int
	Delay             time.Duration
	RandomizeDelay    bool
	// contains filtered or unexported fields
}

func NewFetcher

func NewFetcher() *Fetcher

func (*Fetcher) Close

func (f *Fetcher) Close(spider ISpider)

func (*Fetcher) Fetch

func (f *Fetcher) Fetch(req *Request, spider ISpider) (*Response, *Request, error)

func (*Fetcher) NeedsBackout

func (f *Fetcher) NeedsBackout() bool

func (*Fetcher) Open

func (f *Fetcher) Open(spider ISpider)

type FetcherHandler

type FetcherHandler interface {
	Fetch(request *Request, spider ISpider) (*Response, error)
	Close()
}

type FetcherMiddleware

type FetcherMiddleware interface{}

type FetcherMiddlewareManager

type FetcherMiddlewareManager struct {
	// contains filtered or unexported fields
}

func (*FetcherMiddlewareManager) Register

func (fmm *FetcherMiddlewareManager) Register(middleware FetcherMiddleware)

type FetchingErrorProcessor

type FetchingErrorProcessor interface {
	/* ProcessError is called when a fetcher handler or a ProcessRequest (from a fetcher middleware) returns an error.
	It should return: either nils, a Response object, or a Request object.

	If it returns nils, will continue processing this error, executing any other ProcessError methods of installed
	middleware, until no middleware is left.

	If it returns a Response object, the ProcessResponse methods chain of installed middleware is started,
	and won’t bother calling any other ProcessError methods of middleware.

	If it returns a Request object, the returned request is rescheduled to be performed in the future.
	This stops the execution of ProcessError methods of the middleware the same as returning a response would.
	*/
	ProcessError(err error, request *Request, spider ISpider) (*Response, *Request)
}

type FetchingRequestProcessor

type FetchingRequestProcessor interface {
	/* ProcessRequest is called for each request that goes through the fetcher middleware.
	It should either: return nils, return a Response object, return a Request object, or return an IgnoreRequest error.

	If it returns nils, will continue processing this request, executing all other middlewares,
	until, finally, the appropriate fetcher handler performs the request (and its response is downloaded).

	If it returns a Response object, won’t bother calling any other ProcessRequest or ProcessError methods,
	or the appropriate fetcher handler; it’ll return that response. The ProcessResponse methods of installed middleware
	is always called on every response.

	If it returns a Request object, will stop calling ProcessRequest methods and reschedule the returned request.
	Once the newly returned request is performed, the appropriate middleware chain will again be called on the downloaded response.

	If it returns an IgnoreRequest error, the ProcessError methods of installed downloader middleware will be called.
	If none of them handle the error, the callback function of the request (Request.Callback) is called.
	If no code handles the returned error, it is ignored and not logged (unlike other errors).
	*/
	ProcessRequest(request *Request, spider ISpider) (*Response, *Request, error)
}

type FetchingResponseProcessor

type FetchingResponseProcessor interface {
	/* ProcessResponse should either: return a Response object, return a Request object or return an IgnoreRequest error.

	If it returns a Response (it could be the same given response, or a brand-new one), that response will continue to be
	processed with the ProcessResponse method of the next middleware in the chain.

	If it returns a Request object, the middleware chain is halted and the returned request is rescheduled to be performed
	in the future. This is the same behavior as if a request is returned from ProcessRequest.

	If it returns an IgnoreRequest error, the callback function of the request (Request.Callback) is called.
	If no code handles the returned error, it is ignored and not logged (unlike other errors).
	*/
	ProcessResponse(response *Response, request *Request, spider ISpider) (*Response, *Request, error)
}

type FingerprintDupeFilter

type FingerprintDupeFilter struct {
	*logrus.Logger
	// contains filtered or unexported fields
}

func NewFingerprintDupeFilter

func NewFingerprintDupeFilter(logger *logrus.Logger, filename ...string) *FingerprintDupeFilter

func (*FingerprintDupeFilter) Close

func (f *FingerprintDupeFilter) Close(spider ISpider)

func (*FingerprintDupeFilter) Open

func (f *FingerprintDupeFilter) Open(spider ISpider)

func (*FingerprintDupeFilter) SeenRequest

func (f *FingerprintDupeFilter) SeenRequest(request *Request) bool

type FromConfiger

type FromConfiger interface {
	FromConfig(config Config)
}

type FromCrawlerer

type FromCrawlerer interface {
	FromCrawler(crawler Crawler)
}

type GoquerySelector

type GoquerySelector struct {
	*goquery.Selection
}

func NewGoquerySelector

func NewGoquerySelector(doc *goquery.Document) *GoquerySelector

func (*GoquerySelector) Attr

func (gs *GoquerySelector) Attr(attrName string) (val string, exists bool)

func (*GoquerySelector) Extract

func (gs *GoquerySelector) Extract() string

func (*GoquerySelector) Regex

func (gs *GoquerySelector) Regex(regex interface{}) []string

func (*GoquerySelector) Select

func (gs *GoquerySelector) Select(query string) Selectors

type HTMLLinkExtractor

type HTMLLinkExtractor struct {
	// Regular expressions that the (absolute) urls must match in order to be extracted.
	// If empty, it will match all links.
	Allows []string

	// regular expressions that the (absolute) urls must match in order to be excluded.
	// It has precedence over the Allows parameter.
	// If empty, it won't exclude any links.
	Denies []string

	// Domains which will be considered for extracting the links.
	AllowDomains []string

	// Domains which won't be considered for extracting the links.
	DenyDomains []string

	// File extensions that should be ignored when extracting links.
	// If empty, it will default to the IgnoredExtensions.
	DenyExtensions []string

	// Selectors which define regions inside the response where links should be extracted from.
	// If given, only the text selected by those selectors will be scanned for links.
	RestrictSelectors []string

	// Whether duplicate filtering should be applied to extracted links.
	// Defaults to false.
	Unique bool

	// Function which receives each url value extracted from the tag and attributes scanned
	// and can modify the value and return a new one, or return nil to ignore the link altogether.
	// If not given, defaults to the untouched link.
	ProcessValue func(value *url.URL) *url.URL

	// a list of tags to consider when extracting links.
	// Defaults to {"a", "area"}.
	Tags []string

	// Attributes which should be considered when looking for links to extract.
	// Only for those tags specified in the tags parameter.
	// Defaults to {"href"}.
	Attrs []string
	// contains filtered or unexported fields
}
func (hle *HTMLLinkExtractor) ExtractLinks(response *Response) []*Link

func (*HTMLLinkExtractor) Init

func (hle *HTMLLinkExtractor) Init()

type IFetcher

type IFetcher interface {
	NeedsBackout() bool
	Fetch(req *Request, spider ISpider) (*Response, *Request, error)
	Opener
	Closer
}

type IScheduler

type IScheduler interface {
	Opener
	Closer
	EnqueueRequest(request *Request) bool
	NextRequest() *Request
}

type ISpider

type ISpider interface {
	StartRequests() []*Request
	Parse(response *Response) (*SpiderResult, error)
	FetchDelay() time.Duration
	ConcurrentRequests() int
	String() string
	Crawler() *Crawler
}

type Item

type Item map[string]interface{}

type ItemPipelineManager

type ItemPipelineManager struct {
	// contains filtered or unexported fields
}

func (*ItemPipelineManager) Close

func (ipm *ItemPipelineManager) Close(spider ISpider)

func (*ItemPipelineManager) Open

func (ipm *ItemPipelineManager) Open(spider ISpider)

func (*ItemPipelineManager) ProcessItem

func (ipm *ItemPipelineManager) ProcessItem(item *Item, spider ISpider) (*Item, error)

func (*ItemPipelineManager) Register

func (ipm *ItemPipelineManager) Register(middleware ItemPipelineMiddleware)

type ItemPipelineMiddleware

type ItemPipelineMiddleware interface{}

type ItemProcessor

type ItemProcessor interface {
	ProcessItem(item *Item, spider ISpider) (*Item, error)
}
type Link struct {
	// contains filtered or unexported fields
}

Link represents an extracted link.

func (*Link) String

func (l *Link) String() string

type LinkExtractor

type LinkExtractor interface {
	ExtractLinks(response *Response) ([]Link, error)
}

type Middleware

type Middleware interface {
	OnSpiderOpeneder
	OnSpiderCloseder
}

type MiddlewareManager

type MiddlewareManager struct {
	// contains filtered or unexported fields
}

func NewMiddlewareManager

func NewMiddlewareManager() *MiddlewareManager

func (*MiddlewareManager) OnSpiderClosed

func (mm *MiddlewareManager) OnSpiderClosed(spider Spider)

func (*MiddlewareManager) OnSpiderOpened

func (mm *MiddlewareManager) OnSpiderOpened(spider Spider)

func (*MiddlewareManager) Register

func (mm *MiddlewareManager) Register(middleware Middleware)

type MiddlewareManagerIterator

type MiddlewareManagerIterator struct {
	// contains filtered or unexported fields
}

func (*MiddlewareManagerIterator) HasNext

func (mmi *MiddlewareManagerIterator) HasNext() bool

func (*MiddlewareManagerIterator) Next

func (mmi *MiddlewareManagerIterator) Next() interface{}

type OnSpiderCloseder

type OnSpiderCloseder interface {
	OnSpiderClosed(spider ISpider)
}

type OnSpiderOpeneder

type OnSpiderOpeneder interface {
	OnSpiderOpened(spider ISpider)
}

type Opener

type Opener interface {
	Open(spider ISpider)
}

type Request

type Request struct {
	*http.Request
	Error     error
	Meta      map[string]interface{}
	NotFilter bool
	Callback  func(response *Response, err error) (*SpiderResult, error)
}

func NewRequest

func NewRequest(urlStr, method string) *Request

func (*Request) Fingerprint

func (req *Request) Fingerprint() string

Fingerprint returns a hash that uniquely identifies the request. Ignore all headers.

type Response

type Response struct {
	*http.Response

	MediaType string
	HTMLDoc   *goquery.Document

	/* Request which generated this response.
		This attribute is assigned in the `Crawler`, after the response and the request have passed
	    through all `Fetcher Middlewares`. In particular, this means that:

	    - HTTP redirections will cause the original request (to the URL before
	      redirection) to be assigned to the redirected response (with the final
	      URL after redirection).

	    - Response.Request.URL doesn't always equal Response.Response.URL

	    - This attribute is only available in the spider code, and in the `Spider Middlewares`,
	      but not in `Downloader Middlewares` (although you have the Request available there by
	      other means) and handlers of the `response_downloaded` signal.
	*/
	*Request
	// contains filtered or unexported fields
}

func NewResponse

func NewResponse(hr *http.Response) (r *Response, err error)

func (*Response) Close

func (r *Response) Close()

func (*Response) ContentType

func (r *Response) ContentType() string

func (*Response) JSON

func (r *Response) JSON(v interface{}) error

func (*Response) Select

func (r *Response) Select(query string) Selectors

func (*Response) Selector

func (r *Response) Selector() Selector

func (*Response) Text

func (r *Response) Text() (text string, err error)

func (*Response) XML

func (r *Response) XML(v interface{}) error

type Rule

type Rule struct {
}

type Scheduler

type Scheduler struct {
	// contains filtered or unexported fields
}

func NewScheduler

func NewScheduler() *Scheduler

func (*Scheduler) Close

func (s *Scheduler) Close(spider ISpider)

func (*Scheduler) EnqueueRequest

func (s *Scheduler) EnqueueRequest(request *Request) bool

func (*Scheduler) NextRequest

func (s *Scheduler) NextRequest() *Request

func (*Scheduler) Open

func (s *Scheduler) Open(spider ISpider)

type Selector

type Selector interface {
	Select(query string) Selectors
	Regex(regex interface{}) []string
	Extract() string
	Attr(attrName string) (val string, exists bool)
}

type Selectors

type Selectors []Selector

func (Selectors) Attrs

func (ss Selectors) Attrs(attrName string) []string

func (Selectors) Extract

func (ss Selectors) Extract() []string

func (Selectors) ExtractFirst

func (ss Selectors) ExtractFirst() string

func (Selectors) Regex

func (ss Selectors) Regex(regex interface{}) []string

func (Selectors) RegexFirst

func (ss Selectors) RegexFirst(regex interface{}) string

func (Selectors) Select

func (ss Selectors) Select(query string) Selectors

type Spider

type Spider struct {
	Name      string
	StartURLs []string
}

func (*Spider) Parse

func (s *Spider) Parse(response *Response) (*SpiderResult, error)

func (*Spider) StartResusts

func (s *Spider) StartResusts() []*Request

func (*Spider) String

func (s *Spider) String() string

type SpiderErrorProcessor

type SpiderErrorProcessor interface {
	ProcessSpiderError(err error, response *Response, spider ISpider) (*SpiderResult, error)
}

type SpiderInputProcessor

type SpiderInputProcessor interface {
	ProcessSpiderInput(response *Response, spider ISpider) error
}

type SpiderMiddleware

type SpiderMiddleware interface{}

type SpiderMiddlewareManager

type SpiderMiddlewareManager struct {
	// contains filtered or unexported fields
}

func (*SpiderMiddlewareManager) ProcessStartRequests

func (smm *SpiderMiddlewareManager) ProcessStartRequests(startRequests []*Request, spider ISpider) ([]*Request, error)

func (*SpiderMiddlewareManager) Register

func (smm *SpiderMiddlewareManager) Register(middleware SpiderMiddleware)

func (*SpiderMiddlewareManager) ScrapeResponse

func (smm *SpiderMiddlewareManager) ScrapeResponse(request *Request, response *Response, spider ISpider) (*SpiderResult, error)

type SpiderOutputProcessor

type SpiderOutputProcessor interface {
	ProcessSpiderOutput(result *SpiderResult, response *Response, spider ISpider) (*SpiderResult, error)
}

type SpiderResult

type SpiderResult struct {
	Requests []*Request
	Items    []*Item
}

func (*SpiderResult) Empty

func (sr *SpiderResult) Empty() bool

type StartRequestsProcessor

type StartRequestsProcessor interface {
	ProcessStartRequests(startRequests []*Request, spider ISpider) ([]*Request, error)
}

type Stats

type Stats struct {
	Name string
	// contains filtered or unexported fields
}

func NewStats

func NewStats(name string) *Stats

func (*Stats) Clear

func (stats *Stats) Clear()

func (*Stats) Close

func (stats *Stats) Close(spider ISpider)

func (*Stats) Del

func (stats *Stats) Del(key string)

func (*Stats) Get

func (stats *Stats) Get(key string) uint64

func (*Stats) GetStr

func (stats *Stats) GetStr(key string) string

func (*Stats) Inc

func (stats *Stats) Inc(key string)

func (*Stats) Max

func (stats *Stats) Max(key string, value uint64)

func (*Stats) Min

func (stats *Stats) Min(key string, value uint64)

func (*Stats) Open

func (stats *Stats) Open(spider ISpider)

func (*Stats) SetStr

func (stats *Stats) SetStr(key, value string)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL