Documentation
¶
Index ¶
- Constants
- Variables
- func ReleaseRequest(req *Request)
- func ReleaseResponse(resp *Response, releaseCtx bool)
- type AcquireProxies
- type CacheCondition
- type ComplementProxyPool
- type Crawler
- func (c *Crawler) AddProxy(newProxy string)
- func (c *Crawler) AfterResponse(f HandleResponse)
- func (c *Crawler) BeforeRequest(f HandleRequest)
- func (c *Crawler) ClearCache() error
- func (c *Crawler) Clone() *Crawler
- func (c *Crawler) Debug(msg string, args ...map[string]interface{})
- func (c *Crawler) Error(err error, args ...map[string]interface{})
- func (c *Crawler) Fatal(err error, args ...map[string]interface{})
- func (c *Crawler) FatalOrPanic(err error)
- func (c *Crawler) Get(URL string) error
- func (c *Crawler) GetWithCtx(URL string, ctx pctx.Context) error
- func (c *Crawler) Info(msg string, args ...map[string]interface{})
- func (c *Crawler) ParseHTML(selector string, f HandleHTML)
- func (c *Crawler) Post(URL string, requestData map[string]string, ctx pctx.Context) error
- func (c *Crawler) PostJSON(URL string, requestData map[string]interface{}, ctx pctx.Context) error
- func (c *Crawler) PostMultipart(URL string, form *MultipartForm, ctx pctx.Context) error
- func (c *Crawler) PostRaw(URL string, body []byte, ctx pctx.Context) error
- func (c *Crawler) ProxyDialerWithTimeout(proxyAddr string, timeout time.Duration) fasthttp.DialFunc
- func (c Crawler) ProxyPoolAmount() int
- func (c *Crawler) SetProxyInvalidCondition(condition ProxyInvalidCondition)
- func (c *Crawler) Wait()
- func (c *Crawler) Warning(msg string, args ...map[string]interface{})
- type CrawlerOption
- func EnableIPv6() CrawlerOption
- func SkipVerification() CrawlerOption
- func WithCache(cc cache.Cache, compressed bool, cacheCondition CacheCondition, ...) CrawlerOption
- func WithComplementProxyPool(f ComplementProxyPool) CrawlerOption
- func WithConcurrency(count uint64) CrawlerOption
- func WithCookies(cookies map[string]string) CrawlerOption
- func WithDefaultCache() CrawlerOption
- func WithDefaultLogger() CrawlerOption
- func WithLogger(lop *LogOp) CrawlerOption
- func WithProxy(proxyURL string) CrawlerOption
- func WithProxyPool(proxyURLs []string) CrawlerOption
- func WithRawCookie(cookie string) CrawlerOption
- func WithRetry(count uint32, cond RetryConditions) CrawlerOption
- func WithUserAgent(ua string) CrawlerOption
- type CustomRandomBoundary
- type HTMLParser
- type HandleHTML
- type HandleRequest
- type HandleResponse
- type LogOp
- type MultipartForm
- type Pool
- type ProxyInvalidCondition
- type Request
- func (r *Request) Abort()
- func (r Request) AbsoluteURL(src string) string
- func (r *Request) AllowRedirect(maxRedirectsCount uint)
- func (r Request) Get(u string) error
- func (r Request) GetWithCache(URL string, cacheFields ...string) error
- func (r Request) Hash() (string, error)
- func (r *Request) New(method, URL string, body []byte) *Request
- func (r Request) NumberOfRetries() uint32
- func (r Request) Post(URL string, requestData map[string]string) error
- func (r Request) PostJSON(URL string, requestData map[string]interface{}) error
- func (r Request) PostJSONWithCache(URL string, requestData map[string]interface{}, cacheFields ...string) error
- func (r Request) PostMultipart(URL string, form *MultipartForm) error
- func (r Request) PostMultipartWithCache(URL string, form *MultipartForm, cacheFields ...string) error
- func (r Request) PostWithCache(URL string, requestData map[string]string, cacheFields ...string) error
- func (r Request) Request(method, URL string, cachedMap map[string]string, body []byte) error
- func (r *Request) Reset()
- func (r *Request) SetContentType(contentType string)
- func (r *Request) SetHeaders(headers map[string]string)
- func (r *Request) SetTimeout(t time.Duration)
- type Response
- func (r Response) ClientIP() net.Addr
- func (r *Response) ContentType() string
- func (r *Response) GetSetCookie() string
- func (r Response) LocalIP() net.Addr
- func (r Response) Marshal() ([]byte, error)
- func (r *Response) Reset(releaseCtx bool)
- func (r *Response) Save(fileName string) error
- func (r *Response) String() string
- type RetryConditions
- type Task
Constants ¶
const (
	RUNNING = 1
	STOPED  = 0
)
running status
Variables ¶
var (
	// return if pool size <= 0
	ErrInvalidPoolCap = errors.New("invalid pool cap")
	// put task but pool already closed
	ErrPoolAlreadyClosed = errors.New("pool already closed")
)
errors
var (
ErrIncorrectResponse = errors.New("the response status code is not 20X")
)
var ErrNoCacheSet = errors.New("no cache set")
Functions ¶
func ReleaseRequest ¶
func ReleaseRequest(req *Request)
ReleaseRequest returns req acquired via AcquireRequest to request pool.
It is forbidden to access req and/or its members after returning it to the request pool.
func ReleaseResponse ¶
ReleaseResponse returns resp acquired via AcquireResponse to response pool.
It is forbidden to access resp and/or its members after returning it to the response pool.
Types ¶
type CacheCondition ¶ added in v0.2.0
type ComplementProxyPool ¶ added in v0.2.0
type ComplementProxyPool func() []string
type Crawler ¶
type Crawler struct {
// UserAgent is the User-Agent string used by HTTP requests
UserAgent string
// 在多协程中这个上下文管理可以用来退出或取消多个协程
Context context.Context
// contains filtered or unexported fields
}
Crawler is the provider of crawlers
func NewCrawler ¶
func NewCrawler(opts ...CrawlerOption) *Crawler
NewCrawler creates a new Crawler instance with some CrawlerOptions
func (*Crawler) AfterResponse ¶
func (c *Crawler) AfterResponse(f HandleResponse)
AfterResponse is used to process the response. This method should be used for response bodies in non-HTML formats.
func (*Crawler) BeforeRequest ¶
func (c *Crawler) BeforeRequest(f HandleRequest)
BeforeRequest is used to process requests, such as setting headers, passing context, etc.
func (*Crawler) FatalOrPanic ¶ added in v0.2.0
func (*Crawler) GetWithCtx ¶ added in v0.2.0
GetWithCtx is used to send GET requests with a context
func (*Crawler) ParseHTML ¶
func (c *Crawler) ParseHTML(selector string, f HandleHTML)
ParseHTML parses HTML to find the data you need and processes that data
func (*Crawler) PostMultipart ¶
PostMultipart
func (*Crawler) ProxyDialerWithTimeout ¶ added in v0.2.0
func (Crawler) ProxyPoolAmount ¶
ProxyPoolAmount returns the number of proxies in the proxy pool
func (*Crawler) SetProxyInvalidCondition ¶ added in v0.2.0
func (c *Crawler) SetProxyInvalidCondition(condition ProxyInvalidCondition)
type CrawlerOption ¶
type CrawlerOption func(*Crawler)
func EnableIPv6 ¶ added in v0.2.0
func EnableIPv6() CrawlerOption
func SkipVerification ¶
func SkipVerification() CrawlerOption
SkipVerification will skip verifying the certificate when you access the `https` protocol
func WithCache ¶
func WithCache(cc cache.Cache, compressed bool, cacheCondition CacheCondition, cacheFileds ...string) CrawlerOption
WithCache 使用缓存,可以选择是否压缩缓存的响应。使用缓存时,如果发出的是 POST 请求,最好传入能代表请求体的唯一性的缓存字段,可以是零个、一个或多个。
注意:当不传入缓存字段时,将会默认采用整个请求体作为缓存标识,但由于 map 无序,同一个请求体生成的 key 很难保证相同,所以可能会有同一个请求缓存多次,或者无法从缓存中读取已请求过的请求的响应的情况出现。
func WithComplementProxyPool ¶ added in v0.2.0
func WithComplementProxyPool(f ComplementProxyPool) CrawlerOption
func WithConcurrency ¶
func WithConcurrency(count uint64) CrawlerOption
WithConcurrency 使用并发,参数为要创建的协程池数量
func WithCookies ¶
func WithCookies(cookies map[string]string) CrawlerOption
func WithDefaultCache ¶ added in v0.2.0
func WithDefaultCache() CrawlerOption
WithDefaultCache 默认缓存为 sqlite3,不压缩
func WithDefaultLogger ¶ added in v0.2.0
func WithDefaultLogger() CrawlerOption
func WithLogger ¶
func WithLogger(lop *LogOp) CrawlerOption
func WithRawCookie ¶
func WithRawCookie(cookie string) CrawlerOption
func WithRetry ¶
func WithRetry(count uint32, cond RetryConditions) CrawlerOption
WithRetry 请求失败时重试多少次,什么条件的响应是请求失败
func WithUserAgent ¶
func WithUserAgent(ua string) CrawlerOption
type CustomRandomBoundary ¶
type CustomRandomBoundary func() string
CustomRandomBoundary generates a custom boundary
type HTMLParser ¶
type HTMLParser struct {
Selector string
Handle HandleHTML
}
HTMLParser is used to parse html
type HandleHTML ¶
type HandleHTML func(he *html.HTMLElement, r *Response)
HandleHTML is used to process html
type HandleResponse ¶
type HandleResponse func(r *Response)
HandleResponse is used to handle the response
type LogOp ¶
func (*LogOp) ToConsoleAndFile ¶
type MultipartForm ¶
type MultipartForm struct {
// contains filtered or unexported fields
}
MultipartForm 请求体的构造
func NewMultipartForm ¶
func NewMultipartForm(dash string, f CustomRandomBoundary) *MultipartForm
func (*MultipartForm) AppendFile ¶
func (mf *MultipartForm) AppendFile(name, filePath string) error
func (*MultipartForm) AppendString ¶
func (mf *MultipartForm) AppendString(name, value string)
func (*MultipartForm) Boundary ¶
func (mf *MultipartForm) Boundary() string
Boundary returns the Writer's boundary.
func (*MultipartForm) Bytes ¶
func (mf *MultipartForm) Bytes() []byte
func (*MultipartForm) FormDataContentType ¶
func (mf *MultipartForm) FormDataContentType() string
FormDataContentType returns the Content-Type for an HTTP multipart/form-data with this Writer's Boundary.
type ProxyInvalidCondition ¶ added in v0.2.0
type Request ¶
type Request struct {
// 访问的链接
URL string
// 请求方法
Method string
// 请求头
Headers *fasthttp.RequestHeader
// 请求和响应之间共享的上下文
Ctx pctx.Context
// 请求体
Body []byte
// 唯一标识
ID uint32
// contains filtered or unexported fields
}
func AcquireRequest ¶
func AcquireRequest() *Request
AcquireRequest returns an empty Request instance from request pool.
The returned Request instance may be passed to ReleaseRequest when it is no longer needed. This allows Request recycling, reduces GC pressure and usually improves performance.
func (Request) AbsoluteURL ¶
AbsoluteURL returns the resolved absolute URL of a URL chunk. AbsoluteURL returns an empty string if the URL chunk is a fragment or could not be parsed
func (*Request) AllowRedirect ¶
AllowRedirect allows the request to be redirected up to `maxRedirectsCount` times.
func (Request) GetWithCache ¶ added in v0.2.0
func (Request) NumberOfRetries ¶
func (Request) PostJSONWithCache ¶ added in v0.2.0
func (Request) PostMultipart ¶ added in v0.2.0
func (r Request) PostMultipart(URL string, form *MultipartForm) error
func (Request) PostMultipartWithCache ¶ added in v0.2.0
func (r Request) PostMultipartWithCache(URL string, form *MultipartForm, cacheFields ...string) error
func (Request) PostWithCache ¶ added in v0.2.0
func (*Request) SetContentType ¶
func (*Request) SetHeaders ¶
func (*Request) SetTimeout ¶ added in v0.1.9
SetTimeout sets the waiting time for each request before the remote end returns a response.
The function doesn't follow redirects.
type Response ¶
type Response struct {
// 响应状态码
StatusCode int
// 二进制响应体
Body []byte
// 请求和响应之间共享的上下文
Ctx ctx.Context `json:"-"`
// 响应对应的请求
Request *Request `json:"-"`
// 响应头
Headers fasthttp.ResponseHeader
// 是否从缓存中取得的响应
FromCache bool
// contains filtered or unexported fields
}
func AcquireResponse ¶
func AcquireResponse() *Response
AcquireResponse returns an empty Response instance from response pool.
The returned Response instance may be passed to ReleaseResponse when it is no longer needed. This allows Response recycling, reduces GC pressure and usually improves performance.