Documentation ¶
Index ¶
- Constants
- Variables
- type Collector
- func (c *Collector) Clone() *Collector
- func (c *Collector) Cookies(URL string) []*http.Cookie
- func (c *Collector) DisableCookies()
- func (c *Collector) HasVisited(URL string) (bool, error)
- func (c *Collector) Head(URL string) error
- func (c *Collector) Init()
- func (c *Collector) Limit(rule *LimitRule) error
- func (c *Collector) Limits(rules []*LimitRule) error
- func (c *Collector) OnError(f ErrorCallback)
- func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback)
- func (c *Collector) OnHTMLDetach(goquerySelector string)
- func (c *Collector) OnRequest(f RequestCallback)
- func (c *Collector) OnResponse(f ResponseCallback)
- func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback)
- func (c *Collector) OnScraped(f ScrapedCallback)
- func (c *Collector) Post(URL string, requestData map[string]string) error
- func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error
- func (c *Collector) PostRaw(URL string, requestData []byte) error
- func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error
- func (c *Collector) SetClient(client *http.Client)
- func (c *Collector) SetCookieJar(j http.CookieJar)
- func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error
- func (c *Collector) SetDebugger(d debug.Debugger)
- func (c *Collector) SetProxy(proxyURL string) error
- func (c *Collector) SetProxyFunc(p ProxyFunc)
- func (c *Collector) SetRedirectHandler(f func(req *http.Request, via []*http.Request) error)
- func (c *Collector) SetRequestTimeout(timeout time.Duration)
- func (c *Collector) SetStorage(s storage.Storage) error
- func (c *Collector) String() string
- func (c *Collector) UnmarshalRequest(r []byte) (*Request, error)
- func (c *Collector) Visit(URL string) error
- func (c *Collector) Wait()
- func (c *Collector) WithTransport(transport http.RoundTripper)
- type Context
- func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{}
- func (c *Context) Get(key string) string
- func (c *Context) GetAny(key string) interface{}
- func (c *Context) MarshalBinary() (_ []byte, _ error)
- func (c *Context) Put(key string, value interface{})
- func (c *Context) UnmarshalBinary(_ []byte) error
- type ErrorCallback
- type HTMLCallback
- type HTMLElement
- func (h *HTMLElement) Attr(k string) string
- func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
- func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
- func (h *HTMLElement) ChildText(goquerySelector string) string
- func (h *HTMLElement) ChildTexts(goquerySelector string) []string
- func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))
- func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool)
- type LimitRule
- type ProxyFunc
- type Request
- func (r *Request) Abort()
- func (r *Request) AbsoluteURL(u string) string
- func (r *Request) Do() error
- func (r *Request) HasVisited(URL string) (bool, error)
- func (r *Request) Marshal() ([]byte, error)
- func (r *Request) New(URL, method string, body io.Reader) (*Request, error)
- func (r *Request) Post(URL string, requestData map[string]string) error
- func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error
- func (r *Request) PostRaw(URL string, requestData []byte) error
- func (r *Request) Retry() error
- func (r *Request) Visit(URL string) error
- type RequestCallback
- type Response
- type ResponseCallback
- type ResponseHeadersCallback
- type ScrapedCallback
Constants ¶
const ProxyURLKey key = iota
ProxyURLKey 代理地址上下文键
Variables ¶
var (
	// ErrMissingURL 网址为空的错误
	ErrMissingURL = errors.New("Missing URL")
	// ErrAlreadyVisited 重复访问已访问过的网址的错误
	ErrAlreadyVisited = errors.New("URL already visited")
	// ErrNoPattern 没有定义匹配域名规则的错误
	ErrNoPattern = errors.New("No pattern defined in LimitRule")
	// ErrAbortedAfterHeaders OnResponseHeaders中止传输时返回的错误
	ErrAbortedAfterHeaders = errors.New("Aborted after receiving response headers")
	// ErrNoCookieJar 没有启用Cookie的错误
	ErrNoCookieJar = errors.New("Cookie jar is not available")
	// ErrEmptyProxyURL 代理地址列表为空的错误
	ErrEmptyProxyURL = errors.New("Proxy URL list is empty")
	// ErrQueueFull 请求队列已满的错误
	ErrQueueFull = errors.New("Queue MaxSize reached")
)
Functions ¶
This section is empty.
Types ¶
type Collector ¶
type Collector struct {
	// ID Collector的标识符
	ID uint32
	// UserAgent HTTP请求头User-Agent
	UserAgent string
	// MaxBodySize HTTP响应主体接收最大值,0为无限制,默认为10MB
	MaxBodySize int
	// Async 是否异步抓取,需用c.Wait()等待所有HTTP请求完成
	Async bool
	// AllowURLRevisit 是否允许重复抓取同个网址
	AllowURLRevisit bool
	// ParseHTTPErrorResponse 是否处理非2xx的HTTP响应错误
	ParseHTTPErrorResponse bool
	// contains filtered or unexported fields
}
Collector 数据抓取控制类
func (*Collector) HasVisited ¶
HasVisited 检查指定网址是否已经访问
func (*Collector) OnHTML ¶
func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback)
OnHTML 注册HTML处理回调函数
func (*Collector) OnHTMLDetach ¶
OnHTMLDetach 注销HTML处理回调函数
func (*Collector) OnResponse ¶
func (c *Collector) OnResponse(f ResponseCallback)
OnResponse 注册响应处理回调函数
func (*Collector) OnResponseHeaders ¶
func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback)
OnResponseHeaders 注册响应头处理回调函数(响应头已接收但响应主体没有读取时调用)
func (*Collector) OnScraped ¶
func (c *Collector) OnScraped(f ScrapedCallback)
OnScraped 注册抓取完成处理回调函数
func (*Collector) PostMultipart ¶
PostMultipart 发送多部分实体POST请求(包含多部分实体数据)
func (*Collector) Request ¶
func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error
Request 发送指定方法的请求
func (*Collector) SetCookieJar ¶
SetCookieJar 设置Cookie容器(http.CookieJar)
func (*Collector) SetCookies ¶
SetCookies 设置指定URL的Cookie
func (*Collector) SetDebugger ¶
SetDebugger 设置Collector的调试器
func (*Collector) SetProxyFunc ¶
SetProxyFunc 设置代理处理函数
func (*Collector) SetRedirectHandler ¶
SetRedirectHandler 设置重定向处理器
func (*Collector) SetRequestTimeout ¶
SetRequestTimeout 设置请求超时时间(默认10秒)
func (*Collector) SetStorage ¶
SetStorage 设置Collector的存储器
func (*Collector) UnmarshalRequest ¶
UnmarshalRequest 从序列化数据中创建请求
func (*Collector) WithTransport ¶
func (c *Collector) WithTransport(transport http.RoundTripper)
WithTransport 设置http.RoundTripper
type Context ¶
type Context struct {
// contains filtered or unexported fields
}
Context 上下文环境类(用于回调函数中传递数据)
func (*Context) MarshalBinary ¶
MarshalBinary 编码上下文环境
func (*Context) UnmarshalBinary ¶
UnmarshalBinary 解码上下文环境
type HTMLElement ¶
type HTMLElement struct {
	// Index 节点索引
	Index int
	// Name 节点名称
	Name string
	// Text 节点文本内容
	Text string
	// DOM 当前匹配节点
	DOM *goquery.Selection
	// Request 当前请求实例
	Request *Request
	// Response 当前响应实例
	Response *Response
	// contains filtered or unexported fields
}
HTMLElement HTML节点类
func NewHTMLElementFromSelectionNode ¶
func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement
NewHTMLElementFromSelectionNode 从goquery.Selection节点创建HTMLElement实例
func (*HTMLElement) Attr ¶
func (h *HTMLElement) Attr(k string) string
Attr 获取当前节点指定属性的值(如不存在返回空字符串)
func (*HTMLElement) ChildAttr ¶
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
ChildAttr 获取子节点中第一个匹配标签指定属性的值
func (*HTMLElement) ChildAttrs ¶
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
ChildAttrs 获取子节点中所有匹配标签指定属性的值(字符串数组)
func (*HTMLElement) ChildText ¶
func (h *HTMLElement) ChildText(goquerySelector string) string
ChildText 获取子节点中所有匹配标签的文本内容(组合在一起)
func (*HTMLElement) ChildTexts ¶
func (h *HTMLElement) ChildTexts(goquerySelector string) []string
ChildTexts 获取子节点中所有匹配标签的文本内容(字符串数组)
func (*HTMLElement) ForEach ¶
func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))
ForEach 对每个匹配标签执行指定回调函数
func (*HTMLElement) ForEachWithBreak ¶
func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool)
ForEachWithBreak 对每个匹配标签执行指定回调函数(可终止)
type LimitRule ¶
type LimitRule struct {
	// DomainRegexp 匹配域名的正则表达式
	DomainRegexp string
	// Delay 请求之间等待的时间
	Delay time.Duration
	// RandomDelay 请求之间额外随机等待的时间
	RandomDelay time.Duration
	// Parallelism 允许的最大并发请求数
	Parallelism int
	// contains filtered or unexported fields
}
LimitRule 请求限制规则
type Request ¶
type Request struct {
	// ID 请求标识符
	ID uint32
	// URL 请求网址
	URL *url.URL
	// Method 请求方法
	Method string
	// Headers 请求头
	Headers *http.Header
	// Body 请求主体(POST/PUT)
	Body io.Reader
	// Ctx 上下文环境
	Ctx *Context
	// ResponseCharacterEncoding 响应体字符编码(为空自动检测)
	ResponseCharacterEncoding string
	// ProxyURL 代理地址
	ProxyURL string
	// contains filtered or unexported fields
}
Request 请求类
func (*Request) AbsoluteURL ¶
AbsoluteURL 解析为绝对网址(不包含Fragment部分)
func (*Request) HasVisited ¶
HasVisited 检查指定网址是否已经访问
func (*Request) PostMultipart ¶
PostMultipart 发送多部分实体POST请求(包含多部分实体数据,保留上下文环境)
type Response ¶
type Response struct {
	// StatusCode 响应状态码
	StatusCode int
	// Headers 响应头
	Headers *http.Header
	// Body 响应主体
	Body []byte
	// Request 当前请求实例
	Request *Request
	// Ctx 上下文环境
	Ctx *Context
}
Response 响应类
type ResponseHeadersCallback ¶
type ResponseHeadersCallback func(*Response)
ResponseHeadersCallback OnResponseHeaders回调函数