README

Goribot

一个分布式友好的轻量的 Golang 爬虫框架。

完整文档 | Document

GitHub go.mod Go version GitHub tag (latest by date) codecov go-report license code-size

🚀Feature

版本警告

Goribot 仅支持 Go1.13 及以上版本。

👜获取 Goribot

go get -u github.com/zhshch2002/goribot

Goribot 包含一个历史开发版本,如果您需要使用那个版本,请拉取 Tag 为 v0.0.1 的版本。

⚡建立你的第一个项目

package main

import (
	"fmt"
	"github.com/zhshch2002/goribot"
)

// main builds a spider, queues a single GET task and runs the spider.
func main() {
	s := goribot.NewSpider()

	s.AddTask(
		// Use Get rather than GetReq: GetReq is a deprecated alias of Get
		// and will be removed in the next major version.
		goribot.Get("https://httpbin.org/get"),
		func(ctx *goribot.Context) {
			// Text is the response content parsed as a string.
			fmt.Println(ctx.Resp.Text)
			// Json queries the JSON parsed from the response by path.
			fmt.Println(ctx.Resp.Json("headers.User-Agent"))
		},
	)

	s.Run()
}

🎉完成

至此你已经可以使用 Goribot 了。更多内容请从 开始使用 了解。

🙏感谢

万分感谢以上项目的帮助🙏。

Documentation

Index

Constants

View Source
const DeduplicateSuffix = "_deduplicate"
View Source
const ItemsSuffix = "_items"
View Source
const TasksSuffix = "_tasks"

Variables

View Source
var Do = D.Do
View Source
var ErrRunFinishedSpider = errors.New("running a spider which is finished,you could recreate this spider and run the new one")
View Source
var GetReq = Get

    Deprecated: will be removed in the next major version; use Get instead.

    View Source
    var Log = logging.MustGetLogger("goribot")
    View Source
    var PostReq = Post

      Deprecated: will be removed in the next major version; use Post instead.

      Functions

      func AddCookieToJar

      func AddCookieToJar(urlAddr string, cookies ...*http.Cookie) func(s *Spider)

        AddCookieToJar is an extension add a cookie to downloader's cookie jar

        func GetRequestHash

        func GetRequestHash(r *Request) [md5.Size]byte

          GetRequestHash return a hash of url,header,cookie and body data from a request

          func Limiter

          func Limiter(WhiteList bool, rules ...*LimitRule) func(s *Spider)

          func RandomProxy

          func RandomProxy(p ...string) func(s *Spider)

            RandomProxy is an extension that sets a random proxy URL for each new task.

            func RandomUserAgent

            func RandomUserAgent() func(s *Spider)

              RandomUserAgent is an extension that sets a random User-Agent for each new task.

              func RedisDistributed

              func RedisDistributed(ro *redis.Options, sName string, useDeduplicate bool, onSeedHandler CtxHandlerFun) func(s *Spider)

              func RedisReqDeduplicate

              func RedisReqDeduplicate(r *redis.Client, sName string) func(s *Spider)

                RedisReqDeduplicate is an extension that deduplicates new tasks using Redis, so that deduplication also works for distributed spiders.

                func RefererFiller

                func RefererFiller() func(s *Spider)

                  RefererFiller is an extension can add Referer for new task

                  func ReqDeduplicate

                  func ReqDeduplicate() func(s *Spider)

                    ReqDeduplicate is an extension can deduplicate new task

                    func Retry

                    func Retry(maxTimes int, okcode ...int) func(s *Spider)

                      Retry is an extension that makes a new request when a response comes back with an error.

                      func RobotsTxt

                      func RobotsTxt(baseUrl, ua string) func(s *Spider)

                        RobotsTxt is an extension can parse the robots.txt and follow it

                        func SaveItemsAsCSV

                        func SaveItemsAsCSV(f *os.File) func(s *Spider)

                          SaveItemsAsCSV is an extension that saves items to a CSV file.

                          func SaveItemsAsJSON

                          func SaveItemsAsJSON(f *os.File) func(s *Spider)

                            SaveItemsAsJSON is an extension that saves items to a JSON file.

                            func SetDepthFirst

                            func SetDepthFirst(d bool) func(s *Spider)

                              SetDepthFirst is an extension change Scheduler DepthFirst setting

                              func SpiderLogError

                              func SpiderLogError(f *os.File) func(s *Spider)

                                SpiderLogError is a extension logs special or error response

                                func SpiderLogPrint

                                func SpiderLogPrint() func(s *Spider)

                                  SpiderLogPrint is a extension print spider working status

                                  Types

                                  type BaseDownloader

                                  type BaseDownloader struct {
                                  	Client *http.Client
                                  	// contains filtered or unexported fields
                                  }

                                    BaseDownloader is default downloader of goribot

                                    func NewBaseDownloader

                                    func NewBaseDownloader() *BaseDownloader

                                    func (*BaseDownloader) AddMiddleware

                                    func (s *BaseDownloader) AddMiddleware(fn func(req *Request, next func(*Request) (*Response, error)) (*Response, error))

                                    func (*BaseDownloader) Do

                                    func (s *BaseDownloader) Do(req *Request) (resp *Response, err error)

                                    type BaseScheduler

                                    type BaseScheduler struct {
                                    
                                    	// DepthFirst sets push new tasks to the top of the queue
                                    	DepthFirst bool
                                    	// contains filtered or unexported fields
                                    }

                                      BaseScheduler is the default scheduler of goribot.

                                      func NewBaseScheduler

                                      func NewBaseScheduler(depthFirst bool) *BaseScheduler

                                      func (*BaseScheduler) AddItem

                                      func (s *BaseScheduler) AddItem(i interface{})

                                      func (*BaseScheduler) AddTask

                                      func (s *BaseScheduler) AddTask(t *Task)

                                      func (*BaseScheduler) GetItem

                                      func (s *BaseScheduler) GetItem() interface{}

                                      func (*BaseScheduler) GetTask

                                      func (s *BaseScheduler) GetTask() *Task

                                      func (*BaseScheduler) IsItemEmpty

                                      func (s *BaseScheduler) IsItemEmpty() bool

                                      func (*BaseScheduler) IsTaskEmpty

                                      func (s *BaseScheduler) IsTaskEmpty() bool

                                      type Context

                                      type Context struct {
                                      	// Req is the origin request
                                      	Req *Request
                                      	// Resp is the response object
                                      	Resp *Response
                                      
                                      	// Meta the request task created by NewTaskWithMeta func will have a k-y pair
                                      	Meta map[string]interface{}
                                      
                                      	Handlers []CtxHandlerFun
                                      	// contains filtered or unexported fields
                                      }

                                        Context is a wrapper around the response, the original request, new tasks, etc.

                                        func (*Context) Abort

                                        func (c *Context) Abort()

                                          Abort this context to break the handler chain and stop handling

                                          func (*Context) AddItem

                                          func (c *Context) AddItem(i interface{})

                                            AddItem add an item to new item list. After every handler func return, spider will collect these items and call OnItem handler func

                                            func (*Context) AddTask

                                            func (c *Context) AddTask(request *Request, handlers ...CtxHandlerFun)

                                              AddTask add a task to new task list. After every handler func return,spider will collect these tasks

                                              func (*Context) IsAborted

                                              func (c *Context) IsAborted() bool

                                                IsAborted reports whether the context was aborted.

                                                type CsvItem

                                                type CsvItem []string

                                                type CtxHandlerFun

                                                type CtxHandlerFun func(ctx *Context)

                                                type Downloader

                                                type Downloader interface {
                                                	Do(req *Request) (resp *Response, err error)
                                                	AddMiddleware(func(req *Request, next func(req *Request) (resp *Response, err error)) (resp *Response, err error))
                                                }

                                                  Downloader downloads the response for a request.

                                                  type DownloaderErr

                                                  type DownloaderErr struct {
                                                  
                                                  	// Request is the Request object when the error occurred
                                                  	Request *Request
                                                  	// Response is the Response object when the error occurred. It could be nil.
                                                  	Response *Response
                                                  	// contains filtered or unexported fields
                                                  }

                                                    DownloaderErr is an error created by the Downloader.

                                                    type ErrorItem

                                                    type ErrorItem struct {
                                                    	Ctx *Context
                                                    	Msg string
                                                    }

                                                    type JsonItem

                                                    type JsonItem struct {
                                                    	Data interface{}
                                                    }

                                                    type LimitRule

                                                    type LimitRule struct {
                                                    	Regexp, Glob string
                                                    	Allow        LimitRuleAllow
                                                    	Parallelism  int64
                                                    
                                                    	Rate int64
                                                    
                                                    	Delay       time.Duration
                                                    	RandomDelay time.Duration
                                                    	MaxReq      int64
                                                    
                                                    	MaxDepth int64
                                                    	// contains filtered or unexported fields
                                                    }

                                                    func (*LimitRule) Match

                                                    func (s *LimitRule) Match(u *url.URL) bool

                                                    type LimitRuleAllow

                                                    type LimitRuleAllow uint8
                                                    const (
                                                    	NotSet LimitRuleAllow = iota
                                                    	Allow
                                                    	Disallow
                                                    )

                                                    type Manager

                                                    type Manager struct {
                                                    	// contains filtered or unexported fields
                                                    }

                                                    func NewManager

                                                    func NewManager(redis *redis.Client, sName string) *Manager

                                                    func (*Manager) GetItem

                                                    func (s *Manager) GetItem() interface{}

                                                    func (*Manager) OnItem

                                                    func (s *Manager) OnItem(fn func(i interface{}) interface{})

                                                    func (*Manager) Run

                                                    func (s *Manager) Run()

                                                    func (*Manager) SendReq

                                                    func (s *Manager) SendReq(req *Request)

                                                    func (*Manager) SetItemPoolSize

                                                    func (s *Manager) SetItemPoolSize(i int)

                                                    type RedisScheduler

                                                    type RedisScheduler struct {
                                                    	// contains filtered or unexported fields
                                                    }

                                                      RedisScheduler is a scheduler of goribot that uses Redis (see NewRedisScheduler); the default scheduler is BaseScheduler.

                                                      func NewRedisScheduler

                                                      func NewRedisScheduler(redis *redis.Client, sName string, bs int, fn ...CtxHandlerFun) *RedisScheduler

                                                      func (*RedisScheduler) AddItem

                                                      func (s *RedisScheduler) AddItem(i interface{})

                                                      func (*RedisScheduler) AddTask

                                                      func (s *RedisScheduler) AddTask(t *Task)

                                                      func (*RedisScheduler) GetItem

                                                      func (s *RedisScheduler) GetItem() interface{}

                                                      func (*RedisScheduler) GetTask

                                                      func (s *RedisScheduler) GetTask() *Task

                                                      func (*RedisScheduler) IsItemEmpty

                                                      func (s *RedisScheduler) IsItemEmpty() bool

                                                      func (*RedisScheduler) IsTaskEmpty

                                                      func (s *RedisScheduler) IsTaskEmpty() bool

                                                      type Request

                                                      type Request struct {
                                                      	*http.Request
                                                      	Depth int
                                                      	// ResponseCharacterEncoding is the character encoding of the response body.
                                                      	// Leave it blank to allow automatic character encoding of the response body.
                                                      	// It is empty by default and it can be set in OnRequest callback.
                                                      	ResponseCharacterEncoding string
                                                      	// ProxyURL is the proxy address that handles the request
                                                      	ProxyURL string
                                                      	// Meta contains data between a Request and a Response
                                                      	Meta map[string]interface{}
                                                      	Err  error
                                                      	// contains filtered or unexported fields
                                                      }

                                                        Request is an object representing an HTTP request.

                                                        func Get

                                                        func Get(urladdr string) *Request

                                                          Get creates a get request

                                                          func Post

                                                          func Post(urladdr string, body io.Reader) *Request

                                                            Post creates a post request

                                                            func PostFormReq

                                                            func PostFormReq(urladdr string, requestData map[string]string) *Request

                                                              PostFormReq creates a post request with form data

                                                              func PostJsonReq

                                                              func PostJsonReq(urladdr string, requestData interface{}) *Request

                                                                PostJsonReq creates a post request with json data

                                                                func PostRawReq

                                                                func PostRawReq(urladdr string, body []byte) *Request

                                                                  PostRawReq creates a post request with raw data

                                                                  func (*Request) AddCookie

                                                                  func (s *Request) AddCookie(c *http.Cookie) *Request

                                                                    AddCookie adds a cookie to the request.

                                                                    func (*Request) AddParam

                                                                    func (s *Request) AddParam(k, v string) *Request

                                                                      AddParam adds a query param of request url.

                                                                      func (*Request) GetBody

                                                                      func (s *Request) GetBody() []byte

                                                                        GetBody returns the body as bytes of request

                                                                        func (*Request) SetHeader

                                                                        func (s *Request) SetHeader(key, value string) *Request

                                                                          SetHeader sets the header entries associated with key to the single element value.

                                                                          func (*Request) SetParam

                                                                          func (s *Request) SetParam(p map[string]string) *Request

                                                                            SetParam sets the query params of the request URL. Deprecated: will be removed in the next major version.

                                                                            func (*Request) SetProxy

                                                                            func (s *Request) SetProxy(p string) *Request

                                                                              SetProxy sets proxy url of request.

                                                                              func (*Request) SetUA

                                                                              func (s *Request) SetUA(ua string) *Request

                                                                                SetUA sets the User-Agent of the request header.

                                                                                func (*Request) WithMeta

                                                                                func (s *Request) WithMeta(k string, v interface{}) *Request

                                                                                  WithMeta sets a key-value pair in the meta data of the request.

                                                                                  type Response

                                                                                  type Response struct {
                                                                                  	*http.Response
                                                                                  	// Body is the content of the Response
                                                                                  	Body []byte
                                                                                  	// Text is the content of the Response parsed as string
                                                                                  	Text string
                                                                                  	// Req is the goribot Request of this response. Tip: there is another Request attribute that comes from the embedded *http.Response.
                                                                                  	Req *Request
                                                                                  	// Dom is the parsed html object
                                                                                  	Dom *goquery.Document
                                                                                  	// Meta contains data between a Request and a Response
                                                                                  	Meta map[string]interface{}
                                                                                  }

                                                                                    Response is an object representing an HTTP response.

                                                                                    func (*Response) DecodeAndParse

                                                                                    func (s *Response) DecodeAndParse() error

                                                                                      DecodeAndParse decodes the body to text and tries to parse it as HTML or JSON.

                                                                                      func (*Response) IsHTML

                                                                                      func (s *Response) IsHTML() bool

                                                                                      func (*Response) IsJSON

                                                                                      func (s *Response) IsJSON() bool

                                                                                      func (*Response) Json

                                                                                      func (s *Response) Json(q string) gjson.Result

                                                                                        Json returns json result parsed from response

                                                                                        type Scheduler

                                                                                        type Scheduler interface {
                                                                                        	// GetTask pops a task
                                                                                        	GetTask() *Task
                                                                                        	// GetItem pops a item
                                                                                        	GetItem() interface{}
                                                                                        
                                                                                        	// AddTask push a task
                                                                                        	AddTask(t *Task)
                                                                                        	// AddItem push a item
                                                                                        	AddItem(i interface{})
                                                                                        
                                                                                        	// IsTaskEmpty returns is tasks queue empty
                                                                                        	IsTaskEmpty() bool
                                                                                        	// IsItemEmpty returns is items queue empty
                                                                                        	IsItemEmpty() bool
                                                                                        }

                                                                                          Scheduler is a queue of tasks and items

                                                                                          type Spider

                                                                                          type Spider struct {
                                                                                          	Scheduler  Scheduler
                                                                                          	Downloader Downloader
                                                                                          	AutoStop   bool
                                                                                          	// contains filtered or unexported fields
                                                                                          }

                                                                                          func NewSpider

                                                                                          func NewSpider(exts ...func(s *Spider)) *Spider

                                                                                          func (*Spider) AddTask

                                                                                          func (s *Spider) AddTask(request *Request, handlers ...CtxHandlerFun)

                                                                                          func (*Spider) OnAdd

                                                                                          func (s *Spider) OnAdd(fn func(ctx *Context, t *Task) *Task)

                                                                                            ***********************************************************************************

                                                                                            func (*Spider) OnError

                                                                                            func (s *Spider) OnError(fn func(ctx *Context, err error))

                                                                                              ***********************************************************************************

                                                                                              func (*Spider) OnFinish

                                                                                              func (s *Spider) OnFinish(fn func(s *Spider))

                                                                                                ***********************************************************************************

                                                                                                func (*Spider) OnHTML

                                                                                                func (s *Spider) OnHTML(selector string, fn func(ctx *Context, sel *goquery.Selection))

                                                                                                func (*Spider) OnItem

                                                                                                func (s *Spider) OnItem(fn func(i interface{}) interface{})

                                                                                                  ***********************************************************************************

                                                                                                  func (*Spider) OnJSON

                                                                                                  func (s *Spider) OnJSON(q string, fn func(ctx *Context, j gjson.Result))

                                                                                                  func (*Spider) OnReq

                                                                                                  func (s *Spider) OnReq(fn func(ctx *Context, req *Request) *Request)

                                                                                                    ***********************************************************************************

                                                                                                    func (*Spider) OnResp

                                                                                                    func (s *Spider) OnResp(fn CtxHandlerFun)

                                                                                                      ***********************************************************************************

                                                                                                      func (*Spider) OnStart

                                                                                                      func (s *Spider) OnStart(fn func(s *Spider))

                                                                                                        ***********************************************************************************

                                                                                                        func (*Spider) Run

                                                                                                        func (s *Spider) Run()

                                                                                                        func (*Spider) SetItemPoolSize

                                                                                                        func (s *Spider) SetItemPoolSize(i int)

                                                                                                        func (*Spider) SetTaskPoolSize

                                                                                                        func (s *Spider) SetTaskPoolSize(i int)

                                                                                                        func (*Spider) Use

                                                                                                        func (s *Spider) Use(fn ...func(s *Spider))

                                                                                                        type Task

                                                                                                        type Task struct {
                                                                                                        	Request  *Request
                                                                                                        	Handlers []CtxHandlerFun
                                                                                                        }

                                                                                                        func NewTask

                                                                                                        func NewTask(request *Request, handlers ...CtxHandlerFun) *Task

                                                                                                        Directories

                                                                                                        Path Synopsis