README

Surfer GitHub release report card github issues github closed issues GoDoc view Go大数据

Package surfer is a high-level concurrent HTTP client. It provides surf and phantom download engines, closely simulates browser behavior, supports simulated login, and so on.

简体中文

Features

  • Both surf and phantomjs engines are supported
  • Support random User-Agent
  • Support cache cookie
  • Support http/https

Usage

package main

import (
    "github.com/henrylee2cn/surfer"
    "io/ioutil"
    "log"
)

// main demonstrates both download engines: it fetches one page with the
// default surf engine and one with the phantomjs engine, logging each body.
func main() {
    // Use surf engine
    resp, err := surfer.Download(&surfer.Request{
        Url: "http://github.com/henrylee2cn/surfer",
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err := ioutil.ReadAll(resp.Body)
    // Close this body now: resp is reassigned below, so leaving it open
    // would leak the underlying connection.
    resp.Body.Close()
    log.Println(string(b), err)

    // Use phantomjs engine
    surfer.SetPhantomJsFilePath("Path to phantomjs.exe")
    resp, err = surfer.Download(&surfer.Request{
        Url:          "http://github.com/henrylee2cn",
        DownloaderID: 1,
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err = ioutil.ReadAll(resp.Body)
    resp.Body.Close()
    log.Println(string(b), err)

    // Remove the temporary js files created for the phantomjs engine.
    surfer.DestroyJsFiles()
}

Full example

License

Surfer is under Apache v2 License. See the LICENSE file for the full license text.

Documentation

Overview

    Copyright 2015 henrylee2cn Author. All Rights Reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0
    

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

    Package surfer is a high-level concurrent HTTP client.

    It has `surf` and `phantom` download engines, closely simulates browser behavior, supports simulated login, and so on.

    Features: - Both surf and phantomjs engines are supported - Support random User-Agent - Support cache cookie - Support http/https

    Usage: package main

    import (

    "github.com/henrylee2cn/surfer"
    "io/ioutil"
    "log"
    

    )

    func main() {

    // Use surf engine
    resp, err := surfer.Download(&surfer.Request{
        Url: "http://github.com/henrylee2cn/surfer",
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err := ioutil.ReadAll(resp.Body)
    log.Println(string(b), err)
    
    // Use phantomjs engine
    resp, err = surfer.Download(&surfer.Request{
        Url:          "http://github.com/henrylee2cn",
        DownloaderID: 1,
    })
    if err != nil {
        log.Fatal(err)
    }
    b, err = ioutil.ReadAll(resp.Body)
    log.Println(string(b), err)
    resp.Body.Close()
    surfer.DestroyJsFiles()
    

    }

    Index

    Constants

    View Source
    const (
    	// Windows operating system.
    	Windows int = iota
    	// Linux based operating system.
    	Linux
    	// Macintosh /OS X operating system.
    	Macintosh
    )
    View Source
    // Downloader identifiers and default request settings.
    const (
    	SurfID             = 0               // Surf downloader identifier
    	PhomtomJsID        = 1               // PhantomJS downloader identifier
    	DefaultMethod      = "GET"           // default request method
    	DefaultDialTimeout = 2 * time.Minute // default timeout for requesting the server
    	DefaultConnTimeout = 2 * time.Minute // default download timeout
    	DefaultTryTimes    = 3               // default maximum number of download attempts
    	DefaultRetryPause  = 2 * time.Second // default pause before re-downloading
    )

      constant

      Variables

      View Source
      var Database = UATable{
      	"chrome": {
      		"37.0.2049.0",
      		Windows,
      		Formats{
      			"37": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      			"36": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      			"35": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      			"34": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      			"33": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      			"32": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      			"31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      			"30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}) Chrome/{{.Ver}} Safari/537.36",
      		},
      	},
      	"firefox": {
      		"31.0",
      		Windows,
      		Formats{
      			"31": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:31.0) Gecko/20100101 Firefox/{{.Ver}}",
      			"30": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:30.0) Gecko/20120101 Firefox/{{.Ver}}",
      			"29": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:29.0) Gecko/20120101 Firefox/{{.Ver}}",
      			"28": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:28.0) Gecko/20100101 Firefox/{{.Ver}}",
      			"27": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:27.0) Gecko/20130101 Firefox/{{.Ver}}",
      			"26": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:26.0) Gecko/20121011 Firefox/{{.Ver}}",
      			"25": "Mozilla/5.0 ({{.OSN}} {{.OSV}}{{.Coms}}; rv:25.0) Gecko/20100101 Firefox/{{.Ver}}",
      		},
      	},
      	"msie": {
      		"10.0",
      		Windows,
      		Formats{
      			"10": "Mozilla/5.0 (compatible; MSIE 10.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.5.30729)",
      			"9":  "Mozilla/5.0 (compatible; MSIE 9.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/5.0; .NET CLR 3.0.30729)",
      			"8":  "Mozilla/5.0 (compatible; MSIE 8.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}Trident/4.0; .NET CLR 3.0.04320)",
      			"7":  "Mozilla/4.0 (compatible; MSIE 7.0; {{.OSN}} {{.OSV}}{{if .Coms}}{{.Coms}}; {{end}}.NET CLR 2.0.50727)",
      		},
      	},
      	"opera": {
      		"12.14",
      		Windows,
      		Formats{
      			"12": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.9.181 Version/{{.Ver}}",
      			"11": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.7.62 Version/{{.Ver}}",
      			"10": "Opera/9.80 ({{.OSN}} {{.OSV}}; U{{.Coms}}) Presto/2.2.15 Version/{{.Ver}}",
      			"9":  "Opera/9.00 ({{.OSN}} {{.OSV}}; U{{.Coms}})",
      		},
      	},
      	"safari": {
      		"6.0",
      		Macintosh,
      		Formats{
      			"6": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/536.26 (KHTML, like Gecko) Version/{{.Ver}} Safari/8536.25",
      			"5": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/531.2+ (KHTML, like Gecko) Version/{{.Ver}} Safari/531.2+",
      			"4": "Mozilla/5.0 (Macintosh; {{.OSN}} {{.OSV}}{{.Coms}}) AppleWebKit/528.16 (KHTML, like Gecko) Version/{{.Ver}} Safari/528.16",
      		},
      	},
      	"itunes": {
      		"9.1.1",
      		Macintosh,
      		Formats{
      			"9": "iTunes/{{.Ver}}",
      			"8": "iTunes/{{.Ver}}",
      			"7": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.7{{.Coms}})",
      			"6": "iTunes/{{.Ver}} (Macintosh; U; PPC Mac OS X 10.4.5{{.Coms}})",
      		},
      	},
      	"aol": {
      		"9.7",
      		Windows,
      		Formats{
      			"9": "Mozilla/5.0 (compatible; MSIE 9.0; AOL {{.Ver}}; AOLBuild 4343.19; {{.OSN}} {{.OSV}}; WOW64; Trident/5.0; FunWebProducts{{.Coms}})",
      			"8": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0.50727{{.Coms}})",
      			"7": "Mozilla/4.0 (compatible; MSIE 7.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}; FunWebProducts{{.Coms}})",
      			"6": "Mozilla/4.0 (compatible; MSIE 6.0; AOL {{.Ver}}; {{.OSN}} {{.OSV}}{{.Coms}})",
      		},
      	},
      	"konqueror": {
      		"4.9",
      		Linux,
      		Formats{
      			"4": "Mozilla/5.0 (compatible; Konqueror/4.0; {{.OSN}}{{.Coms}}) KHTML/4.0.3 (like Gecko)",
      			"3": "Mozilla/5.0 (compatible; Konqueror/3.0-rc6; i686 {{.OSN}}; 20021127{{.Coms}})",
      			"2": "Mozilla/5.0 (compatible; Konqueror/2.1.1; {{.OSN}}{{.Coms}})",
      		},
      	},
      	"netscape": {
      		"9.1.0285",
      		Windows,
      		Formats{
      			"9": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.9.2.4{{.Coms}}) Gecko/20070321 Netscape/{{.Ver}}",
      			"8": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.7.5{{.Coms}}) Gecko/20050519 Netscape/{{.Ver}}",
      			"7": "Mozilla/5.0 ({{.OSN}}; U; {{.OSN}} {{.OSV}}; rv:1.0.1{{.Coms}}) Gecko/20020921 Netscape/{{.Ver}}",
      		},
      	},
      	"lynx": {
      		"2.8.8dev.3",
      		Linux,
      		Formats{
      			"2": "Lynx/{{.Ver}} libwww-FM/2.14 SSL-MM/1.4.1",
      			"1": "Lynx (textmode)",
      		},
      	},
      	"googlebot": {
      		"2.1",
      		Linux,
      		Formats{
      			"2": "Mozilla/5.0 (compatible; Googlebot/{{.Ver}}; +http://www.google.com/bot.html{{.Coms}})",
      			"1": "Googlebot/{{.Ver}} (+http://www.google.com/bot.html{{.Coms}})",
      		},
      	},
      	"bingbot": {
      		"2.0",
      		Windows,
      		Formats{
      			"2": "Mozilla/5.0 (compatible; bingbot/{{.Ver}}; +http://www.bing.com/bingbot.htm{{.Coms}})",
      		},
      	},
      	"yahoobot": {
      		"2.0",
      		Linux,
      		Formats{
      			"2": "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp{{.Coms}})",
      		},
      	},
      	"default": {
      		"1.0",
      		Linux,
      		Formats{
      			"1": "{{.Name}}/{{.Ver}} ({{.OSN}} {{.OSV}}{{.Coms}})",
      		},
      	},
      }

        Database is the "database" of user agents.

        View Source
        var DefaultOSAttributes = map[int]OSAttributes{
        	Windows:   {"Windows NT", "6.3", []string{"x64"}},
        	Linux:     {"Linux", "3.16.1", []string{"x64"}},
        	Macintosh: {"Intel Mac OS X", "10_6_8", []string{}},
        }

          DefaultOSAttributes stores default OS attributes.

          View Source
          var UserAgents = map[string][]string{}

            UserAgents all User-Agent

            Functions

            func AutoToUTF8

            func AutoToUTF8(resp *http.Response) error

              AutoToUTF8 采用surf内核下载时,可以尝试自动转码为utf8 采用phantomjs内核时,无需转码(已是utf8)

              func BodyBytes

              func BodyBytes(resp *http.Response) ([]byte, error)

                BodyBytes 读取完整响应流正文

                func CreateDefault

                func CreateDefault(browser string) string

                  CreateDefault returns a user agent string using default values.

                  func CreateReal

                  func CreateReal() string

                    CreateReal generates and returns a complete user agent string.

                    func CreateVersion

                    func CreateVersion(browser, version string) string

                      CreateVersion generates and returns a complete user agent string for a specific browser version.

                      func DestroyJsFiles

                      func DestroyJsFiles()

                        DestroyJsFiles 销毁Phantomjs的js临时文件

                        func Download

                        func Download(req *Request) (resp *http.Response, err error)

                          Download 实现surfer下载器接口

                          func Format

                          func Format(bname, bver string) string

                            Format returns the format string for the given browser name and version.

                            When a format can't be found for a version, the first format string for the browser is returned. When a format can't be found for the browser the default format is returned.

                            func GetWDPath

                            func GetWDPath() string

                              GetWDPath gets the work directory path.

                              func IsDirExists

                              func IsDirExists(path string) bool

                                IsDirExists judges whether path is a directory or not.

                                func IsFileExists

                                func IsFileExists(path string) bool

                                  IsFileExists judges whether path is a file or not.

                                  func SetPhantomJsFilePath

                                  func SetPhantomJsFilePath(filePath string)

                                    指定phantomjs可执行文件的位置

                                    func TopVersion

                                    func TopVersion(bname string) string

                                      TopVersion returns the most recent version for the given browser name.

                                      func UrlEncode

                                      func UrlEncode(urlStr string) (*url.URL, error)

                                        UrlEncode 返回编码后的url.URL指针、及解析错误

                                        func WalkDir

                                        func WalkDir(targpath string, suffixes ...string) (dirlist []string)

                                          WalkDir 遍历目录,可指定后缀

                                          Types

                                          type Bytes

                                          type Bytes []byte

                                            Bytes bytes type of body content, without content type

                                            func (Bytes) SetBody

                                            func (b Bytes) SetBody(r *Request) error

                                              SetBody sets request body

                                              type Content

                                              type Content struct {
                                              	ContentType string
                                              	Bytes       []byte
                                              }

                                                Content bytes type of body content

                                                func (*Content) SetBody

                                                func (c *Content) SetBody(r *Request) error

                                                  SetBody sets request body

                                                  type Cookie struct {
                                                  	Name   string `json:"name"`
                                                  	Value  string `json:"value"`
                                                  	Domain string `json:"domain"`
                                                  	Path   string `json:"path"`
                                                  }

                                                    给phantomjs传输cookie用

                                                    type DnsCache

                                                    type DnsCache struct {
                                                    	// contains filtered or unexported fields
                                                    }

                                                      DnsCache DNS cache

                                                      func (*DnsCache) Del

                                                      func (d *DnsCache) Del(addr string)

                                                        Del deletes ipPort from DNS cache.

                                                        func (*DnsCache) Query

                                                        func (d *DnsCache) Query(addr string) (string, bool)

                                                          Query queries ipPort from DNS cache.

                                                          func (*DnsCache) Reg

                                                          func (d *DnsCache) Reg(addr, ipPort string)

                                                            Reg registers ipPort to DNS cache.

                                                            type File

                                                            type File struct {
                                                            	Filename string
                                                            	Bytes    []byte
                                                            }

                                                              File post form file

                                                              type Form

                                                              type Form struct {
                                                              	// Values [field name]-[]value
                                                              	Values map[string][]string
                                                              	// Files [field name]-[]File
                                                              	Files map[string][]File
                                                              }

                                                                Form implements the body interface

                                                                func (Form) SetBody

                                                                func (f Form) SetBody(r *Request) error

                                                                  SetBody sets request body

                                                                  type Formats

                                                                  type Formats map[string]string

                                                                    Formats is a collection of UA format strings. key is the browser version. value is the browser info.

                                                                    type JSONObj

                                                                    type JSONObj struct{ Data interface{} }

                                                                      JSONObj JSON type of body content

                                                                      func (*JSONObj) SetBody

                                                                      func (obj *JSONObj) SetBody(r *Request) error

                                                                        SetBody sets request body

                                                                        type OSAttributes

                                                                        type OSAttributes struct {
                                                                        	// OSName is the operating system name.
                                                                        	OSName string
                                                                        	// OSVersion is the operating system version.
                                                                        	OSVersion string
                                                                        	// Comments are additional comments to add to a user agent string.
                                                                        	Comments []string
                                                                        }

                                                                          OSAttributes stores OS attributes.

                                                                          type Phantom

                                                                          // Phantom holds the settings of the PhantomJS-based downloader.
                                                                          type Phantom struct {
                                                                          	PhantomjsFile string // full file name of the PhantomJS executable
                                                                          	TempJsDir     string // directory where temporary js files are stored
                                                                          
                                                                          	CookieJar *cookiejar.Jar
                                                                          	// contains filtered or unexported fields
                                                                          }

                                                                            Phantom 基于Phantomjs的下载器实现,作为surfer的补充 效率较surfer会慢很多,但是因为模拟浏览器,破防性更好 支持UserAgent/TryTimes/RetryPause/自定义js

                                                                            func (*Phantom) DestroyJsFiles

                                                                            func (phantom *Phantom) DestroyJsFiles()

                                                                              DestroyJsFiles 销毁js临时文件

                                                                              func (*Phantom) Download

                                                                              func (phantom *Phantom) Download(req *Request) (resp *http.Response, err error)

                                                                                Download 实现surfer下载器接口

                                                                                type Request

                                                                                // Request describes a single download: target URL, HTTP options,
                                                                                // timeout/retry settings and which downloader to use.
                                                                                type Request struct {
                                                                                	// url (required)
                                                                                	Url string
                                                                                
                                                                                	// GET POST HEAD (defaults to GET)
                                                                                	Method string
                                                                                	// http header
                                                                                	Header http.Header
                                                                                	// whether to use cookies; set via the Spider's EnableCookie
                                                                                	EnableCookie bool
                                                                                	// request body interface
                                                                                	Body body
                                                                                
                                                                                	// dial tcp: i/o timeout
                                                                                	DialTimeout time.Duration
                                                                                	// WSARecv tcp: i/o timeout
                                                                                	ConnTimeout time.Duration
                                                                                	// the max times of download
                                                                                	TryTimes int
                                                                                	// how long pause when retry
                                                                                	RetryPause time.Duration
                                                                                	// max redirect times
                                                                                	// when RedirectTimes equal 0, redirect times is ∞
                                                                                	// when RedirectTimes less than 0, redirect times is 0
                                                                                	RedirectTimes int
                                                                                	// the download ProxyHost
                                                                                	Proxy string
                                                                                
                                                                                	// specifies the downloader ID
                                                                                	// 0 is the Surf high-concurrency downloader, with a full set of control features
                                                                                	// 1 is the PhantomJS downloader: strong at getting past anti-crawler defenses, but slow and low-concurrency
                                                                                	DownloaderID int
                                                                                	// contains filtered or unexported fields
                                                                                }

                                                                                  Request contains the necessary prerequisite information.

                                                                                  func (*Request) ReadBody

                                                                                  func (r *Request) ReadBody() ([]byte, error)

                                                                                    ReadBody returns body bytes

                                                                                    type RespBody

                                                                                    type RespBody struct {
                                                                                    	io.ReadCloser
                                                                                    	io.Reader
                                                                                    }

                                                                                      RespBody 封装Response.Body

                                                                                      func (*RespBody) Read

                                                                                      func (b *RespBody) Read(p []byte) (int, error)

                                                                                        Read 实现Reader接口

                                                                                        type Response

                                                                                        type Response struct {
                                                                                        	Cookies []string
                                                                                        	Body    string
                                                                                        	Error   string
                                                                                        	Header  []struct {
                                                                                        		Name  string
                                                                                        		Value string
                                                                                        	}
                                                                                        }

                                                                                          Response 用于解析Phantomjs的响应内容

                                                                                          type Surf

                                                                                          type Surf struct {
                                                                                          	CookieJar *cookiejar.Jar
                                                                                          }

                                                                                            Surf is the default Download implementation.

                                                                                            func (*Surf) Download

                                                                                            func (surf *Surf) Download(param *Request) (*http.Response, error)

                                                                                              Download 实现surfer下载器接口

                                                                                              type Surfer

                                                                                              type Surfer interface {
                                                                                              	// GET @param url string, header http.Header, cookies []*http.Cookie
                                                                                              	// HEAD @param url string, header http.Header, cookies []*http.Cookie
                                                                                              	// POST PostForm @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
                                                                                              	// POST-M PostMultipart @param url, referer string, values url.Values, header http.Header, cookies []*http.Cookie
                                                                                              	Download(*Request) (resp *http.Response, err error)
                                                                                              }

                                                                                                Surfer represents the core of an HTTP web browser for crawlers.

                                                                                                func New

                                                                                                func New(jar ...*cookiejar.Jar) Surfer

                                                                                                  New 创建一个Surf下载器

                                                                                                  func NewPhantom

                                                                                                  func NewPhantom(phantomjsFile, tempJsDir string, jar ...*cookiejar.Jar) Surfer

                                                                                                    NewPhantom 创建一个Phantomjs下载器

                                                                                                    type TemplateData

                                                                                                    type TemplateData struct {
                                                                                                    	Name string
                                                                                                    	Ver  string
                                                                                                    	OSN  string
                                                                                                    	OSV  string
                                                                                                    	Coms string
                                                                                                    }

                                                                                                      TemplateData structure for template data.

                                                                                                      type UAData

                                                                                                      type UAData struct {
                                                                                                      	TopVersion string
                                                                                                      	DefaultOS  int
                                                                                                      	Formats    Formats
                                                                                                      }

                                                                                                        UAData stores information on a browser user agent.

                                                                                                        type UATable

                                                                                                        type UATable map[string]UAData

                                                                                                          UATable is a collection of UAData values. key is the name of the browser.

                                                                                                          type XMLObj

                                                                                                          type XMLObj struct{ Data interface{} }

                                                                                                            XMLObj XML type of body content

                                                                                                            func (*XMLObj) SetBody

                                                                                                            func (obj *XMLObj) SetBody(r *Request) error

                                                                                                              SetBody sets request body

                                                                                                              Directories

                                                                                                              Path Synopsis
                                                                                                              example