crawl

package module
v0.0.0-...-563cbd9 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 3, 2022 License: Apache-2.0 Imports: 33 Imported by: 0

README

crawl

crawl 基于 urlx,封装一些用于爬虫的有用的函数。

Documentation

Index

Constants

View Source
// HeaderContentType is the "Content-Type" header name and ParamCharset is the
// "charset" parameter name.
// NOTE(review): presumably used together to read the charset out of a
// Content-Type value (see Charset) — confirm against the implementation.
const (
	HeaderContentType = "Content-Type"
	ParamCharset      = "charset"
)
View Source
// Header names for cookies, one per direction.
const (
	HeaderRequestCookie  = "Cookie"     // header name for cookies on a request
	HeaderResponseCookie = "Set-Cookie" // header name for cookies on a response
)
View Source
// HeaderAcceptEncoding is the "Accept-Encoding" header name and
// HeaderContentEncoding is the "Content-Encoding" header name.
const (
	HeaderAcceptEncoding  = "Accept-Encoding"
	HeaderContentEncoding = "Content-Encoding"
)
View Source
// Day is 24 hours expressed as a time.Duration.
const Day = time.Hour * 24

Variables

View Source
var (
	// AcceptAllEncodings is the AcceptEncoding option listing zstd, br, gzip,
	// deflate, snappy and s2.
	AcceptAllEncodings = AcceptEncoding("zstd", "br", "gzip", "deflate", "snappy", "s2")
	// DefaultEncodings is the AcceptEncoding option listing only gzip, deflate
	// and br (a subset of AcceptAllEncodings, not every encoding).
	DefaultEncodings = AcceptEncoding("gzip", "deflate", "br")
)
View Source
// Sentinel errors exported by the package; compare against them with errors.Is.
// NOTE(review): the call sites that return them are not visible in this
// listing — presumably the struct/map binding helpers; confirm.
var (
	ErrValueCannotAddress = errors.New("value can not address")
	ErrValueNotBasicKind  = errors.New("value not a basic kind")
	ErrValueCast          = errors.New("value string cast error")
)
View Source
// Re-exports of urlx identifiers under the same names, so callers can reach
// them through this package without importing urlx directly.
var (
	New            = urlx.New
	AcceptHTML     = urlx.AcceptHTML
	AcceptChinese  = urlx.AcceptChinese
	NoCache        = urlx.NoCache
	UseClient      = urlx.UseClient
	Accept         = urlx.Accept
	AcceptAny      = urlx.AcceptAny
	AcceptJSON     = urlx.AcceptJSON
	AcceptLanguage = urlx.AcceptLanguage
	AcceptXML      = urlx.AcceptXML
	HeaderDel      = urlx.HeaderDel
	HeaderSet      = urlx.HeaderSet
)
View Source
// Preset User-Agent values built with urlx.UserAgent, grouped by platform
// (macOS, Windows, Android). Each one hard-codes a specific browser version
// string.
var (
	MacChromeAgent  = urlx.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36")
	MacFirefoxAgent = urlx.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:65.0) Gecko/20100101 Firefox/65.0")
	MacSafariAgent  = urlx.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.3 Safari/605.1.15")
	MacEdgeAgent    = urlx.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.43")

	WindowsChromeAgent = urlx.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36")
	WindowsEdgeAgent   = urlx.UserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763")
	WindowsIEAgent     = urlx.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko")

	AndroidChromeAgent = urlx.UserAgent("Mozilla/5.0 (Linux; Android 11; SM-G9910) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.59 Mobile Safari/537.36")
	AndroidWebkitAgent = urlx.UserAgent("Mozilla/5.0 (Linux; Android 11; SM-G9910) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30")
	AndroidEdgeAgent   = urlx.UserAgent("Mozilla/5.0 (Linux; Android 11; SM-G9910) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Mobile Safari/537.36 Edge/95.0.1020.55")
)
View Source
// AutoCharset is Charset("auto"). Per the package docs it decodes the response
// to UTF-8, with the source charset determined from the Content-Type.
var AutoCharset = Charset("auto")

AutoCharset 将响应解码成UTF-8

Functions

func AcceptEncoding

func AcceptEncoding(acceptEncodings ...string) urlx.HeaderOption

AcceptEncoding 接受编码

func AddCookieString

func AddCookieString(cookies ...string) urlx.HeaderOption

AddCookieString 添加Cookie到请求

func AddCookies

func AddCookies(cookies ...*http.Cookie) urlx.HeaderOption

AddCookies 添加Cookie到请求

func BindMapField

func BindMapField(doc *goquery.Selection, params map[string]string, field MapField) ([]any, error)

func BindStruct

func BindStruct(sel *goquery.Selection, out any, options StructOptions) error

func Charset

func Charset(charset string) urlx.ProcessMw

Charset 指定响应的编码,auto 或者空则通过 Content-Type 自动判断

func Decompression

func Decompression(next urlx.Process) urlx.Process

Decompression 解压Body

func Dump

func Dump(w io.Writer, reqBody, respBody bool) urlx.ProcessMw

func FormBody

func FormBody(in any) urlx.Body

FormBody 提交Form表单

func JSONBody

func JSONBody(in any) urlx.Body

JSONBody 提交JSON

func ProcessHtml

func ProcessHtml(readHtml func(doc *goquery.Selection) error) urlx.Process

ProcessHtml Html选择器

func ProcessJSON

func ProcessJSON(out any) urlx.Process

ProcessJSON 处理JSON响应

func ProcessMap

func ProcessMap(out any, params map[string]string, options MapField) urlx.Process

ProcessMap Html解析到Map, out must be map[string]any or []map[string]any

func ProcessStruct

func ProcessStruct(rootSelect string, out any, options StructOptions) urlx.Process

ProcessStruct Html解析到Struct

func ProcessXML

func ProcessXML(out any) urlx.Process

ProcessXML 处理xml响应

func ProcessYAML

func ProcessYAML(out any) urlx.Process

ProcessYAML 处理yaml响应

func Proxy

func Proxy(proxy string) urlx.Option

Proxy 使用代理,支持 ss:// ssr:// vmess:// http:// https:// sock5://

func ProxySubscribe

func ProxySubscribe(subscribeUri string) urlx.Option

ProxySubscribe 使用订阅来当做代理池

func ReadCookies

func ReadCookies(read func(cookies []*http.Cookie) error) urlx.ProcessMw

ReadCookies 从响应读取Cookie

func ReplaceTemplate

func ReplaceTemplate(template string, params map[string]string) (s string)

ReplaceTemplate 模板替换

func XMLBody

func XMLBody(in any) urlx.Body

Types

type Duration

// Duration is an int64-backed duration type. Its methods convert to
// time.Duration (Go) and provide JSON, YAML and database/sql
// (Scan/Value) support.
// NOTE(review): presumably counts nanoseconds like time.Duration — confirm.
type Duration int64

func ParseDuration

func ParseDuration(s string) (Duration, error)

ParseDuration parses a duration string. A duration string is a possibly signed sequence of decimal numbers, each with optional fraction and a unit suffix, such as "300ms", "-1.5h" or "2h45m". Valid time units are "ns", "us" (or "µs"), "ms", "s", "m", "h".

func (Duration) Days

func (d Duration) Days() float64

func (Duration) Go

func (d Duration) Go() time.Duration

func (Duration) Int

func (d Duration) Int() int64

func (Duration) MarshalJSON

func (d Duration) MarshalJSON() ([]byte, error)

func (Duration) MarshalYAML

func (d Duration) MarshalYAML() (interface{}, error)

func (*Duration) Scan

func (d *Duration) Scan(value any) (err error)

Scan implements the Scanner interface.

func (Duration) String

func (d Duration) String() string

func (*Duration) UnmarshalJSON

func (d *Duration) UnmarshalJSON(p []byte) error

func (*Duration) UnmarshalYAML

func (d *Duration) UnmarshalYAML(fn func(interface{}) error) error

func (Duration) Value

func (d Duration) Value() (driver.Value, error)

Value implements the driver Valuer interface.

type HeaderOption

// HeaderOption is an alias for urlx.HeaderOption.
type HeaderOption = urlx.HeaderOption

type MapField

// MapField describes one field of the HTML-to-map extraction performed by
// ProcessMap and BindMapField. Fields nests further MapField entries.
type MapField struct {
	Name   string     `json:"name,omitempty" xml:"name,omitempty"`     // field name
	Value  string     `json:"value,omitempty" xml:"value,omitempty"`   // template value
	Select string     `json:"select,omitempty" xml:"select,omitempty"` // selector
	Attr   string     `json:"attr,omitempty" xml:"attr,omitempty"`     // attribute to select
	Format string     `json:"format,omitempty" xml:"format,omitempty"` // formatting
	Find   string     `json:"find,omitempty" xml:"find,omitempty"`     // further search within the result (regular expression)
	Repl   string     `json:"repl,omitempty" xml:"repl,omitempty"`     // replacement applied after the find (regexp replacement expression)
	List   bool       `json:"list,omitempty" xml:"list,omitempty"`     // whether the field is a list
	Split  string     `json:"split,omitempty" xml:"split,omitempty"`   // whether to further split the field value
	Type   string     `json:"type,omitempty" xml:"type,omitempty"`     // type: time, duration, string, int, float, bool; defaults to string
	Fields []MapField `json:"fields,omitempty" xml:"fields,omitempty"` // nested fields
}

type Option

// Option is an alias for urlx.Option.
type Option = urlx.Option

type Request

// Request is an alias for urlx.Request.
type Request = urlx.Request

func AndroidEdge

func AndroidEdge(ctx context.Context) *Request

AndroidEdge Android Edge 浏览器

func Default

func Default(ctx context.Context) *Request

func MacEdge

func MacEdge(ctx context.Context) *Request

MacEdge Mac Edge 浏览器

func NewBrowser

func NewBrowser(ctx context.Context) *Request

NewBrowser 浏览器

func WindowsEdge

func WindowsEdge(ctx context.Context) *Request

WindowsEdge Windows Edge 浏览器

type StructOptions

// StructOptions is the options argument accepted by BindStruct and
// ProcessStruct.
// NOTE(review): each field presumably holds the struct-tag key consulted for
// that purpose, and the trailing comments look like the default keys — confirm
// against the implementation.
type StructOptions struct {
	SelectTag string // select
	AttrTag   string // attr
	FormatTag string // format
	FindTag   string // find
	ReplTag   string // repl
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL