crawler

package
v1.2.6-sp7 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 15, 2023 License: AGPL-3.0 Imports: 28 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// ExcludedSuffix lists URL path suffixes (file extensions) the crawler
	// skips: static assets and binary documents that cannot contain new
	// crawlable links. Every entry carries a leading dot so a suffix check
	// matches only a true extension ("ico" without the dot would also match
	// paths like "/pico", and "docx" without the dot would never match a
	// real ".docx" URL).
	ExcludedSuffix = []string{
		".js", ".css",
		".jpg", ".jpeg", ".png",
		".mp3", ".mp4",
		".flv", ".aac", ".ogg",
		".svg", ".ico", ".gif",
		".doc", ".docx", ".pptx",
		".ppt", ".pdf",
	}
	// ExcludedMIME lists response Content-Type glob patterns the crawler
	// skips; wildcards ("image/*", "*zip") are matched as globs, not as
	// literal MIME strings.
	ExcludedMIME = []string{"image/*",
		"audio/*", "video/*", "*octet-stream*",
		"application/ogg", "application/pdf", "application/msword",
		"application/x-ppt", "video/avi", "application/x-ico",
		"*zip",
	}
)
View Source
// Exports maps script-facing names onto the crawler's Go API. The "Start"
// entry launches a crawl of the given url and streams each visited request
// through the returned channel; every other entry exposes a configuration
// option (configOpt constructor) under one or more script aliases.
var Exports = map[string]interface{}{
	"Start": func(url string, opt ...configOpt) (chan *Req, error) {
		results := make(chan *Req)
		// Register a callback that forwards every crawled request into the
		// channel the caller consumes. This must be appended before the
		// crawler is constructed so the option takes effect.
		opt = append(opt, WithOnRequest(func(req *Req) {
			results <- req
		}))

		c, err := NewCrawler(url, opt...)
		if err != nil {
			return nil, utils.Errorf("create crawler failed: %s", err)
		}

		// Run the crawl in the background; closing the channel signals the
		// consumer that the crawl has finished (or aborted on error).
		go func() {
			defer close(results)
			if runErr := c.Run(); runErr != nil {
				log.Error(runErr)
			}
		}()
		return results, nil
	},

	"basicAuth":           WithBasicAuth,
	"bodySize":            WithBodySize,
	"concurrent":          WithConcurrent,
	"connectTimeout":      WithConnectTimeout,
	"timeout":             WithConnectTimeout,
	"domainExclude":       WithDomainBlackList,
	"domainInclude":       WithDomainWhiteList,
	"cookie":              WithFixedCookie,
	"forbiddenFromParent": WithForbiddenFromParent,
	"disallowSuffix":      WithDisallowSuffix,
	"header":              WithHeader,
	"urlExtractor":        WithUrlExtractor,
	"maxDepth":            WithMaxDepth,
	"maxRedirect":         WithMaxRedirectTimes,
	"maxRequest":          WithMaxRequestCount,
	"maxRetry":            WithMaxRetry,
	"maxUrls":             WithMaxUrlCount,
	"proxy":               WithProxy,
	"responseTimeout":     WithResponseTimeout,
	"urlRegexpExclude":    WithUrlRegexpBlackList,
	"urlRegexpInclude":    WithUrlRegexpWhiteList,
	"userAgent":           WithUserAgent,
	"ua":                  WithUserAgent,
	"autoLogin":           WithAutoLogin,
	"RequestsFromFlow":    HandleRequestResult,
}
View Source
// URLPattern extracts URL-like substrings from page content: absolute URLs
// (scheme:// or protocol-relative //), absolute/relative paths, and
// path/file.ext forms with optional query strings. Compiled once at package
// init via MustCompile — the original ignored the Compile error with `_`,
// which would have left URLPattern nil (panicking at first use) had the
// pattern ever been invalid; MustCompile fails loudly at startup instead.
var (
	URLPattern = regexp.MustCompile(`(((?:[a-zA-Z]{1,10}://|//)[^"'/]{1,}\.[a-zA-Z]{2,}[^"']{0,})|((?:/|\.\./|\./)[^"'><,;|*()(%%$^/\\\[\]][^"'><,;|()]{1,})|([a-zA-Z0-9_\-/]{1,}/[a-zA-Z0-9_\-/]{1,}\.(?:[a-zA-Z]{1,4}|action)(?:[\?|/][^"|']{0,}|))|([a-zA-Z0-9_\-]{1,}\.(?:\.{1,10})(?:\?[^"|']{0,}|)))`)
)

Functions

func AbsoluteURL

func AbsoluteURL(u string, base *url.URL) string

func HandleElementForm

func HandleElementForm(dom *goquery.Selection, baseURL *url.URL, guessParams ...func(user, pass string, extra map[string][]string)) (method, requestURL, contentType string, body *bytes.Buffer, err error)

func HandleFormUrlEncoded

func HandleFormUrlEncoded(method string, actionAbsURL string, selects *goquery.Selection, guessParams ...func(username, password string, extra map[string][]string)) (requestURL string, body *bytes.Buffer, contentType string, err error)

func HandleMultipartFormData

func HandleMultipartFormData(selects *goquery.Selection) (body *bytes.Buffer, contentType string, err error)

func HandleRequestResult

func HandleRequestResult(isHttps bool, reqBytes, rspBytes []byte) ([][]byte, error)

func HostToWildcardGlobs

func HostToWildcardGlobs(host string) []glob.Glob

func RoundRobinProxySwitcher

func RoundRobinProxySwitcher(ProxyURLs ...string) (func(r *http.Request) (*url.URL, error), error)

RoundRobinProxySwitcher creates a proxy switcher function which rotates ProxyURLs on every request. The proxy type is determined by the URL scheme. "http", "https" and "socks5" are supported. If the scheme is empty, "http" is assumed.

func WithAutoLogin

func WithAutoLogin(username, password string, flags ...string) configOpt

func WithBasicAuth

func WithBasicAuth(user, pass string) configOpt

func WithBodySize

func WithBodySize(size int) configOpt

func WithConcurrent

func WithConcurrent(concurrent int) configOpt

func WithConnectTimeout

func WithConnectTimeout(f float64) configOpt

func WithDisallowMIMEType

func WithDisallowMIMEType(d []string) configOpt

func WithDisallowSuffix

func WithDisallowSuffix(d []string) configOpt

func WithDomainBlackList

func WithDomainBlackList(domain string) configOpt

func WithDomainWhiteList

func WithDomainWhiteList(domain string) configOpt

func WithExtraSuffixForEveryPath

func WithExtraSuffixForEveryPath(path ...string) configOpt

func WithExtraSuffixForEveryRootPath

func WithExtraSuffixForEveryRootPath(path ...string) configOpt

func WithFixedCookie

func WithFixedCookie(k, v string) configOpt

func WithForbiddenFromParent

func WithForbiddenFromParent(b bool) configOpt

func WithHeader

func WithHeader(k, v string) configOpt

func WithMaxDepth

func WithMaxDepth(depth int) configOpt

func WithMaxRedirectTimes

func WithMaxRedirectTimes(maxRedirectTimes int) configOpt

func WithMaxRequestCount

func WithMaxRequestCount(limit int) configOpt

func WithMaxRetry

func WithMaxRetry(limit int) configOpt

func WithMaxUrlCount

func WithMaxUrlCount(limit int) configOpt

func WithOnRequest

func WithOnRequest(f func(req *Req)) configOpt

func WithProxy

func WithProxy(proxies ...string) configOpt

func WithResponseTimeout

func WithResponseTimeout(f float64) configOpt

func WithUrlExtractor

func WithUrlExtractor(f func(*Req) []interface{}) configOpt

func WithUrlRegexpBlackList

func WithUrlRegexpBlackList(re string) configOpt

func WithUrlRegexpWhiteList

func WithUrlRegexpWhiteList(re string) configOpt

func WithUserAgent

func WithUserAgent(ua string) configOpt

Types

type Config

type Config struct {
	// 基础认证
	BasicAuth    bool
	AuthUsername string
	AuthPassword string
	// contains filtered or unexported fields
}

func (*Config) CheckShouldBeHandledURL

func (c *Config) CheckShouldBeHandledURL(u *url.URL) bool

func (*Config) CreateHTTPClient

func (c *Config) CreateHTTPClient() *http.Client

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(urls string, opts ...configOpt) (*Crawler, error)

func (*Crawler) Run

func (c *Crawler) Run() error

type Req

type Req struct {
	// contains filtered or unexported fields
}

func (*Req) AbsoluteURL

func (r *Req) AbsoluteURL(u string) string

func (*Req) Hash

func (r *Req) Hash() string

func (*Req) IsForm

func (r *Req) IsForm() bool

func (*Req) IsHttps

func (r *Req) IsHttps() bool

func (*Req) IsLoginForm

func (r *Req) IsLoginForm() bool

func (*Req) IsUploadForm

func (r *Req) IsUploadForm() bool

func (*Req) Request

func (r *Req) Request() *http.Request

func (*Req) RequestRaw

func (r *Req) RequestRaw() []byte

func (*Req) Response

func (r *Req) Response() (*http.Response, error)

func (*Req) ResponseBody

func (r *Req) ResponseBody() []byte

func (*Req) ResponseRaw

func (r *Req) ResponseRaw() []byte

func (*Req) SameWildcardOrigin

func (r *Req) SameWildcardOrigin(s *Req) bool

func (*Req) Url

func (r *Req) Url() string

type RequestIf

type RequestIf interface {
	Url() string
	Request() *http.Request
	ResponseBody() []byte
	Response() (*http.Response, error)
	IsHttps() bool
	ResponseRaw() []byte
	RequestRaw() []byte
}

type Result

type Result struct {
	FoundUrls []string
	Requests  []*Req
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL