Documentation
¶
Index ¶
- Variables
- func AbsoluteURL(uri string, baseURL string) (string, error)
- func HTML2Mkd(body []byte, sel string) (*string, error)
- func HostFromURL(uri string) string
- func NoEcho(...any)
- func ParseURL(uri string) (*url.URL, error)
- func SelectionToMd(sel *goquery.Selection) *string
- func ShortenURL(uu *url.URL) string
- func TimeTrack(start time.Time, skip int)
- func TruncateString(str string, maxLen int) string
- func URL2Str(uu *url.URL) string
- type Context
- func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{}
- func (c *Context) Get(key string) string
- func (c *Context) GetAny(key string) interface{}
- func (c *Context) MarshalBinary() (_ []byte, _ error)
- func (c *Context) Put(key string, value interface{})
- func (c *Context) UnmarshalBinary(_ []byte) error
- type Crawler
- func (c *Crawler) ClosePage(err error) error
- func (c *Crawler) Echo(err error)
- func (c *Crawler) EnsureVisit(url string, opts ...VisitOptionFunc)
- func (c *Crawler) GoBack(err error) error
- func (c *Crawler) LogTimeSpent(start time.Time)
- func (c *Crawler) MustSetStorage(s storage.Storage)
- func (c *Crawler) OnHTML(sel string, cbFn HTMLCallback)
- func (c *Crawler) OnHTMLDetach(selector string)
- func (c *Crawler) OnPaging(selector string, f SerpCallback)
- func (c *Crawler) OnResponse(f ResponseCallback)
- func (c *Crawler) Pie(err error)
- func (c *Crawler) SetStorage(s storage.Storage) error
- func (c *Crawler) String() string
- func (c *Crawler) Visit(url string, opts ...VisitOptionFunc) error
- type CrawlerOption
- func AllowedDomains(domains ...string) CrawlerOption
- func DebugDetailStep(b bool) CrawlerOption
- func DebugEachRequest(b bool) CrawlerOption
- func DebugVerifier(b bool) CrawlerOption
- func Headless(b bool) CrawlerOption
- func IgnoredErrors(errs ...error) CrawlerOption
- func MaxDepth(i int) CrawlerOption
- func StableDiff(f float64) CrawlerOption
- func TrackTime(b bool) CrawlerOption
- func UIDGen(fn UIDGenerator) CrawlerOption
- func WithLogger(l *zap.Logger) CrawlerOption
- type ErrHandler
- type HTMLCallback
- type HTMLElement
- func (h *HTMLElement) Attr(k string) string
- func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
- func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
- func (h *HTMLElement) ChildText(goquerySelector string) string
- func (h *HTMLElement) ChildTexts(goquerySelector string) []string
- type Request
- type Response
- type ResponseCallback
- type SerpCallback
- type SerpElement
- type UIDGenerator
- type VisitOptionFunc
- type VisitOptions
- type WeenyParser
Constants ¶
This section is empty.
Variables ¶
var ( // errors from bot ErrNoElemFound = errors.New("no element found") ErrNotInteractable = errors.New("elem not interactable") )
var ( // errors of control logics ErrForbiddenDomain = errors.New("forbidden domain") ErrMaxDepth = errors.New("max depth limit reached") ErrVisited = errors.New("url already visited") ErrURLInvalid = errors.New("url invalid error") )
var PresetUnloggedErrors = []error{ ErrVisited, ErrURLInvalid, ErrNoElemFound, ErrMaxDepth, ErrForbiddenDomain, ErrNotInteractable, }
Functions ¶
func SelectionToMd ¶
func ShortenURL ¶
func TruncateString ¶
Types ¶
type Context ¶
type Context struct {
// contains filtered or unexported fields
}
Context provides a tiny layer for passing data between callbacks.
func (*Context) Get ¶
Get retrieves a string value from Context. Get returns an empty string if the key is not found.
func (*Context) MarshalBinary ¶
MarshalBinary encodes Context value. This function is used by request caching.
func (*Context) UnmarshalBinary ¶
UnmarshalBinary decodes Context value to nil. This function is used by request caching.
type Crawler ¶
func NewCrawler ¶
func NewCrawler(options ...CrawlerOption) *Crawler
func NewCrawlerMuted ¶
func NewCrawlerMuted(options ...CrawlerOption) *Crawler
func (*Crawler) EnsureVisit ¶
func (c *Crawler) EnsureVisit(url string, opts ...VisitOptionFunc)
func (*Crawler) LogTimeSpent ¶
func (*Crawler) MustSetStorage ¶
func (*Crawler) OnHTML ¶
func (c *Crawler) OnHTML(sel string, cbFn HTMLCallback)
OnHTML registers a function. Function will be executed on every HTML element matched by the GoQuery Selector parameter. GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery
func (*Crawler) OnHTMLDetach ¶
OnHTMLDetach deregisters a function. The function will not be executed after it is detached.
func (*Crawler) OnPaging ¶
func (c *Crawler) OnPaging(selector string, f SerpCallback)
func (*Crawler) OnResponse ¶
func (c *Crawler) OnResponse(f ResponseCallback)
OnResponse registers a function. Function will be executed on every response
type CrawlerOption ¶
type CrawlerOption func(*Crawler)
func AllowedDomains ¶
func AllowedDomains(domains ...string) CrawlerOption
func DebugDetailStep ¶
func DebugDetailStep(b bool) CrawlerOption
func DebugEachRequest ¶
func DebugEachRequest(b bool) CrawlerOption
func DebugVerifier ¶
func DebugVerifier(b bool) CrawlerOption
func Headless ¶
func Headless(b bool) CrawlerOption
func IgnoredErrors ¶
func IgnoredErrors(errs ...error) CrawlerOption
func MaxDepth ¶
func MaxDepth(i int) CrawlerOption
func StableDiff ¶
func StableDiff(f float64) CrawlerOption
func TrackTime ¶
func TrackTime(b bool) CrawlerOption
func UIDGen ¶
func UIDGen(fn UIDGenerator) CrawlerOption
func WithLogger ¶
func WithLogger(l *zap.Logger) CrawlerOption
type ErrHandler ¶
type HTMLCallback ¶
type HTMLCallback func(e *HTMLElement)
type HTMLElement ¶
type HTMLElement struct { // Name is the name of the tag Name string Text string // Request is the request object of the element's HTML document Request *Request // Response is the Response object of the element's HTML document Response *Response // DOM is the goquery parsed DOM object of the page. DOM is relative // to the current HTMLElement DOM *goquery.Selection // GlbIndex stores the position of the current element within all the elements matched by an OnHTML callback GlbIndex int ElemSelector string ElemIndex int // contains filtered or unexported fields }
HTMLElement is the representation of an HTML tag.
func NewHTMLElementFromSelectionNode ¶
func NewHTMLElementFromSelectionNode(resp *Response, gqSel *goquery.Selection, node *html.Node, glbCbIndex int, elemSel string, elemIndex int) *HTMLElement
NewHTMLElementFromSelectionNode creates an HTMLElement from a goquery.Selection Node.
func (*HTMLElement) Attr ¶
func (h *HTMLElement) Attr(k string) string
Attr returns the selected attribute of a HTMLElement or empty string if no attribute found
func (*HTMLElement) ChildAttr ¶
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
ChildAttr returns the stripped text content of the first matching element's attribute.
func (*HTMLElement) ChildAttrs ¶
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
ChildAttrs returns the stripped text content of all the matching elements' attributes.
func (*HTMLElement) ChildText ¶
func (h *HTMLElement) ChildText(goquerySelector string) string
ChildText returns the concatenated and stripped text content of the matching elements.
func (*HTMLElement) ChildTexts ¶
func (h *HTMLElement) ChildTexts(goquerySelector string) []string
ChildTexts returns the stripped text content of all the matching elements.
type Request ¶
type Request struct { URL *url.URL // ID is the Unique identifier of the request ID uint32 Depth int // Ctx is a context between a Request and a Response Ctx *Context ByGetURL bool Selector string Index int // contains filtered or unexported fields }
func (*Request) AbsoluteURL ¶
AbsoluteURL returns the resolved absolute URL of a URL chunk. AbsoluteURL returns an empty string if the URL chunk is a fragment or could not be parsed.
func (*Request) Visit ¶
func (r *Request) Visit(url string, opts ...VisitOptionFunc) error
Visit continues Collector's collecting job by creating a request and preserves the Context of the previous request. Visit also calls the previously provided callbacks
func (*Request) VisitElem ¶
func (r *Request) VisitElem(opts ...VisitOptionFunc) error
type Response ¶
type ResponseCallback ¶
type ResponseCallback func(*Response)
ResponseCallback is a type alias for OnResponse callback functions
type SerpCallback ¶
type SerpCallback func(e *SerpElement)
type SerpElement ¶
type SerpElement struct { Request *Request Bot *wee.Bot Selector string Index int Element *rod.Element }
func NewSerpElement ¶
func (*SerpElement) Attr ¶
func (e *SerpElement) Attr(k string) string
func (*SerpElement) Link ¶
func (e *SerpElement) Link(attrs ...string) string
Link is an alias of Attr for the first match of "src/href".
func (*SerpElement) Target ¶
func (e *SerpElement) Target() string
func (*SerpElement) Text ¶
func (e *SerpElement) Text() string
type UIDGenerator ¶
type VisitOptionFunc ¶
type VisitOptionFunc func(o *VisitOptions)
func Elem ¶
func Elem(elem *rod.Element) VisitOptionFunc
func OnVisitEnd ¶
func OnVisitEnd(fn ErrHandler) VisitOptionFunc
func OpenInTab ¶
func OpenInTab(b bool) VisitOptionFunc
OpenInTab marks the request to open a link in a new tab.
func WithURL ¶
func WithURL(u string) VisitOptionFunc
type VisitOptions ¶
type VisitOptions struct {
// contains filtered or unexported fields
}
type WeenyParser ¶
type WeenyParser struct {
*xparse.HTMLParser
}
func NewParser ¶
func NewParser(raw, cfg []byte) *WeenyParser
func (*WeenyParser) ParseToStruct ¶
func (wp *WeenyParser) ParseToStruct(key string, obj any) error
Source Files
¶
Directories
¶
Path | Synopsis |
---|---|
examples
|
|
15five
command
|
|
basic
command
|
|
colly/basic
command
|
|
colly/max_depth
command
|
|
colly/xszj
command
|
|
huarun
command
|
|
pingan
command
|
|
xszj
command
|
|