weeny

package module
v0.0.0-...-b6f9473 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 25, 2024 License: GPL-3.0 Imports: 30 Imported by: 0

README

weeny

Documentation

Index

Constants

This section is empty.

Variables

View Source
var (
	// errors from bot
	ErrNoElemFound     = errors.New("no element found")
	ErrNotInteractable = errors.New("elem not interactable")
)
View Source
var (
	// errors of control logics
	ErrForbiddenDomain = errors.New("forbidden domain")
	ErrMaxDepth        = errors.New("max depth limit reached")
	ErrVisited         = errors.New("url already visited")
	ErrURLInvalid      = errors.New("url invalid error")
)

Functions

func AbsoluteURL

func AbsoluteURL(uri string, baseURL string) (string, error)

func HTML2Mkd

func HTML2Mkd(body []byte, sel string) (*string, error)

func HostFromURL

func HostFromURL(uri string) string

HostFromURL extracts the `Host` from a URL.

func NoEcho

func NoEcho(...any)

func ParseURL

func ParseURL(uri string) (*url.URL, error)

func SelectionToMd

func SelectionToMd(sel *goquery.Selection) *string

func ShortenURL

func ShortenURL(uu *url.URL) string

func TimeTrack

func TimeTrack(start time.Time, skip int)

func TruncateString

func TruncateString(str string, maxLen int) string

func URL2Str

func URL2Str(uu *url.URL) string

URL2Str returns url.String(), or "" if url is nil.

Types

type Context

type Context struct {
	// contains filtered or unexported fields
}

Context provides a tiny layer for passing data between callbacks

func NewContext

func NewContext() *Context

NewContext initializes a new Context instance

func (*Context) ForEach

func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{}

ForEach iterates over the Context.

func (*Context) Get

func (c *Context) Get(key string) string

Get retrieves a string value from Context. Get returns an empty string if key not found

func (*Context) GetAny

func (c *Context) GetAny(key string) interface{}

GetAny retrieves a value from Context. GetAny returns nil if key not found

func (*Context) MarshalBinary

func (c *Context) MarshalBinary() (_ []byte, _ error)

MarshalBinary encodes the Context value. This function is used by request caching.

func (*Context) Put

func (c *Context) Put(key string, value interface{})

Put stores a value of any type in Context

func (*Context) UnmarshalBinary

func (c *Context) UnmarshalBinary(_ []byte) error

UnmarshalBinary decodes the Context value to nil. This function is used by request caching.

type Crawler

type Crawler struct {
	ID  uint32
	Bot *wee.Bot
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(options ...CrawlerOption) *Crawler

func NewCrawlerMuted

func NewCrawlerMuted(options ...CrawlerOption) *Crawler

func (*Crawler) ClosePage

func (c *Crawler) ClosePage(err error) error

func (*Crawler) Echo

func (c *Crawler) Echo(err error)

Echo emits a warning if an error happens.

func (*Crawler) EnsureVisit

func (c *Crawler) EnsureVisit(url string, opts ...VisitOptionFunc)

func (*Crawler) GoBack

func (c *Crawler) GoBack(err error) error

func (*Crawler) LogTimeSpent

func (c *Crawler) LogTimeSpent(start time.Time)

func (*Crawler) MustSetStorage

func (c *Crawler) MustSetStorage(s storage.Storage)

func (*Crawler) OnHTML

func (c *Crawler) OnHTML(sel string, cbFn HTMLCallback)

OnHTML registers a function. Function will be executed on every HTML element matched by the GoQuery Selector parameter. GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery

func (*Crawler) OnHTMLDetach

func (c *Crawler) OnHTMLDetach(selector string)

OnHTMLDetach deregisters a function. The function will not be executed after it is detached.

func (*Crawler) OnPaging

func (c *Crawler) OnPaging(selector string, f SerpCallback)

func (*Crawler) OnResponse

func (c *Crawler) OnResponse(f ResponseCallback)

OnResponse registers a function. Function will be executed on every response

func (*Crawler) Pie

func (c *Crawler) Pie(err error)

Pie prints the error if it is not one of the ignored errors.

func (*Crawler) SetStorage

func (c *Crawler) SetStorage(s storage.Storage) error

func (*Crawler) String

func (c *Crawler) String() string

String is the text representation of the crawler. It contains useful debug information about the crawler's internals.

func (*Crawler) Visit

func (c *Crawler) Visit(url string, opts ...VisitOptionFunc) error

type CrawlerOption

type CrawlerOption func(*Crawler)

func AllowedDomains

func AllowedDomains(domains ...string) CrawlerOption

func DebugDetailStep

func DebugDetailStep(b bool) CrawlerOption

func DebugEachRequest

func DebugEachRequest(b bool) CrawlerOption

func DebugVerifier

func DebugVerifier(b bool) CrawlerOption

func Headless

func Headless(b bool) CrawlerOption

func IgnoredErrors

func IgnoredErrors(errs ...error) CrawlerOption

func MaxDepth

func MaxDepth(i int) CrawlerOption

func StableDiff

func StableDiff(f float64) CrawlerOption

func TrackTime

func TrackTime(b bool) CrawlerOption

func UIDGen

func UIDGen(fn UIDGenerator) CrawlerOption

func WithLogger

func WithLogger(l *zap.Logger) CrawlerOption

type ErrHandler

type ErrHandler func(error) error

type HTMLCallback

type HTMLCallback func(e *HTMLElement)

type HTMLElement

type HTMLElement struct {
	// Name is the name of the tag
	Name string
	Text string

	// Request is the request object of the element's HTML document
	Request *Request
	// Response is the Response object of the element's HTML document
	Response *Response
	// DOM is the goquery parsed DOM object of the page. DOM is relative
	// to the current HTMLElement
	DOM *goquery.Selection
	// GlbIndex stores the position of the current element within all the elements matched by an OnHTML callback
	GlbIndex int

	ElemSelector string
	ElemIndex    int
	// contains filtered or unexported fields
}

HTMLElement is the representation of an HTML tag.

func NewHTMLElementFromSelectionNode

func NewHTMLElementFromSelectionNode(resp *Response, gqSel *goquery.Selection, node *html.Node, glbCbIndex int, elemSel string, elemIndex int) *HTMLElement

NewHTMLElementFromSelectionNode creates an HTMLElement from a goquery.Selection Node.

func (*HTMLElement) Attr

func (h *HTMLElement) Attr(k string) string

Attr returns the selected attribute of an HTMLElement, or an empty string if no attribute is found.

func (*HTMLElement) ChildAttr

func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string

ChildAttr returns the stripped text content of the first matching element's attribute.

func (*HTMLElement) ChildAttrs

func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string

ChildAttrs returns the stripped text content of all the matching element's attributes.

func (*HTMLElement) ChildText

func (h *HTMLElement) ChildText(goquerySelector string) string

ChildText returns the concatenated and stripped text content of the matching elements.

func (*HTMLElement) ChildTexts

func (h *HTMLElement) ChildTexts(goquerySelector string) []string

ChildTexts returns the stripped text content of all the matching elements.

type Request

type Request struct {
	URL *url.URL
	// ID is the Unique identifier of the request
	ID uint32

	Depth int
	// Ctx is a context between a Request and a Response
	Ctx *Context

	ByGetURL bool

	Selector string
	Index    int
	// contains filtered or unexported fields
}

func (*Request) AbsoluteURL

func (r *Request) AbsoluteURL(uri string) string

AbsoluteURL returns the resolved absolute URL of a URL chunk. AbsoluteURL returns an empty string if the URL chunk is a fragment or could not be parsed.

func (*Request) RID

func (r *Request) RID() string

func (*Request) String

func (r *Request) String() string

func (*Request) Visit

func (r *Request) Visit(url string, opts ...VisitOptionFunc) error

Visit continues the Crawler's collecting job by creating a request, and preserves the Context of the previous request. Visit also calls the previously provided callbacks.

func (*Request) VisitElem

func (r *Request) VisitElem(opts ...VisitOptionFunc) error

type Response

type Response struct {
	Request *Request
	Body    []byte
	Doc     *goquery.Document
	// Ctx is a context between a Request and a Response
	Ctx *Context
}

func (*Response) Mkd

func (r *Response) Mkd(sel string) (*string, error)

func (*Response) Save

func (r *Response) Save(fileName string) error

Save writes response body to disk

type ResponseCallback

type ResponseCallback func(*Response)

ResponseCallback is the function type for OnResponse callback functions.

type SerpCallback

type SerpCallback func(e *SerpElement)

type SerpElement

type SerpElement struct {
	Request *Request

	Bot      *wee.Bot
	Selector string
	Index    int
	Element  *rod.Element
}

func NewSerpElement

func NewSerpElement(req *Request, bot *wee.Bot, sel string, index int) (*SerpElement, error)

func (*SerpElement) Attr

func (e *SerpElement) Attr(k string) string

func (*SerpElement) Link

func (e *SerpElement) Link(attrs ...string) string

Link is an alias of Attr for the first match of "src/href".

func (*SerpElement) Target

func (e *SerpElement) Target() string

func (*SerpElement) Text

func (e *SerpElement) Text() string

type UIDGenerator

type UIDGenerator func(*url.URL) string

type VisitOptionFunc

type VisitOptionFunc func(o *VisitOptions)

func Elem

func Elem(elem *rod.Element) VisitOptionFunc

func OnVisitEnd

func OnVisitEnd(fn ErrHandler) VisitOptionFunc

func OpenInTab

func OpenInTab(b bool) VisitOptionFunc

OpenInTab marks the request to open the link in a new tab.

func WithURL

func WithURL(u string) VisitOptionFunc

type VisitOptions

type VisitOptions struct {
	// contains filtered or unexported fields
}

type WeenyParser

type WeenyParser struct {
	*xparse.HTMLParser
}

func NewParser

func NewParser(raw, cfg []byte) *WeenyParser

func (*WeenyParser) ParseToStruct

func (wp *WeenyParser) ParseToStruct(key string, obj any) error

Directories

Path Synopsis
examples
15five command
basic command
colly/basic command
colly/max_depth command
colly/xszj command
huarun command
pingan command
xszj command

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL