scraper

package
v0.0.81 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 19, 2025 License: Apache-2.0 Imports: 29 Imported by: 0

Documentation

Index

Constants

View Source
// MaxCrawlDepth is the crawl depth cap, fixed at 5.
// NOTE(review): presumably the deepest link level followed during a
// crawl; confirm against the crawler implementation.
const MaxCrawlDepth = 5

// MaxPagesToCrawl is the page count cap, fixed at 100.
// NOTE(review): presumably the most pages visited during a single crawl;
// confirm against the crawler implementation.
const MaxPagesToCrawl = 100
View Source
// QUEUE_GROUP is the queue group name.
const QUEUE_GROUP = "scraper-service"

// Consumer configuration follows.
// NOTE(review): how each value is applied is not visible from these
// declarations; confirm against the consumer setup code.

// CONSUMER_NAME is the consumer name.
const CONSUMER_NAME = "scraper-consumer"

// ACK_WAIT is a 30 second duration.
// NOTE(review): presumably how long to wait for an acknowledgement; confirm.
const ACK_WAIT = 30 * time.Second

// MAX_DELIVERY_ATTEMPTS is 5.
// NOTE(review): presumably the delivery attempt limit per message; confirm.
const MAX_DELIVERY_ATTEMPTS = 5

// MAX_ACK_PENDING is 100.
// NOTE(review): presumably the cap on unacknowledged messages; confirm.
const MAX_ACK_PENDING = 100

// FETCH_BATCH_SIZE is 50.
// NOTE(review): presumably the number of messages requested per fetch; confirm.
const FETCH_BATCH_SIZE = 50

// MAX_FETCH_WAIT is a 500 millisecond duration.
// NOTE(review): presumably the longest a single fetch may block; confirm.
const MAX_FETCH_WAIT = 500 * time.Millisecond

// ERR_BACKOFF is a 100 millisecond duration.
// NOTE(review): presumably the pause taken after an error; confirm.
const ERR_BACKOFF = 100 * time.Millisecond

Variables

View Source
// ErrPaymentRequired is the sentinel error whose message says the Jina
// balance requires a top-up.
var ErrPaymentRequired = errors.New("jina balance requires topup")

// ErrUnprocessable is the sentinel error whose message says Jina cannot
// process the webpage.
var ErrUnprocessable = errors.New("jina cannot process webpage")

// ErrUrlNotReachable is the sentinel error whose message says the URL is
// not reachable.
var ErrUrlNotReachable = errors.New("url is not reachable")
View Source
// SUBSCRIBED_SUBJECT holds the result of enums.EventWebtrackerCreated.String().
// NOTE(review): presumably the subject this service subscribes to; confirm
// against the subscription code.
var SUBSCRIBED_SUBJECT = enums.EventWebtrackerCreated.String()

Functions

This section is empty.

Types

type DocumentSection

// DocumentSection groups a heading with a list of content strings and
// three integer counters.
// NOTE(review): the field descriptions below are inferred from the field
// names; confirm against the code that populates this struct.
type DocumentSection struct {
	// Heading is the section's heading text.
	Heading string

	// Content holds the section's content strings.
	Content []string

	// LinkCount, TextCount and ListCount are per-section tallies,
	// presumably of links, text items and lists respectively.
	LinkCount, TextCount, ListCount int
}

type ScraperService added in v0.0.42

// ScraperService is the scraper's service interface. It embeds
// interfaces.NatsService and adds the Crawl method.
type ScraperService interface {
	// NatsService is embedded, so an implementation must also provide
	// every method that interface declares.
	interfaces.NatsService
	// Crawl takes a context and a domain string and returns an error.
	// NOTE(review): presumably crawls the given domain; confirm the exact
	// behavior and error conditions in the implementation.
	Crawl(ctx context.Context, domain string) error
}

func NewScraperService

// NewScraperService takes a Jina configuration, the NATS connections, the
// database connections and the repositories, and returns a ScraperService.
// NOTE(review): only the signature is visible here; how each dependency is
// used, and whether nil arguments are tolerated, must be confirmed in the
// implementation.
func NewScraperService(
	config *config.JinaConfig,
	natsConn *nats_internal.NATSConnections,
	leadsDB *database.DbConnections,
	repositories *repository.Repositories,
) ScraperService

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL