Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ExtractionResult ¶
// ExtractionResult bundles the output of running extractors against one
// session. Every field carries a JSON tag, so the value marshals with
// snake_case keys.
type ExtractionResult struct {
	// SessionID is serialized as "session_id".
	// NOTE(review): presumably the crawl session the extractors ran against — confirm with the producer.
	SessionID string `json:"session_id"`
	// Extractors is serialized as "extractors".
	// NOTE(review): presumably the rules that were applied — confirm.
	Extractors []Extractor `json:"extractors"`
	// TotalPages is serialized as "total_pages".
	// NOTE(review): whether this always equals len(Pages) is not visible here.
	TotalPages int `json:"total_pages"`
	// Pages is serialized as "pages"; each element is a PageExtraction.
	Pages []PageExtraction `json:"pages"`
}
ExtractionResult is the full output of running extractors against a session.
type ExtractionRow ¶
// ExtractionRow is the flat record for one extracted value. It carries no
// struct tags; all fields are plain strings plus one timestamp.
type ExtractionRow struct {
	// CrawlSessionID is the session identifier for the row.
	// NOTE(review): presumably the sessionID handed to RunExtractors — confirm.
	CrawlSessionID string
	// URL is the page address for the row.
	// NOTE(review): presumably the url handed to RunExtractors — confirm.
	URL string
	// ExtractorName names the extractor associated with Value.
	// NOTE(review): presumably copied from Extractor.Name — confirm.
	ExtractorName string
	// Value is the extracted string.
	Value string
	// CrawledAt is the crawl timestamp.
	// NOTE(review): time zone expectations are not visible here.
	CrawledAt time.Time
}
ExtractionRow is a single extracted value ready for ClickHouse insertion.
func RunExtractors ¶
func RunExtractors(body []byte, url, sessionID string, extractors []Extractor, crawledAt time.Time) []ExtractionRow
RunExtractors runs all extractors against a page body and returns rows for insertion.
type Extractor ¶
// Extractor describes one extraction rule. Every field carries a JSON tag,
// so the value marshals with snake_case keys.
type Extractor struct {
	// ID is serialized as "id".
	ID string `json:"id"`
	// SetID is serialized as "set_id".
	// NOTE(review): presumably the ID of the owning ExtractorSet — confirm.
	SetID string `json:"set_id"`
	// Type is serialized as "type" and holds an ExtractorType value.
	Type ExtractorType `json:"type"`
	// Name is serialized as "name".
	Name string `json:"name"`
	// Selector is serialized as "selector".
	// NOTE(review): presumably a CSS selector, regex or XPath expression depending on Type — confirm.
	Selector string `json:"selector"`
	// Attribute is serialized as "attribute".
	// NOTE(review): presumably only meaningful for the attribute-extracting types — confirm.
	Attribute string `json:"attribute"`
	// URLPattern is serialized as "url_pattern".
	// NOTE(review): matching semantics (glob vs. regex, empty = all) are not visible here.
	URLPattern string `json:"url_pattern"`
	// SortOrder is serialized as "sort_order".
	SortOrder int `json:"sort_order"`
}
Extractor is a single extraction rule within an extractor set.
type ExtractorSet ¶
// ExtractorSet is a named collection of Extractor rules with creation and
// update timestamps. Every field carries a JSON tag.
type ExtractorSet struct {
	// ID is serialized as "id".
	ID string `json:"id"`
	// Name is serialized as "name".
	Name string `json:"name"`
	// CreatedAt is serialized as "created_at".
	CreatedAt time.Time `json:"created_at"`
	// UpdatedAt is serialized as "updated_at".
	UpdatedAt time.Time `json:"updated_at"`
	// Extractors is serialized as "extractors"; a nil slice marshals as null.
	Extractors []Extractor `json:"extractors"`
	// ExtractorCount is serialized as "extractor_count" and, because of
	// omitempty, is left out of the JSON when it is zero.
	// NOTE(review): whether it is kept in sync with len(Extractors) is not visible here.
	ExtractorCount int `json:"extractor_count,omitempty"`
}
ExtractorSet groups extractors under a named set.
type ExtractorType ¶
// ExtractorType defines the kind of extraction.
//
// It is a string type, so values marshal to and from JSON as the plain
// string identifiers listed in the constant group below.
type ExtractorType string

// Known ExtractorType values. Each constant's underlying string is its
// snake_case identifier.
//
// NOTE(review): the first-match vs. all-matches and text vs. attribute
// semantics are inferred from the names only — confirm against RunExtractors.
const (
	// CSS-based kinds.
	CSSExtractText    ExtractorType = "css_extract_text"
	CSSExtractAttr    ExtractorType = "css_extract_attr"
	CSSExtractAllText ExtractorType = "css_extract_all_text"
	CSSExtractAllAttr ExtractorType = "css_extract_all_attr"

	// Regex-based kinds.
	RegexExtract    ExtractorType = "regex_extract"
	RegexExtractAll ExtractorType = "regex_extract_all"

	// XPath-based kinds.
	XPathExtract    ExtractorType = "xpath_extract"
	XPathExtractAll ExtractorType = "xpath_extract_all"
)
type PageExtraction ¶
PageExtraction holds extraction results for a single page.
Click to show internal directories.
Click to hide internal directories.