rules

package
v0.0.30 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 31, 2023 License: Apache-2.0 Imports: 14 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
var ErrInvalidImageFormat = errors.New("invalid image format")
View Source
var ErrValueNotFound = errors.New("no value found")

Functions

This section is empty.

Types

type AuthorRule

type AuthorRule struct {
	BaseRule
}

AuthorRule is the rule for extracting the author information from a page.

func NewAuthorRule

func NewAuthorRule(strategies ...ExtractionStrategy) *AuthorRule

type BaseResult

type BaseResult struct {
	// contains filtered or unexported fields
}

func (*BaseResult) ApplyMetadata

func (r *BaseResult) ApplyMetadata(_ string, _ *url.URL, _ *metadata.Metadata)

func (*BaseResult) Found

func (r *BaseResult) Found() bool

func (*BaseResult) SelectorInfo

func (r *BaseResult) SelectorInfo() SelectorInfo

type BaseRule

type BaseRule struct {
	Strategies []ExtractionStrategy
}

BaseRule is the base rule for all rules.

func (*BaseRule) Extract

func (br *BaseRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

Extract extracts the value from the node. It iterates through all the strategies and returns the first value found.

type CanonicalRule

type CanonicalRule struct {
	BaseRule
}

CanonicalRule is the rule for extracting the canonical URL of a page.

func NewCanonicalRule

func NewCanonicalRule() *CanonicalRule

func (*CanonicalRule) Extract

func (cr *CanonicalRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

type DateRule

type DateRule struct {
	BaseRule
}

DateRule is the rule for extracting the date information from a page.

func NewDateRule

func NewDateRule(strategies ...ExtractionStrategy) *DateRule

type DescriptionRule

type DescriptionRule struct {
	BaseRule
}

DescriptionRule is the rule for extracting the description information from a page.

func NewDescriptionRule

func NewDescriptionRule() *DescriptionRule

type ExtractFunc

type ExtractFunc func(node *html.Node, targetURL *url.URL, selectors []string) ExtractResult

ExtractFunc is the function signature for all extractors that can be used in a strategy. It accepts the node to extract from, the target URL, and the selectors to use. It returns an ExtractResult, which carries the extracted value, selector information describing where it was found, and whether a value was found.

func ExtractAttr

func ExtractAttr(attribute string) ExtractFunc

ExtractAttr extracts a selector from the given document using the given attribute.

type ExtractResult

type ExtractResult interface {
	ApplyMetadata(key string, u *url.URL, m *metadata.Metadata)
	Found() bool
	SelectorInfo() SelectorInfo
	Value() any
}

ExtractResult is the result of an extraction.

func ExtractCSS

func ExtractCSS(node *html.Node, _ *url.URL, selectors []string) ExtractResult

ExtractCSS extracts the given CSS selector from the given document.

func ExtractJSONLD

func ExtractJSONLD(node *html.Node, _ *url.URL, selectors []string) ExtractResult

ExtractJSONLD extracts the given JSON-LD attribute from the given document.

func ExtractMeta

func ExtractMeta(node *html.Node, targetURL *url.URL, selectors []string) ExtractResult

ExtractMeta extracts the given meta tag from the given document.

type ExtractionStrategy

type ExtractionStrategy struct {
	Selectors []string
	Extractor ExtractFunc
}

ExtractionStrategy is the strategy for extracting a value.

type FaviconRule

type FaviconRule struct {
	BaseRule
}

FaviconRule is the rule for extracting the favicon URL of a page.

func NewFaviconRule

func NewFaviconRule() *FaviconRule

func (*FaviconRule) Extract

func (r *FaviconRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

type FeedRule

type FeedRule struct {
	BaseRule
}

FeedRule is the rule for extracting the feed URL of a page. It will respond with an array of feed URLs it found.

func NewFeedRule

func NewFeedRule() *FeedRule

func (*FeedRule) Extract

func (fr *FeedRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

Extract extracts the value from the node.

type LangRule

type LangRule struct {
	BaseRule
}

LangRule is the rule for extracting the language information from a page.

func NewLangRule

func NewLangRule() *LangRule

type LeadImageRule

type LeadImageRule struct {
	BaseRule
}

LeadImageRule is the rule for extracting the lead image from a page.

func NewLeadImageRule

func NewLeadImageRule() *LeadImageRule

type MultiStringResult

type MultiStringResult struct {
	*BaseResult
	// contains filtered or unexported fields
}

func NewMultiStringResult

func NewMultiStringResult(value []string, selectorInfo SelectorInfo, found bool) *MultiStringResult

func (*MultiStringResult) ApplyMetadata

func (r *MultiStringResult) ApplyMetadata(key string, _ *url.URL, m *metadata.Metadata)

func (*MultiStringResult) Found

func (r *MultiStringResult) Found() bool

func (*MultiStringResult) Value

func (r *MultiStringResult) Value() any

type NoResult

type NoResult struct {
	*BaseResult
}

func NewNoResult

func NewNoResult() *NoResult

func (*NoResult) Value

func (r *NoResult) Value() any

type PublisherRule

type PublisherRule struct {
	BaseRule
}

PublisherRule is the rule for extracting the publisher information from a page.

func NewPublisherRule

func NewPublisherRule() *PublisherRule

type ReadableResult

type ReadableResult struct {
	*BaseResult
	// contains filtered or unexported fields
}

func NewReadableResult

func NewReadableResult(value ReadableValue, selectorInfo SelectorInfo, found bool) *ReadableResult

func (*ReadableResult) ApplyMetadata

func (r *ReadableResult) ApplyMetadata(_ string, _ *url.URL, m *metadata.Metadata)

func (*ReadableResult) Value

func (r *ReadableResult) Value() any

type ReadableRule

type ReadableRule struct {
	BaseRule
}

ReadableRule is the rule for extracting the readable content.

func NewReadableRule

func NewReadableRule() *ReadableRule

NewReadableRule creates a new ReadableRule.

type ReadableValue

type ReadableValue struct {
	Excerpt    string
	HTML       string
	Text       string
	Image      string
	Lang       string
	Length     int
	Title      string
	Byline     string
	SiteName   string
	IsReadable bool
}

type Rule

type Rule interface {
	// Extract extracts the value from the node
	Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
}

A Rule is a rule for extracting a value from a node. It encapsulates multiple strategies for extracting a value. Each strategy is tried in order of priority until a value is found, or all strategies have been tried.

type SelectorInfo

type SelectorInfo struct {
	Attr     string
	InMeta   bool
	Selector string
}

type SiteNameRule

type SiteNameRule struct {
	BaseRule
}

SiteNameRule is the rule for extracting the site name information from a page.

func NewSiteNameRule

func NewSiteNameRule() *SiteNameRule

func (*SiteNameRule) Extract

func (r *SiteNameRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

type StringResult

type StringResult struct {
	*BaseResult
	// contains filtered or unexported fields
}

func NewStringResult

func NewStringResult(value string, selectorInfo SelectorInfo, found bool) *StringResult

func (*StringResult) ApplyMetadata

func (r *StringResult) ApplyMetadata(key string, u *url.URL, m *metadata.Metadata)

func (*StringResult) Value

func (r *StringResult) Value() any

type TitleRule

type TitleRule struct {
	BaseRule
}

TitleRule is the rule for extracting the title information from a page.

func NewTitleRule

func NewTitleRule() *TitleRule

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL