rules

package

v0.0.30 (latest) · Published: Dec 31, 2023 · License: Apache-2.0 · Imports: 14 · Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/octetic/gophetch

Links

Open Source Insights

Documentation ¶

Index ¶

Variables
type AuthorRule
- func NewAuthorRule(strategies ...ExtractionStrategy) *AuthorRule
type BaseResult
- func (r *BaseResult) ApplyMetadata(_ string, _ *url.URL, _ *metadata.Metadata)
- func (r *BaseResult) Found() bool
- func (r *BaseResult) SelectorInfo() SelectorInfo
type BaseRule
- func (br *BaseRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type CanonicalRule
- func NewCanonicalRule() *CanonicalRule
- func (cr *CanonicalRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type DateRule
- func NewDateRule(strategies ...ExtractionStrategy) *DateRule
type DescriptionRule
- func NewDescriptionRule() *DescriptionRule
type ExtractFunc
- func ExtractAttr(attribute string) ExtractFunc
type ExtractResult
- func ExtractCSS(node *html.Node, _ *url.URL, selectors []string) ExtractResult
- func ExtractJSONLD(node *html.Node, _ *url.URL, selectors []string) ExtractResult
- func ExtractMeta(node *html.Node, targetURL *url.URL, selectors []string) ExtractResult
type ExtractionStrategy
type FaviconRule
- func NewFaviconRule() *FaviconRule
- func (r *FaviconRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type FeedRule
- func NewFeedRule() *FeedRule
- func (fr *FeedRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type LangRule
- func NewLangRule() *LangRule
type LeadImageRule
- func NewLeadImageRule() *LeadImageRule
type MultiStringResult
- func NewMultiStringResult(value []string, selectorInfo SelectorInfo, found bool) *MultiStringResult
- func (r *MultiStringResult) ApplyMetadata(key string, _ *url.URL, m *metadata.Metadata)
- func (r *MultiStringResult) Found() bool
- func (r *MultiStringResult) Value() any
type NoResult
- func NewNoResult() *NoResult
- func (r *NoResult) Value() any
type PublisherRule
- func NewPublisherRule() *PublisherRule
type ReadableResult
- func NewReadableResult(value ReadableValue, selectorInfo SelectorInfo, found bool) *ReadableResult
- func (r *ReadableResult) ApplyMetadata(_ string, _ *url.URL, m *metadata.Metadata)
- func (r *ReadableResult) Value() any
type ReadableRule
- func NewReadableRule() *ReadableRule
type ReadableValue
type Rule
type SelectorInfo
type SiteNameRule
- func NewSiteNameRule() *SiteNameRule
- func (r *SiteNameRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
type StringResult
- func NewStringResult(value string, selectorInfo SelectorInfo, found bool) *StringResult
- func (r *StringResult) ApplyMetadata(key string, u *url.URL, m *metadata.Metadata)
- func (r *StringResult) Value() any
type TitleRule
- func NewTitleRule() *TitleRule

Constants ¶

This section is empty.

Variables ¶

View Source

var ErrInvalidImageFormat = errors.New("invalid image format")

View Source

var ErrValueNotFound = errors.New("no value found")

Functions ¶

This section is empty.

Types ¶

type AuthorRule ¶

type AuthorRule struct {
	BaseRule
}

AuthorRule is the rule for extracting the author information from a page.

func NewAuthorRule ¶

func NewAuthorRule(strategies ...ExtractionStrategy) *AuthorRule

type BaseResult ¶

type BaseResult struct {
	// contains filtered or unexported fields
}

func (*BaseResult) ApplyMetadata ¶

func (r *BaseResult) ApplyMetadata(_ string, _ *url.URL, _ *metadata.Metadata)

func (*BaseResult) Found ¶

func (r *BaseResult) Found() bool

func (*BaseResult) SelectorInfo ¶

func (r *BaseResult) SelectorInfo() SelectorInfo

func (*BaseRule) Extract ¶

func (br *BaseRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

Extract extracts the value from the node. It iterates through all the strategies and returns the first value found.

type CanonicalRule ¶

type CanonicalRule struct {
	BaseRule
}

CanonicalRule is the rule for extracting the canonical URL of a page.

func NewCanonicalRule ¶

func NewCanonicalRule() *CanonicalRule

func (*CanonicalRule) Extract ¶

func (cr *CanonicalRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

type DateRule ¶

type DateRule struct {
	BaseRule
}

DateRule is the rule for extracting the date information from a page.

func NewDateRule ¶

func NewDateRule(strategies ...ExtractionStrategy) *DateRule

type DescriptionRule ¶

type DescriptionRule struct {
	BaseRule
}

DescriptionRule is the rule for extracting the description information from a page.

func NewDescriptionRule ¶

func NewDescriptionRule() *DescriptionRule

type ExtractFunc ¶

type ExtractFunc func(node *html.Node, targetURL *url.URL, selectors []string) ExtractResult

ExtractFunc is the function signature for all extractors that can be used in a strategy. It accepts the node to extract from, the target URL, and the selectors to use. It returns an ExtractResult, which exposes the extracted value, the SelectorInfo describing where it was found, and whether a value was found.

func ExtractAttr ¶

func ExtractAttr(attribute string) ExtractFunc

ExtractAttr extracts a selector from the given document using the given attribute.

type ExtractResult ¶

type ExtractResult interface {
	ApplyMetadata(key string, u *url.URL, m *metadata.Metadata)
	Found() bool
	SelectorInfo() SelectorInfo
	Value() any
}

ExtractResult is the result of an extraction.

func ExtractCSS ¶

func ExtractCSS(node *html.Node, _ *url.URL, selectors []string) ExtractResult

ExtractCSS extracts the given CSS selector from the given document.

func ExtractJSONLD ¶

func ExtractJSONLD(node *html.Node, _ *url.URL, selectors []string) ExtractResult

ExtractJSONLD extracts the given JSON-LD attribute from the given document.

func ExtractMeta ¶

func ExtractMeta(node *html.Node, targetURL *url.URL, selectors []string) ExtractResult

ExtractMeta extracts the given meta tag from the given document.

type ExtractionStrategy ¶

type ExtractionStrategy struct {
	Selectors []string
	Extractor ExtractFunc
}

ExtractionStrategy is the strategy for extracting a value: an extractor function paired with the selectors it is given.

type FaviconRule ¶

type FaviconRule struct {
	BaseRule
}

FaviconRule is the rule for extracting the favicon URL of a page.

func NewFaviconRule ¶

func NewFaviconRule() *FaviconRule

func (*FaviconRule) Extract ¶

func (r *FaviconRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

type FeedRule ¶

type FeedRule struct {
	BaseRule
}

FeedRule is the rule for extracting the feed URL of a page. It will respond with an array of feed URLs it found.

func NewFeedRule ¶

func NewFeedRule() *FeedRule

func (*FeedRule) Extract ¶

func (fr *FeedRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

Extract extracts the value from the node.

type LangRule ¶

type LangRule struct {
	BaseRule
}

LangRule is the rule for extracting the language information from a page.

func NewLangRule ¶

func NewLangRule() *LangRule

type LeadImageRule ¶

type LeadImageRule struct {
	BaseRule
}

LeadImageRule is the rule for extracting the lead image from a page.

func NewLeadImageRule ¶

func NewLeadImageRule() *LeadImageRule

type MultiStringResult ¶

type MultiStringResult struct {
	*BaseResult
	// contains filtered or unexported fields
}

func NewMultiStringResult ¶

func NewMultiStringResult(value []string, selectorInfo SelectorInfo, found bool) *MultiStringResult

func (*MultiStringResult) ApplyMetadata ¶

func (r *MultiStringResult) ApplyMetadata(key string, _ *url.URL, m *metadata.Metadata)

func (*MultiStringResult) Found ¶

func (r *MultiStringResult) Found() bool

func (*MultiStringResult) Value ¶

func (r *MultiStringResult) Value() any

type NoResult ¶

type NoResult struct {
	*BaseResult
}

func NewNoResult ¶

func NewNoResult() *NoResult

func (*NoResult) Value ¶

func (r *NoResult) Value() any

type PublisherRule ¶

type PublisherRule struct {
	BaseRule
}

PublisherRule is the rule for extracting the publisher information from a page.

func NewPublisherRule ¶

func NewPublisherRule() *PublisherRule

type ReadableResult ¶

type ReadableResult struct {
	*BaseResult
	// contains filtered or unexported fields
}

func NewReadableResult ¶

func NewReadableResult(value ReadableValue, selectorInfo SelectorInfo, found bool) *ReadableResult

func (*ReadableResult) ApplyMetadata ¶

func (r *ReadableResult) ApplyMetadata(_ string, _ *url.URL, m *metadata.Metadata)

func (*ReadableResult) Value ¶

func (r *ReadableResult) Value() any

type ReadableRule ¶

type ReadableRule struct {
	BaseRule
}

ReadableRule is the rule for extracting the readable content of a page.

func NewReadableRule ¶

func NewReadableRule() *ReadableRule

NewReadableRule creates a new ReadableRule.

type ReadableValue ¶

type ReadableValue struct {
	Excerpt    string
	HTML       string
	Text       string
	Image      string
	Lang       string
	Length     int
	Title      string
	Byline     string
	SiteName   string
	IsReadable bool
}

type Rule ¶

type Rule interface {
	// Extract extracts the value from the node
	Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)
}

A Rule is a rule for extracting a value from a node. It encapsulates multiple strategies for extracting a value. Each strategy is tried in order of priority until a value is found, or all strategies have been tried.

type SelectorInfo ¶

type SelectorInfo struct {
	Attr     string
	InMeta   bool
	Selector string
}

type SiteNameRule ¶

type SiteNameRule struct {
	BaseRule
}

SiteNameRule is the rule for extracting the site name information from a page.

func NewSiteNameRule ¶

func NewSiteNameRule() *SiteNameRule

func (*SiteNameRule) Extract ¶

func (r *SiteNameRule) Extract(node *html.Node, targetURL *url.URL) (ExtractResult, error)

type StringResult ¶

type StringResult struct {
	*BaseResult
	// contains filtered or unexported fields
}

func NewStringResult ¶

func NewStringResult(value string, selectorInfo SelectorInfo, found bool) *StringResult

func (*StringResult) ApplyMetadata ¶

func (r *StringResult) ApplyMetadata(key string, u *url.URL, m *metadata.Metadata)

func (*StringResult) Value ¶

func (r *StringResult) Value() any

type TitleRule ¶

type TitleRule struct {
	BaseRule
}

TitleRule is the rule for extracting the title information from a page.

func NewTitleRule ¶

func NewTitleRule() *TitleRule

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL