collector

package
v0.0.0-...-812ebae Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 7, 2025 | License: MIT | Imports: 15 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type APICollector

// APICollector is the collector for API data sources. The methods listed for
// it on this page (Collect, GetSourceType, Validate) have the same names and
// signatures as the DataCollector interface's methods.
type APICollector struct {
	// contains filtered or unexported fields
}

APICollector API数据采集器

func NewAPICollector

func NewAPICollector() *APICollector

NewAPICollector 创建API采集器

func (*APICollector) Collect

func (a *APICollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)

Collect 采集API数据

func (*APICollector) GetSourceType

func (a *APICollector) GetSourceType() string

GetSourceType 返回采集器类型

func (*APICollector) Validate

func (a *APICollector) Validate(config CollectConfig) error

Validate 验证配置

type Article

// Article represents one collected article. Each field is encoded to and
// decoded from JSON under the snake_case key given in its struct tag; none of
// the tags use omitempty, so zero-valued fields are still emitted.
type Article struct {
	ID          string            `json:"id"`
	Title       string            `json:"title"`
	Content     string            `json:"content"`
	Summary     string            `json:"summary"`
	Author      string            `json:"author"`
	URL         string            `json:"url"`
	PublishedAt time.Time         `json:"published_at"`
	Tags        []string          `json:"tags"`
	Source      string            `json:"source"`
	// NOTE(review): presumably one of the values returned by a collector's
	// GetSourceType method — confirm against the collectors.
	SourceType  string            `json:"source_type"`
	Language    string            `json:"language"`
	Metadata    map[string]string `json:"metadata"`
}

Article 表示采集到的文章数据

func AggregateResults

func AggregateResults(results []CollectResult) ([]Article, []error)

AggregateResults 聚合多个采集结果

func DeduplicateArticles

func DeduplicateArticles(articles []Article) []Article

DeduplicateArticles 去重文章

func SortArticles

func SortArticles(articles []Article, sortBy SortBy, ascending bool) []Article

type ArticleFilter

// ArticleFilter holds the criteria applied by its FilterArticles method.
// NOTE(review): how zero values (empty slices, zero times, 0 lengths) are
// treated is not visible here — confirm against the FilterArticles
// implementation.
type ArticleFilter struct {
	Keywords  []string  // keyword filter
	Authors   []string  // author filter
	Tags      []string  // tag filter
	Languages []string  // language filter
	DateFrom  time.Time // start date
	DateTo    time.Time // end date
	MinLength int       // minimum content length
	MaxLength int       // maximum content length
}

ArticleFilter 定义过滤文章所用的条件

func (*ArticleFilter) FilterArticles

func (af *ArticleFilter) FilterArticles(articles []Article) []Article

FilterArticles 过滤文章

type AtomAuthor

// AtomAuthor maps the <name> and <email> child elements of an Atom <author>
// element; it is used as the Author field of AtomEntry.
type AtomAuthor struct {
	Name  string `xml:"name"`
	Email string `xml:"email"`
}

type AtomCategory

// AtomCategory maps an Atom <category> element; only its "term" attribute is
// captured.
type AtomCategory struct {
	Term string `xml:"term,attr"`
}

type AtomContent

// AtomContent maps an Atom <content> element: the "type" attribute goes to
// Type and the element's character data goes to Value.
type AtomContent struct {
	Type  string `xml:"type,attr"`
	Value string `xml:",chardata"`
}

type AtomEntry

// AtomEntry maps one <entry> element of an Atom feed. Published and Updated
// are kept as the raw strings from the XML; no time parsing happens at this
// level. Link and Category are slices, so repeated <link> and <category>
// elements are all retained.
type AtomEntry struct {
	Title     string         `xml:"title"`
	Content   AtomContent    `xml:"content"`
	Summary   string         `xml:"summary"`
	Link      []AtomLink     `xml:"link"`
	ID        string         `xml:"id"`
	Published string         `xml:"published"`
	Updated   string         `xml:"updated"`
	Author    AtomAuthor     `xml:"author"`
	Category  []AtomCategory `xml:"category"`
}

type AtomFeed

// AtomFeed is the top-level structure of an Atom document. XMLName binds it to
// a root element named "feed"; Entries collects every <entry> child.
type AtomFeed struct {
	XMLName xml.Name    `xml:"feed"`
	Title   string      `xml:"title"`
	Link    []AtomLink  `xml:"link"`
	Entries []AtomEntry `xml:"entry"`
}

Atom feed 结构定义

// AtomLink maps an Atom <link> element, capturing its "href" and "rel"
// attributes. It is used by both AtomFeed.Link and AtomEntry.Link.
type AtomLink struct {
	Href string `xml:"href,attr"`
	Rel  string `xml:"rel,attr"`
}

type BatchCollector

// BatchCollector is the batch collector. It is created by NewBatchCollector
// from a CollectorManager, a maxConcurrent count and a timeout, and its
// CollectBatch method is documented as batch collection with concurrency
// control.
type BatchCollector struct {
	// contains filtered or unexported fields
}

BatchCollector 批量采集器

func NewBatchCollector

func NewBatchCollector(manager CollectorManager, maxConcurrent int, timeout time.Duration) *BatchCollector

NewBatchCollector 创建批量采集器

func (*BatchCollector) CollectBatch

func (bc *BatchCollector) CollectBatch(ctx context.Context, configs []CollectConfig) []CollectResult

CollectBatch 批量采集,支持并发控制

type Channel

// Channel maps the <channel> element of an RSS document (see RSSFeed.Channel).
// Items collects every <item> child element.
type Channel struct {
	Title       string `xml:"title"`
	Description string `xml:"description"`
	Link        string `xml:"link"`
	Items       []Item `xml:"item"`
}

type CollectConfig

// CollectConfig is the configuration for a collection run. It is the argument
// passed to DataCollector.Collect and DataCollector.Validate. Every field
// except URL is tagged omitempty and is therefore dropped from JSON output
// when it holds its zero value.
type CollectConfig struct {
	URL         string            `json:"url"`
	Headers     map[string]string `json:"headers,omitempty"`
	// Timeout is a time.Duration; encoding/json represents it as an integer
	// count of nanoseconds.
	Timeout     time.Duration     `json:"timeout,omitempty"`
	MaxArticles int               `json:"max_articles,omitempty"`
	Language    string            `json:"language,omitempty"`
	Tags        []string          `json:"tags,omitempty"`
	Metadata    map[string]string `json:"metadata,omitempty"`
}

CollectConfig 表示采集配置

type CollectResult

// CollectResult is the result of a collection run: the collected articles, the
// source they belong to, and an error value.
type CollectResult struct {
	Articles []Article `json:"articles"`
	Source   string    `json:"source"`
	// NOTE(review): Error is an interface value. encoding/json encodes it
	// via the concrete type's exported fields, so errors created with
	// errors.New or fmt.Errorf are expected to marshal as {} rather than as
	// their message — confirm whether any caller serializes this field.
	Error    error     `json:"error,omitempty"`
}

CollectResult 表示采集结果

type CollectorManager

// CollectorManager manages all collectors and provides concurrent collection.
type CollectorManager interface {
	// RegisterCollector registers a collector.
	// NOTE(review): presumably stored under sourceType so that GetCollector
	// can find it — confirm in the implementation.
	RegisterCollector(sourceType string, collector DataCollector)

	// CollectAll collects from multiple data sources concurrently.
	CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult

	// CollectWithRetry collects with a retry mechanism.
	CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error)

	// GetCollector looks up a collector by its type.
	GetCollector(sourceType string) (DataCollector, bool)
}

CollectorManager 管理所有采集器并提供并发采集能力

func NewCollectorManager

func NewCollectorManager() CollectorManager

NewCollectorManager 创建采集器管理器

type CollectorManagerImpl

// CollectorManagerImpl is the collector manager implementation. The methods
// listed for it on this page (CollectAll, CollectWithRetry, GetCollector,
// RegisterCollector) have the same names and signatures as the
// CollectorManager interface's methods.
type CollectorManagerImpl struct {
	// contains filtered or unexported fields
}

CollectorManagerImpl 采集器管理器实现

func (*CollectorManagerImpl) CollectAll

func (cm *CollectorManagerImpl) CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult

CollectAll 并发采集多个数据源

func (*CollectorManagerImpl) CollectWithRetry

func (cm *CollectorManagerImpl) CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error)

CollectWithRetry 带重试机制的采集

func (*CollectorManagerImpl) GetCollector

func (cm *CollectorManagerImpl) GetCollector(sourceType string) (DataCollector, bool)

GetCollector 根据类型获取采集器

func (*CollectorManagerImpl) RegisterCollector

func (cm *CollectorManagerImpl) RegisterCollector(sourceType string, collector DataCollector)

RegisterCollector 注册采集器

type DataCollector

// DataCollector defines the common interface for data collectors.
type DataCollector interface {
	// Collect collects data and returns the list of articles.
	Collect(ctx context.Context, config CollectConfig) (CollectResult, error)

	// GetSourceType returns the collector's type.
	GetSourceType() string

	// Validate checks whether the configuration is valid.
	Validate(config CollectConfig) error
}

DataCollector 定义数据采集器的统一接口

type DevToArticle

// DevToArticle is the Dev.to API response structure for one article.
// PublishedAt and CreatedAt are kept as the raw JSON strings. Organization is
// a pointer, so it stays nil when the "organization" key is absent or null.
type DevToArticle struct {
	ID                 int       `json:"id"`
	Title              string    `json:"title"`
	Description        string    `json:"description"`
	BodyMarkdown       string    `json:"body_markdown"`
	URL                string    `json:"url"`
	PublishedAt        string    `json:"published_at"`
	CreatedAt          string    `json:"created_at"`
	TagList            []string  `json:"tag_list"`
	User               DevToUser `json:"user"`
	Organization       *DevToOrg `json:"organization"`
	ReadingTimeMinutes int       `json:"reading_time_minutes"`
}

Dev.to API 响应结构

type DevToOrg

// DevToOrg is the "organization" object of a DevToArticle, decoded from the
// JSON keys "name" and "username".
type DevToOrg struct {
	Name     string `json:"name"`
	Username string `json:"username"`
}

type DevToUser

// DevToUser is the "user" object of a DevToArticle, decoded from the JSON keys
// "username" and "name".
type DevToUser struct {
	Username string `json:"username"`
	Name     string `json:"name"`
}

type GitHubIssue

// GitHubIssue is a GitHub API response structure for one issue. CreatedAt and
// UpdatedAt are kept as the raw JSON strings.
type GitHubIssue struct {
	ID        int    `json:"id"`
	Number    int    `json:"number"`
	Title     string `json:"title"`
	Body      string `json:"body"`
	State     string `json:"state"`
	HTMLURL   string `json:"html_url"`
	CreatedAt string `json:"created_at"`
	UpdatedAt string `json:"updated_at"`
	// User keeps only the "login" key of the "user" object.
	User      struct {
		Login string `json:"login"`
	} `json:"user"`
	// Labels keeps only the "name" key of each element of "labels".
	Labels []struct {
		Name string `json:"name"`
	} `json:"labels"`
}

type GitHubRepo

// GitHubRepo is a GitHub API response structure for one repository.
// CreatedAt, UpdatedAt and PushedAt are kept as the raw JSON strings.
type GitHubRepo struct {
	ID              int    `json:"id"`
	Name            string `json:"name"`
	FullName        string `json:"full_name"`
	Description     string `json:"description"`
	HTMLURL         string `json:"html_url"`
	Language        string `json:"language"`
	CreatedAt       string `json:"created_at"`
	UpdatedAt       string `json:"updated_at"`
	PushedAt        string `json:"pushed_at"`
	StargazersCount int    `json:"stargazers_count"`
	ForksCount      int    `json:"forks_count"`
	WatchersCount   int    `json:"watchers_count"`
	OpenIssuesCount int    `json:"open_issues_count"`
	Fork            bool   `json:"fork"`
	// Owner keeps only the "login" key of the "owner" object.
	Owner           struct {
		Login string `json:"login"`
	} `json:"owner"`
	Topics []string `json:"topics"`
}

GitHub API 响应结构

type HTMLCollector

// HTMLCollector is the collector for HTML web pages. The methods listed for it
// on this page (Collect, GetSourceType, Validate) have the same names and
// signatures as the DataCollector interface's methods.
type HTMLCollector struct {
	// contains filtered or unexported fields
}

HTMLCollector HTML网页采集器

func NewHTMLCollector

func NewHTMLCollector() *HTMLCollector

NewHTMLCollector 创建HTML采集器

func (*HTMLCollector) Collect

func (h *HTMLCollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)

Collect 采集HTML页面数据

func (*HTMLCollector) GetSourceType

func (h *HTMLCollector) GetSourceType() string

GetSourceType 返回采集器类型

func (*HTMLCollector) Validate

func (h *HTMLCollector) Validate(config CollectConfig) error

Validate 验证配置

type HTMLSelector

// HTMLSelector defines the HTML selector configuration: one selector string
// per article attribute.
// NOTE(review): the selector syntax (CSS, XPath, ...) is not visible here —
// confirm against the HTML collector implementation.
type HTMLSelector struct {
	Title       string `json:"title"`        // title selector
	Content     string `json:"content"`      // content selector
	Summary     string `json:"summary"`      // summary selector
	Author      string `json:"author"`       // author selector
	PublishedAt string `json:"published_at"` // publication-time selector
	Tags        string `json:"tags"`         // tags selector
	Links       string `json:"links"`        // links selector
}

HTMLSelector 定义HTML选择器配置

type Item

// Item maps one <item> element of an RSS channel (see Channel.Items). PubDate
// is kept as the raw string from the XML.
type Item struct {
	Title       string `xml:"title"`
	Description string `xml:"description"`
	Link        string `xml:"link"`
	GUID        string `xml:"guid"`
	PubDate     string `xml:"pubDate"`
	Author      string `xml:"author"`
	// NOTE(review): Category is a single string, so it cannot hold more than
	// one <category> value per item — confirm this is intended for feeds that
	// repeat the element.
	Category    string `xml:"category"`
}
// Link pairs a URL with a title. It carries no struct tags.
// NOTE(review): presumably the result of extracting links from an HTML page
// (compare HTMLSelector.Links) — confirm against its users.
type Link struct {
	URL   string
	Title string
}

type RSSCollector

// RSSCollector is the collector for RSS data. The methods listed for it on
// this page (Collect, GetSourceType, Validate) have the same names and
// signatures as the DataCollector interface's methods.
type RSSCollector struct {
	// contains filtered or unexported fields
}

RSSCollector RSS数据采集器

func NewRSSCollector

func NewRSSCollector() *RSSCollector

NewRSSCollector 创建RSS采集器

func (*RSSCollector) Collect

func (r *RSSCollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)

Collect 采集RSS数据

func (*RSSCollector) GetSourceType

func (r *RSSCollector) GetSourceType() string

GetSourceType 返回采集器类型

func (*RSSCollector) Validate

func (r *RSSCollector) Validate(config CollectConfig) error

Validate 验证配置

type RSSFeed

// RSSFeed is the top-level structure of an RSS document. XMLName binds it to a
// root element named "rss"; the feed content lives in the nested <channel>
// element.
type RSSFeed struct {
	XMLName xml.Name `xml:"rss"`
	Channel Channel  `xml:"channel"`
}

RSS feed 结构定义

type RetryConfig

// RetryConfig is the retry configuration passed to
// CollectorManager.CollectWithRetry.
// NOTE(review): whether MaxRetries counts total attempts or only the retries
// after the first attempt, and whether RetryDelay is fixed or grows between
// attempts, is not visible here — confirm in CollectWithRetry.
type RetryConfig struct {
	MaxRetries int           `json:"max_retries"`
	// RetryDelay is a time.Duration; encoding/json represents it as an
	// integer count of nanoseconds.
	RetryDelay time.Duration `json:"retry_delay"`
}

RetryConfig 重试配置

type SortBy

// SortBy is the string-typed sort key accepted by SortArticles.
type SortBy string

SortBy 表示文章的排序字段;SortArticles 按该字段排序文章

// Sort keys defined for SortBy.
const (
	SortByDate   SortBy = "date"   // sort key "date"
	SortByTitle  SortBy = "title"  // sort key "title"
	SortByAuthor SortBy = "author" // sort key "author"
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL