Documentation
¶
Index ¶
- type APICollector
- type Article
- type ArticleFilter
- type AtomAuthor
- type AtomCategory
- type AtomContent
- type AtomEntry
- type AtomFeed
- type AtomLink
- type BatchCollector
- type Channel
- type CollectConfig
- type CollectResult
- type CollectorManager
- type CollectorManagerImpl
- func (cm *CollectorManagerImpl) CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult
- func (cm *CollectorManagerImpl) CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error)
- func (cm *CollectorManagerImpl) GetCollector(sourceType string) (DataCollector, bool)
- func (cm *CollectorManagerImpl) RegisterCollector(sourceType string, collector DataCollector)
- type DataCollector
- type DevToArticle
- type DevToOrg
- type DevToUser
- type GitHubIssue
- type GitHubRepo
- type HTMLCollector
- type HTMLSelector
- type Item
- type Link
- type RSSCollector
- type RSSFeed
- type RetryConfig
- type SortBy
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type APICollector ¶
type APICollector struct {
// contains filtered or unexported fields
}
APICollector API数据采集器
func (*APICollector) Collect ¶
func (a *APICollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)
Collect 采集API数据
func (*APICollector) GetSourceType ¶
func (a *APICollector) GetSourceType() string
GetSourceType 返回采集器类型
func (*APICollector) Validate ¶
func (a *APICollector) Validate(config CollectConfig) error
Validate 验证配置
type Article ¶
type Article struct { ID string `json:"id"` Title string `json:"title"` Content string `json:"content"` Summary string `json:"summary"` Author string `json:"author"` URL string `json:"url"` PublishedAt time.Time `json:"published_at"` Tags []string `json:"tags"` Source string `json:"source"` SourceType string `json:"source_type"` Language string `json:"language"` Metadata map[string]string `json:"metadata"` }
Article 表示采集到的文章数据
func AggregateResults ¶
func AggregateResults(results []CollectResult) ([]Article, []error)
AggregateResults 聚合多个采集结果
func DeduplicateArticles ¶
DeduplicateArticles 去重文章
type ArticleFilter ¶
type ArticleFilter struct { Keywords []string // 关键词过滤 Authors []string // 作者过滤 Tags []string // 标签过滤 Languages []string // 语言过滤 DateFrom time.Time // 起始日期 DateTo time.Time // 结束日期 MinLength int // 最小内容长度 MaxLength int // 最大内容长度 }
ArticleFilter 定义过滤文章时使用的条件
func (*ArticleFilter) FilterArticles ¶
func (af *ArticleFilter) FilterArticles(articles []Article) []Article
FilterArticles 过滤文章
type AtomAuthor ¶
type AtomCategory ¶
type AtomCategory struct {
Term string `xml:"term,attr"`
}
type AtomContent ¶
type AtomEntry ¶
type AtomEntry struct { Title string `xml:"title"` Content AtomContent `xml:"content"` Summary string `xml:"summary"` Link []AtomLink `xml:"link"` ID string `xml:"id"` Published string `xml:"published"` Updated string `xml:"updated"` Author AtomAuthor `xml:"author"` Category []AtomCategory `xml:"category"` }
type AtomFeed ¶
type AtomFeed struct { XMLName xml.Name `xml:"feed"` Title string `xml:"title"` Link []AtomLink `xml:"link"` Entries []AtomEntry `xml:"entry"` }
AtomFeed 表示 Atom feed 的结构定义
type BatchCollector ¶
type BatchCollector struct {
// contains filtered or unexported fields
}
BatchCollector 批量采集器
func NewBatchCollector ¶
func NewBatchCollector(manager CollectorManager, maxConcurrent int, timeout time.Duration) *BatchCollector
NewBatchCollector 创建批量采集器
func (*BatchCollector) CollectBatch ¶
func (bc *BatchCollector) CollectBatch(ctx context.Context, configs []CollectConfig) []CollectResult
CollectBatch 批量采集,支持并发控制
type CollectConfig ¶
type CollectConfig struct { URL string `json:"url"` Headers map[string]string `json:"headers,omitempty"` Timeout time.Duration `json:"timeout,omitempty"` MaxArticles int `json:"max_articles,omitempty"` Language string `json:"language,omitempty"` Tags []string `json:"tags,omitempty"` Metadata map[string]string `json:"metadata,omitempty"` }
CollectConfig 表示采集配置
type CollectResult ¶
type CollectResult struct { Articles []Article `json:"articles"` Source string `json:"source"` Error error `json:"error,omitempty"` }
CollectResult 表示采集结果
type CollectorManager ¶
type CollectorManager interface { // RegisterCollector 注册采集器 RegisterCollector(sourceType string, collector DataCollector) // CollectAll 并发采集多个数据源 CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult // CollectWithRetry 带重试机制的采集 CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error) // GetCollector 根据类型获取采集器 GetCollector(sourceType string) (DataCollector, bool) }
CollectorManager 管理所有采集器并提供并发采集能力
type CollectorManagerImpl ¶
type CollectorManagerImpl struct {
// contains filtered or unexported fields
}
CollectorManagerImpl 采集器管理器实现
func (*CollectorManagerImpl) CollectAll ¶
func (cm *CollectorManagerImpl) CollectAll(ctx context.Context, configs []CollectConfig) []CollectResult
CollectAll 并发采集多个数据源
func (*CollectorManagerImpl) CollectWithRetry ¶
func (cm *CollectorManagerImpl) CollectWithRetry(ctx context.Context, config CollectConfig, retryConfig RetryConfig) (CollectResult, error)
CollectWithRetry 带重试机制的采集
func (*CollectorManagerImpl) GetCollector ¶
func (cm *CollectorManagerImpl) GetCollector(sourceType string) (DataCollector, bool)
GetCollector 根据类型获取采集器
func (*CollectorManagerImpl) RegisterCollector ¶
func (cm *CollectorManagerImpl) RegisterCollector(sourceType string, collector DataCollector)
RegisterCollector 注册采集器
type DataCollector ¶
type DataCollector interface { // Collect 采集数据,返回文章列表 Collect(ctx context.Context, config CollectConfig) (CollectResult, error) // GetSourceType 返回采集器类型 GetSourceType() string // Validate 验证配置是否有效 Validate(config CollectConfig) error }
DataCollector 定义数据采集器的统一接口
type DevToArticle ¶
type DevToArticle struct { ID int `json:"id"` Title string `json:"title"` Description string `json:"description"` BodyMarkdown string `json:"body_markdown"` URL string `json:"url"` PublishedAt string `json:"published_at"` CreatedAt string `json:"created_at"` TagList []string `json:"tag_list"` User DevToUser `json:"user"` Organization *DevToOrg `json:"organization"` ReadingTimeMinutes int `json:"reading_time_minutes"` }
DevToArticle 表示 Dev.to API 的响应结构
type GitHubIssue ¶
type GitHubIssue struct { ID int `json:"id"` Number int `json:"number"` Title string `json:"title"` Body string `json:"body"` State string `json:"state"` HTMLURL string `json:"html_url"` CreatedAt string `json:"created_at"` UpdatedAt string `json:"updated_at"` User struct { Login string `json:"login"` } `json:"user"` Labels []struct { Name string `json:"name"` } `json:"labels"` }
type GitHubRepo ¶
type GitHubRepo struct { ID int `json:"id"` Name string `json:"name"` FullName string `json:"full_name"` Description string `json:"description"` HTMLURL string `json:"html_url"` Language string `json:"language"` CreatedAt string `json:"created_at"` UpdatedAt string `json:"updated_at"` PushedAt string `json:"pushed_at"` StargazersCount int `json:"stargazers_count"` ForksCount int `json:"forks_count"` WatchersCount int `json:"watchers_count"` OpenIssuesCount int `json:"open_issues_count"` Fork bool `json:"fork"` Owner struct { Login string `json:"login"` } `json:"owner"` Topics []string `json:"topics"` }
GitHubRepo 表示 GitHub API 的响应结构
type HTMLCollector ¶
type HTMLCollector struct {
// contains filtered or unexported fields
}
HTMLCollector HTML网页采集器
func (*HTMLCollector) Collect ¶
func (h *HTMLCollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)
Collect 采集HTML页面数据
func (*HTMLCollector) GetSourceType ¶
func (h *HTMLCollector) GetSourceType() string
GetSourceType 返回采集器类型
func (*HTMLCollector) Validate ¶
func (h *HTMLCollector) Validate(config CollectConfig) error
Validate 验证配置
type HTMLSelector ¶
type HTMLSelector struct { Title string `json:"title"` // 标题选择器 Content string `json:"content"` // 内容选择器 Summary string `json:"summary"` // 摘要选择器 Author string `json:"author"` // 作者选择器 PublishedAt string `json:"published_at"` // 发布时间选择器 Tags string `json:"tags"` // 标签选择器 Links string `json:"links"` // 链接选择器 }
HTMLSelector 定义HTML选择器配置
type RSSCollector ¶
type RSSCollector struct {
// contains filtered or unexported fields
}
RSSCollector RSS数据采集器
func (*RSSCollector) Collect ¶
func (r *RSSCollector) Collect(ctx context.Context, config CollectConfig) (CollectResult, error)
Collect 采集RSS数据
func (*RSSCollector) GetSourceType ¶
func (r *RSSCollector) GetSourceType() string
GetSourceType 返回采集器类型
func (*RSSCollector) Validate ¶
func (r *RSSCollector) Validate(config CollectConfig) error
Validate 验证配置
type RetryConfig ¶
type RetryConfig struct { MaxRetries int `json:"max_retries"` RetryDelay time.Duration `json:"retry_delay"` }
RetryConfig 重试配置