Documentation
¶
Index ¶
- Constants
- type ContentDeduplicationJoint
- type EmptyJoint
- type FetchJoint
- type FilterCheckJoint
- type HashJoint
- type HtmlToTextJoint
- type IgnoreTimeoutJoint
- type IndexJoint
- type InitTaskJoint
- type LanguageDetectJoint
- type LoadMetadataJoint
- type ParsePageJoint
- type SaveSnapshotToDBJoint
- type SaveSnapshotToFileSystemJoint
- type SaveTaskJoint
- type TaskDeduplicationJoint
- type UpdateCheckTimeJoint
- type UrlFilterJoint
- type UrlNormalizationJoint
Constants ¶
const ( CONTEXT_TASK_ID api.ParaKey = "CRAWLER_TASK_ID" CONTEXT_CRAWLER_DOMAIN api.ParaKey = "CRAWLER_DOMAIN" CONTEXT_CRAWLER_TASK api.ParaKey = "CRAWLER_TASK" CONTEXT_CRAWLER_SNAPSHOT api.ParaKey = "CRAWLER_SNAPSHOT" CONTEXT_PAGE_LINKS api.ParaKey = "PAGE_LINKS" )
Crawler common pipeline context keys
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ContentDeduplicationJoint ¶ added in v0.9.0
type ContentDeduplicationJoint struct {
api.Parameters
}
ContentDeduplicationJoint is used to check the hash of the page body; if a duplicate hash already exists, it breaks the pipeline.
func (ContentDeduplicationJoint) Name ¶ added in v0.9.0
func (joint ContentDeduplicationJoint) Name() string
Name returns: content_deduplication
type FetchJoint ¶
type FetchJoint struct {
Parameters
// contains filtered or unexported fields
}
func (FetchJoint) Name ¶
func (joint FetchJoint) Name() string
func (FetchJoint) Process ¶
func (joint FetchJoint) Process(context *Context) error
type FilterCheckJoint ¶ added in v0.9.0
type FilterCheckJoint struct {
api.Parameters
//ignore files end with js,css,apk,zip
SkipPageParsePattern *regexp.Regexp
}
FilterCheckJoint is used to check whether the task URL is already in the filter; if it is not, the URL is added to the task filter so that it won't be added again next time.
func (FilterCheckJoint) Name ¶ added in v0.9.0
func (joint FilterCheckJoint) Name() string
Name returns: filter_check
type HashJoint ¶
type HashJoint struct {
api.Parameters
}
type HtmlToTextJoint ¶
type HtmlToTextJoint struct {
Parameters
}
func (HtmlToTextJoint) Name ¶
func (joint HtmlToTextJoint) Name() string
func (HtmlToTextJoint) Process ¶
func (joint HtmlToTextJoint) Process(context *Context) error
type IgnoreTimeoutJoint ¶
type IgnoreTimeoutJoint struct {
api.Parameters
}
func (IgnoreTimeoutJoint) Name ¶
func (joint IgnoreTimeoutJoint) Name() string
type IndexJoint ¶
type IndexJoint struct {
}
IndexJoint is used to send snapshot and task info into index
type InitTaskJoint ¶
type InitTaskJoint struct {
api.Parameters
}
InitTaskJoint basically starts the pipeline process: it constructs a model.Task, which may be loaded from the db with CONTEXT_TASK_ID or manually passed in with CONTEXT_CRAWLER_TASK.
type LanguageDetectJoint ¶ added in v0.9.0
type LanguageDetectJoint struct {
}
LanguageDetectJoint is used to detect the language of the webpage
func (LanguageDetectJoint) Name ¶ added in v0.9.0
func (joint LanguageDetectJoint) Name() string
Name returns: lang_detect
type ParsePageJoint ¶
type ParsePageJoint struct {
Parameters
MaxPageOfBreadth map[int]int //max page to fetch in each level's breadth, eg: 1:100;2:50;3:5;4:1
}
func (ParsePageJoint) Name ¶
func (joint ParsePageJoint) Name() string
func (ParsePageJoint) Process ¶
func (joint ParsePageJoint) Process(context *Context) error
type SaveSnapshotToDBJoint ¶
type SaveSnapshotToDBJoint struct {
Parameters
}
func (SaveSnapshotToDBJoint) Name ¶
func (this SaveSnapshotToDBJoint) Name() string
func (SaveSnapshotToDBJoint) Process ¶
func (this SaveSnapshotToDBJoint) Process(c *Context) error
type SaveSnapshotToFileSystemJoint ¶
type SaveSnapshotToFileSystemJoint struct {
// contains filtered or unexported fields
}
func (SaveSnapshotToFileSystemJoint) Name ¶
func (joint SaveSnapshotToFileSystemJoint) Name() string
func (SaveSnapshotToFileSystemJoint) Process ¶
func (joint SaveSnapshotToFileSystemJoint) Process(c *Context) error
type SaveTaskJoint ¶
type SaveTaskJoint struct {
Parameters
}
func (SaveTaskJoint) IsCreate ¶
func (joint SaveTaskJoint) IsCreate(v bool) SaveTaskJoint
func (SaveTaskJoint) Name ¶
func (joint SaveTaskJoint) Name() string
func (SaveTaskJoint) Process ¶
func (joint SaveTaskJoint) Process(context *Context) error
type TaskDeduplicationJoint ¶ added in v0.9.0
type TaskDeduplicationJoint struct {
}
TaskDeduplicationJoint is used to find whether the task is already in the database
func (TaskDeduplicationJoint) Name ¶ added in v0.9.0
func (joint TaskDeduplicationJoint) Name() string
Name returns: task_deduplication
type UpdateCheckTimeJoint ¶ added in v0.9.0
type UpdateCheckTimeJoint struct {
Parameters
}
func (UpdateCheckTimeJoint) Name ¶ added in v0.9.0
func (this UpdateCheckTimeJoint) Name() string
func (UpdateCheckTimeJoint) Process ¶ added in v0.9.0
func (this UpdateCheckTimeJoint) Process(c *Context) error
type UrlFilterJoint ¶ added in v0.9.0
type UrlFilterJoint struct {
api.Parameters
}
UrlFilterJoint is used to validate URLs, including host, path, file and file extension
func (UrlFilterJoint) Name ¶ added in v0.9.0
func (joint UrlFilterJoint) Name() string
Name is url_filter
type UrlNormalizationJoint ¶
type UrlNormalizationJoint struct {
api.Parameters
// contains filtered or unexported fields
}
UrlNormalizationJoint is used to clean up the URL and normalize it
func (UrlNormalizationJoint) Name ¶
func (joint UrlNormalizationJoint) Name() string
Name of this joint is: url_normalization
Source Files
¶
- content_deduplication.go
- context_key.go
- empty.go
- fetch.go
- filter_check.go
- hash.go
- html_to_text.go
- ignore_timeout.go
- index.go
- lang_detect.go
- load_metadata.go
- local_network_filter.go
- parse.go
- robots.go
- save_fs.go
- save_snapshot.go
- save_task.go
- start.go
- task_deduplication.go
- update_check_time.go
- url_filter.go
- url_normalization.go