pipe

package
v0.9.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 20, 2017 License: Apache-2.0 Imports: 31 Imported by: 0

Documentation

Index

Constants

View Source
const (
	CONTEXT_TASK_ID          api.ParaKey = "CRAWLER_TASK_ID"
	CONTEXT_CRAWLER_DOMAIN   api.ParaKey = "CRAWLER_DOMAIN"
	CONTEXT_CRAWLER_TASK     api.ParaKey = "CRAWLER_TASK"
	CONTEXT_CRAWLER_SNAPSHOT api.ParaKey = "CRAWLER_SNAPSHOT"

	CONTEXT_PAGE_LINKS api.ParaKey = "PAGE_LINKS"
)

Common pipeline context keys used by the crawler.

Variables

This section is empty.

Functions

This section is empty.

Types

type ContentDeduplicationJoint added in v0.9.0

type ContentDeduplicationJoint struct {
	api.Parameters
}

ContentDeduplicationJoint is used to check the hash of the page body; if a duplicate hash already exists, it breaks the pipeline.

func (ContentDeduplicationJoint) Name added in v0.9.0

func (joint ContentDeduplicationJoint) Name() string

Name returns: content_deduplication

func (ContentDeduplicationJoint) Process added in v0.9.0

func (joint ContentDeduplicationJoint) Process(c *api.Context) error

Process performs the content hash deduplication.

type EmptyJoint

type EmptyJoint struct {
}

EmptyJoint is a placeholder.

func (EmptyJoint) Name

func (joint EmptyJoint) Name() string

Name returns empty

func (EmptyJoint) Process

func (joint EmptyJoint) Process(s *api.Context) error

Process does nothing.

type FetchJoint

type FetchJoint struct {
	Parameters
	// contains filtered or unexported fields
}

func (FetchJoint) Name

func (joint FetchJoint) Name() string

func (FetchJoint) Process

func (joint FetchJoint) Process(context *Context) error

type FilterCheckJoint added in v0.9.0

type FilterCheckJoint struct {
	api.Parameters
	//ignore files end with js,css,apk,zip
	SkipPageParsePattern *regexp.Regexp
}

FilterCheckJoint is used to check whether the task URL is already in the filter; if it is not, the URL is added to the task filter so that it won't be added again next time.

func (FilterCheckJoint) Name added in v0.9.0

func (joint FilterCheckJoint) Name() string

Name returns: filter_check

func (FilterCheckJoint) Process added in v0.9.0

func (joint FilterCheckJoint) Process(context *api.Context) error

Process performs the filter check and adds the task URL to the filter.

type HashJoint

type HashJoint struct {
	api.Parameters
}

func (HashJoint) Name

func (joint HashJoint) Name() string

func (HashJoint) Process

func (joint HashJoint) Process(context *api.Context) error

type HtmlToTextJoint

type HtmlToTextJoint struct {
	Parameters
}

func (HtmlToTextJoint) Name

func (joint HtmlToTextJoint) Name() string

func (HtmlToTextJoint) Process

func (joint HtmlToTextJoint) Process(context *Context) error

type IgnoreTimeoutJoint

type IgnoreTimeoutJoint struct {
	api.Parameters
}

func (IgnoreTimeoutJoint) Name

func (joint IgnoreTimeoutJoint) Name() string

func (IgnoreTimeoutJoint) Process

func (joint IgnoreTimeoutJoint) Process(context *api.Context) error

type IndexJoint

type IndexJoint struct {
}

IndexJoint is used to send snapshot and task info to the index.

func (IndexJoint) Name

func (joint IndexJoint) Name() string

Name returns: index

func (IndexJoint) Process

func (joint IndexJoint) Process(c *api.Context) error

Process wraps the index document and sends it to the queue.

type InitTaskJoint

type InitTaskJoint struct {
	api.Parameters
}

InitTaskJoint starts the pipeline process and constructs a model.Task, which may be loaded from the db with CONTEXT_TASK_ID or manually passed in with CONTEXT_CRAWLER_TASK.

func (InitTaskJoint) Name

func (joint InitTaskJoint) Name() string

Name returns: init_task

func (InitTaskJoint) Process

func (joint InitTaskJoint) Process(context *api.Context) error

Process loads the task and initializes a new snapshot instance.

type LanguageDetectJoint added in v0.9.0

type LanguageDetectJoint struct {
}

LanguageDetectJoint is used to detect the language of the webpage.

func (LanguageDetectJoint) Name added in v0.9.0

func (joint LanguageDetectJoint) Name() string

Name returns: lang_detect

func (LanguageDetectJoint) Process added in v0.9.0

func (joint LanguageDetectJoint) Process(c *api.Context) error

Process performs the language detection.

type LoadMetadataJoint

type LoadMetadataJoint struct {
}

LoadMetadataJoint loads metadata from the db.

func (LoadMetadataJoint) Name

func (joint LoadMetadataJoint) Name() string

Name is load_metadata

func (LoadMetadataJoint) Process

func (joint LoadMetadataJoint) Process(context *api.Context) error

Process loads the metadata.

type ParsePageJoint

type ParsePageJoint struct {
	Parameters
	MaxPageOfBreadth map[int]int //max page to fetch in each level's breadth, eg: 1:100;2:50;3:5;4:1

}

func (ParsePageJoint) Name

func (joint ParsePageJoint) Name() string

func (ParsePageJoint) Process

func (joint ParsePageJoint) Process(context *Context) error

type SaveSnapshotToDBJoint

type SaveSnapshotToDBJoint struct {
	Parameters
}

func (SaveSnapshotToDBJoint) Name

func (this SaveSnapshotToDBJoint) Name() string

func (SaveSnapshotToDBJoint) Process

func (this SaveSnapshotToDBJoint) Process(c *Context) error

type SaveSnapshotToFileSystemJoint

type SaveSnapshotToFileSystemJoint struct {
	// contains filtered or unexported fields
}

func (SaveSnapshotToFileSystemJoint) Name

func (SaveSnapshotToFileSystemJoint) Process

func (joint SaveSnapshotToFileSystemJoint) Process(c *Context) error

type SaveTaskJoint

type SaveTaskJoint struct {
	Parameters
}

func (SaveTaskJoint) IsCreate

func (joint SaveTaskJoint) IsCreate(v bool) SaveTaskJoint

func (SaveTaskJoint) Name

func (joint SaveTaskJoint) Name() string

func (SaveTaskJoint) Process

func (joint SaveTaskJoint) Process(context *Context) error

type TaskDeduplicationJoint added in v0.9.0

type TaskDeduplicationJoint struct {
}

TaskDeduplicationJoint is used to find whether the task is already in the database.

func (TaskDeduplicationJoint) Name added in v0.9.0

func (joint TaskDeduplicationJoint) Name() string

Name returns: task_deduplication

func (TaskDeduplicationJoint) Process added in v0.9.0

func (joint TaskDeduplicationJoint) Process(c *api.Context) error

Process performs the task deduplication.

type UpdateCheckTimeJoint added in v0.9.0

type UpdateCheckTimeJoint struct {
	Parameters
}

func (UpdateCheckTimeJoint) Name added in v0.9.0

func (this UpdateCheckTimeJoint) Name() string

func (UpdateCheckTimeJoint) Process added in v0.9.0

func (this UpdateCheckTimeJoint) Process(c *Context) error

type UrlFilterJoint added in v0.9.0

type UrlFilterJoint struct {
	api.Parameters
}

UrlFilterJoint is used to validate URLs, including the host, path, file, and file extension.

func (UrlFilterJoint) Name added in v0.9.0

func (joint UrlFilterJoint) Name() string

Name is url_filter

func (UrlFilterJoint) Process added in v0.9.0

func (joint UrlFilterJoint) Process(context *api.Context) error

Process checks all the URL match rules.

type UrlNormalizationJoint

type UrlNormalizationJoint struct {
	api.Parameters
	// contains filtered or unexported fields
}

UrlNormalizationJoint is used to clean up URLs and normalize them.

func (UrlNormalizationJoint) Name

func (joint UrlNormalizationJoint) Name() string

Name of this joint is: url_normalization

func (UrlNormalizationJoint) Process

func (joint UrlNormalizationJoint) Process(context *api.Context) error

Process handles relative URLs and cleans up the URL.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL