pipe

package

v0.9.0 Latest Latest Go to latest Published: Oct 20, 2017 License: Apache-2.0 Imports: 31 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/infinitbyte/gopa

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
type ContentDeduplicationJoint
- func (joint ContentDeduplicationJoint) Name() string
- func (joint ContentDeduplicationJoint) Process(c *api.Context) error
type EmptyJoint
- func (joint EmptyJoint) Name() string
- func (joint EmptyJoint) Process(s *api.Context) error
type FetchJoint
- func (joint FetchJoint) Name() string
- func (joint FetchJoint) Process(context *Context) error
type FilterCheckJoint
- func (joint FilterCheckJoint) Name() string
- func (joint FilterCheckJoint) Process(context *api.Context) error
type HashJoint
- func (joint HashJoint) Name() string
- func (joint HashJoint) Process(context *api.Context) error
type HtmlToTextJoint
- func (joint HtmlToTextJoint) Name() string
- func (joint HtmlToTextJoint) Process(context *Context) error
type IgnoreTimeoutJoint
- func (joint IgnoreTimeoutJoint) Name() string
- func (joint IgnoreTimeoutJoint) Process(context *api.Context) error
type IndexJoint
- func (joint IndexJoint) Name() string
- func (joint IndexJoint) Process(c *api.Context) error
type InitTaskJoint
- func (joint InitTaskJoint) Name() string
- func (joint InitTaskJoint) Process(context *api.Context) error
type LanguageDetectJoint
- func (joint LanguageDetectJoint) Name() string
- func (joint LanguageDetectJoint) Process(c *api.Context) error
type LoadMetadataJoint
- func (joint LoadMetadataJoint) Name() string
- func (joint LoadMetadataJoint) Process(context *api.Context) error
type ParsePageJoint
- func (joint ParsePageJoint) Name() string
- func (joint ParsePageJoint) Process(context *Context) error
type SaveSnapshotToDBJoint
- func (this SaveSnapshotToDBJoint) Name() string
- func (this SaveSnapshotToDBJoint) Process(c *Context) error
type SaveSnapshotToFileSystemJoint
- func (joint SaveSnapshotToFileSystemJoint) Name() string
- func (joint SaveSnapshotToFileSystemJoint) Process(c *Context) error
type SaveTaskJoint
- func (joint SaveTaskJoint) IsCreate(v bool) SaveTaskJoint
- func (joint SaveTaskJoint) Name() string
- func (joint SaveTaskJoint) Process(context *Context) error
type TaskDeduplicationJoint
- func (joint TaskDeduplicationJoint) Name() string
- func (joint TaskDeduplicationJoint) Process(c *api.Context) error
type UpdateCheckTimeJoint
- func (this UpdateCheckTimeJoint) Name() string
- func (this UpdateCheckTimeJoint) Process(c *Context) error
type UrlFilterJoint
- func (joint UrlFilterJoint) Name() string
- func (joint UrlFilterJoint) Process(context *api.Context) error
type UrlNormalizationJoint
- func (joint UrlNormalizationJoint) Name() string
- func (joint UrlNormalizationJoint) Process(context *api.Context) error

Constants ¶

View Source

const (
	CONTEXT_TASK_ID          api.ParaKey = "CRAWLER_TASK_ID"
	CONTEXT_CRAWLER_DOMAIN   api.ParaKey = "CRAWLER_DOMAIN"
	CONTEXT_CRAWLER_TASK     api.ParaKey = "CRAWLER_TASK"
	CONTEXT_CRAWLER_SNAPSHOT api.ParaKey = "CRAWLER_SNAPSHOT"

	CONTEXT_PAGE_LINKS api.ParaKey = "PAGE_LINKS"
)

Common context keys for the crawler pipeline.

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type ContentDeduplicationJoint ¶ added in v0.9.0

type ContentDeduplicationJoint struct {
	api.Parameters
}

ContentDeduplicationJoint is used to check the hash of the page body; if a duplicate hash already exists, it breaks the pipeline.

func (ContentDeduplicationJoint) Name ¶ added in v0.9.0

func (joint ContentDeduplicationJoint) Name() string

Name return: content_deduplication

func (ContentDeduplicationJoint) Process ¶ added in v0.9.0

func (joint ContentDeduplicationJoint) Process(c *api.Context) error

Process performs the content hash deduplication.

type EmptyJoint ¶

type EmptyJoint struct {
}

EmptyJoint is a placeholder.

func (EmptyJoint) Name ¶

func (joint EmptyJoint) Name() string

Name return empty

func (EmptyJoint) Process ¶

func (joint EmptyJoint) Process(s *api.Context) error

Process does nothing.

type FetchJoint ¶

type FetchJoint struct {
	Parameters
	// contains filtered or unexported fields
}

func (FetchJoint) Name ¶

func (joint FetchJoint) Name() string

func (FetchJoint) Process ¶

func (joint FetchJoint) Process(context *Context) error

type FilterCheckJoint ¶ added in v0.9.0

type FilterCheckJoint struct {
	api.Parameters
	//ignore files end with js,css,apk,zip
	SkipPageParsePattern *regexp.Regexp
}

FilterCheckJoint is used to check whether the task URL is already in the filter; if it is not, the URL is added to the task filter so that it won't be added again next time.

func (FilterCheckJoint) Name ¶ added in v0.9.0

func (joint FilterCheckJoint) Name() string

Name return: filter_check

func (FilterCheckJoint) Process ¶ added in v0.9.0

func (joint FilterCheckJoint) Process(context *api.Context) error

Process performs the filter check and adds the task URL to the filter.

type HashJoint ¶

type HashJoint struct {
	api.Parameters
}

func (HashJoint) Name ¶

func (joint HashJoint) Name() string

func (HashJoint) Process ¶

func (joint HashJoint) Process(context *api.Context) error

type HtmlToTextJoint ¶

type HtmlToTextJoint struct {
	Parameters
}

func (HtmlToTextJoint) Name ¶

func (joint HtmlToTextJoint) Name() string

func (HtmlToTextJoint) Process ¶

func (joint HtmlToTextJoint) Process(context *Context) error

type IgnoreTimeoutJoint ¶

type IgnoreTimeoutJoint struct {
	api.Parameters
}

func (IgnoreTimeoutJoint) Name ¶

func (joint IgnoreTimeoutJoint) Name() string

func (IgnoreTimeoutJoint) Process ¶

func (joint IgnoreTimeoutJoint) Process(context *api.Context) error

type IndexJoint ¶

type IndexJoint struct {
}

IndexJoint is used to send snapshot and task info into index

func (IndexJoint) Name ¶

func (joint IndexJoint) Name() string

Name return index

func (IndexJoint) Process ¶

func (joint IndexJoint) Process(c *api.Context) error

Process wraps the index document and sends it to the queue.

type InitTaskJoint ¶

type InitTaskJoint struct {
	api.Parameters
}

InitTaskJoint starts the pipeline process by constructing a model.Task, which may be loaded from the database using CONTEXT_TASK_ID or passed in manually via CONTEXT_CRAWLER_TASK.

func (InitTaskJoint) Name ¶

func (joint InitTaskJoint) Name() string

Name return: init_task

func (InitTaskJoint) Process ¶

func (joint InitTaskJoint) Process(context *api.Context) error

Process loads the task and initializes a new snapshot instance.

type LanguageDetectJoint ¶ added in v0.9.0

type LanguageDetectJoint struct {
}

LanguageDetectJoint is used to detect the language of the web page.

func (LanguageDetectJoint) Name ¶ added in v0.9.0

func (joint LanguageDetectJoint) Name() string

Name return lang_detect

func (LanguageDetectJoint) Process ¶ added in v0.9.0

func (joint LanguageDetectJoint) Process(c *api.Context) error

Process language detect

type LoadMetadataJoint ¶

type LoadMetadataJoint struct {
}

LoadMetadataJoint loads metadata from the database.

func (LoadMetadataJoint) Name ¶

func (joint LoadMetadataJoint) Name() string

Name is load_metadata

func (LoadMetadataJoint) Process ¶

func (joint LoadMetadataJoint) Process(context *api.Context) error

Process load metadata

type ParsePageJoint ¶

type ParsePageJoint struct {
	Parameters
	MaxPageOfBreadth map[int]int //max page to fetch in each level's breadth, eg: 1:100;2:50;3:5;4:1

}

func (ParsePageJoint) Name ¶

func (joint ParsePageJoint) Name() string

func (ParsePageJoint) Process ¶

func (joint ParsePageJoint) Process(context *Context) error

type SaveSnapshotToDBJoint ¶

type SaveSnapshotToDBJoint struct {
	Parameters
}

func (SaveSnapshotToDBJoint) Name ¶

func (this SaveSnapshotToDBJoint) Name() string

func (SaveSnapshotToDBJoint) Process ¶

func (this SaveSnapshotToDBJoint) Process(c *Context) error

type SaveSnapshotToFileSystemJoint ¶

type SaveSnapshotToFileSystemJoint struct {
	// contains filtered or unexported fields
}

func (SaveSnapshotToFileSystemJoint) Name ¶

func (joint SaveSnapshotToFileSystemJoint) Name() string

func (SaveSnapshotToFileSystemJoint) Process ¶

func (joint SaveSnapshotToFileSystemJoint) Process(c *Context) error

type SaveTaskJoint ¶

type SaveTaskJoint struct {
	Parameters
}

func (SaveTaskJoint) IsCreate ¶

func (joint SaveTaskJoint) IsCreate(v bool) SaveTaskJoint

func (SaveTaskJoint) Name ¶

func (joint SaveTaskJoint) Name() string

func (SaveTaskJoint) Process ¶

func (joint SaveTaskJoint) Process(context *Context) error

type TaskDeduplicationJoint ¶ added in v0.9.0

type TaskDeduplicationJoint struct {
}

TaskDeduplicationJoint is used to check whether the task already exists in the database.

func (TaskDeduplicationJoint) Name ¶ added in v0.9.0

func (joint TaskDeduplicationJoint) Name() string

Name return task_deduplication

func (TaskDeduplicationJoint) Process ¶ added in v0.9.0

func (joint TaskDeduplicationJoint) Process(c *api.Context) error

Process deduplication

type UpdateCheckTimeJoint ¶ added in v0.9.0

type UpdateCheckTimeJoint struct {
	Parameters
}

func (UpdateCheckTimeJoint) Name ¶ added in v0.9.0

func (this UpdateCheckTimeJoint) Name() string

func (UpdateCheckTimeJoint) Process ¶ added in v0.9.0

func (this UpdateCheckTimeJoint) Process(c *Context) error

type UrlFilterJoint ¶ added in v0.9.0

type UrlFilterJoint struct {
	api.Parameters
}

UrlFilterJoint is used to validate URLs, including the host, path, file, and file extension.

func (UrlFilterJoint) Name ¶ added in v0.9.0

func (joint UrlFilterJoint) Name() string

Name is url_filter

func (UrlFilterJoint) Process ¶ added in v0.9.0

func (joint UrlFilterJoint) Process(context *api.Context) error

Process checks all the URL match rules.

type UrlNormalizationJoint ¶

type UrlNormalizationJoint struct {
	api.Parameters
	// contains filtered or unexported fields
}

UrlNormalizationJoint is used to clean up and normalize URLs.

func (UrlNormalizationJoint) Name ¶

func (joint UrlNormalizationJoint) Name() string

Name of this joint is: url_normalization

func (UrlNormalizationJoint) Process ¶

func (joint UrlNormalizationJoint) Process(context *api.Context) error

Process handles relative URLs and cleans up the URL.

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL