spider

package
v0.0.0-...-ef324e3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 11, 2021 License: MIT Imports: 13 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AppendManager

func AppendManager(r IManagerRunner)

func DumpData

func DumpData(texts []*config.Text)

func InitGlobalLog

func InitGlobalLog(name string)

Logs imported from the original "log" package are the global log. Generally, the global log is used for framework-level information and serious errors.

func InitSpiderLog

func InitSpiderLog(name string)

Logs imported from "slog" are the spider log.

func NewDefaultDownloadPart

func NewDefaultDownloadPart(name string, link *config.Link) config.IDownloaderPart

func NewDefaultResolverPart

func NewDefaultResolverPart(name string, rawData *config.RawData) config.IResolverPart

func RunAll

func RunAll()

Types

type DefaultDownloadPart

// DefaultDownloadPart embeds DownloadPart, so it carries DownloadPart's fields
// and promoted methods; on top of that it declares GenerateRawData.
type DefaultDownloadPart struct {
	DownloadPart
}

func (*DefaultDownloadPart) GenerateRawData

func (this *DefaultDownloadPart) GenerateRawData() (*config.RawData, error)

type DefaultResolverPart

// DefaultResolverPart embeds ResolverPart, so it carries ResolverPart's fields
// and promoted methods; on top of that it declares GenerateLinks and
// GenerateText.
type DefaultResolverPart struct {
	ResolverPart
}

func (*DefaultResolverPart) GenerateLinks

func (this *DefaultResolverPart) GenerateLinks() ([]*config.Link, error)

The default behavior is to resolve the page and collect all links found in it.

func (*DefaultResolverPart) GenerateText

func (this *DefaultResolverPart) GenerateText() (*config.Text, error)

The default behavior is to return the HTML data directly, not the pure text.

type DownloadPart

// DownloadPart is the base download part type. All of its fields are
// unexported; it is set up through InitObject, which takes a name and a
// *config.Link.
type DownloadPart struct {
	// contains filtered or unexported fields
}

func (*DownloadPart) GetLink

func (this *DownloadPart) GetLink() *config.Link

func (*DownloadPart) GetLogName

func (this *DownloadPart) GetLogName() string

func (*DownloadPart) GetName

func (this *DownloadPart) GetName() string
// NewDownloadPart allocates a zero-valued DownloadPart, initializes it with
// the given name and link through InitObject, and returns it as a layer.IPart.
func NewDownloadPart(name string, link *config.Link) layer.IPart {
	part := &DownloadPart{}
	part.InitObject(name, link)
	return part
}

func (*DownloadPart) GetPipeline

func (this *DownloadPart) GetPipeline() layer.IPipeline

func (*DownloadPart) GetUnit

func (this *DownloadPart) GetUnit() layer.IUnit

func (*DownloadPart) InitObject

func (this *DownloadPart) InitObject(name string, link *config.Link)

func (*DownloadPart) InitPart

func (this *DownloadPart) InitPart()

func (*DownloadPart) Run

func (this *DownloadPart) Run() (layer.IPart, interface{})

func (*DownloadPart) SetDownloader

func (this *DownloadPart) SetDownloader(downloader config.IDownloader)

func (*DownloadPart) SetPipeline

func (this *DownloadPart) SetPipeline(pipeline layer.IPipeline)

func (*DownloadPart) SetRawData

func (this *DownloadPart) SetRawData(rawData *config.RawData)

func (*DownloadPart) SetUnit

func (this *DownloadPart) SetUnit(unit layer.IUnit)

type IManagerRunner

// IManagerRunner is the interface accepted by AppendManager: a value that can
// be run and that reports a name. *SpiderManager declares both methods.
type IManagerRunner interface {
	// Run executes the manager.
	Run()
	// GetName returns the manager's name.
	GetName() string
}

type PartProxy

// PartProxy is constructed by NewPartProxy from a layer.IPart. All of its
// fields are unexported.
// NOTE(review): presumably it wraps and delegates to that part — confirm
// against the implementation.
type PartProxy struct {
	// contains filtered or unexported fields
}

func NewPartProxy

func NewPartProxy(part layer.IPart) *PartProxy

func (*PartProxy) Forward

func (this *PartProxy) Forward() (layer.IPartProxy, interface{})

func (*PartProxy) GetName

func (this *PartProxy) GetName() string

func (*PartProxy) GetUnit

func (this *PartProxy) GetUnit() layer.IUnit

func (*PartProxy) InitPart

func (this *PartProxy) InitPart()

func (*PartProxy) InitPartProxy

func (this *PartProxy) InitPartProxy()

func (*PartProxy) Run

func (this *PartProxy) Run() (layer.IPart, interface{})

func (*PartProxy) SetPipeline

func (this *PartProxy) SetPipeline(pipeline layer.IPipeline)

func (*PartProxy) SetUnit

func (this *PartProxy) SetUnit(unit layer.IUnit)

type ResolverPart

// ResolverPart is the base resolver part type. All of its fields are
// unexported; it is set up through InitObject, which takes a name and a
// *config.RawData.
type ResolverPart struct {
	// contains filtered or unexported fields
}

func (*ResolverPart) GetLogName

func (this *ResolverPart) GetLogName() string

func (*ResolverPart) GetName

func (this *ResolverPart) GetName() string

func (*ResolverPart) GetPipeline

func (this *ResolverPart) GetPipeline() layer.IPipeline

func (*ResolverPart) GetRawData

func (this *ResolverPart) GetRawData() *config.RawData

func (*ResolverPart) GetUnit

func (this *ResolverPart) GetUnit() layer.IUnit

func (*ResolverPart) InitObject

func (this *ResolverPart) InitObject(name string, rawData *config.RawData)

func (*ResolverPart) InitPart

func (this *ResolverPart) InitPart()

func (*ResolverPart) Run

func (this *ResolverPart) Run() (layer.IPart, interface{})

func (*ResolverPart) SetPipeline

func (this *ResolverPart) SetPipeline(pipeline layer.IPipeline)
// NewResolverPart allocates a zero-valued ResolverPart, initializes it with
// the given name and raw data through InitObject, and returns it as a
// layer.IPart.
func NewResolverPart(name string, rawData *config.RawData) layer.IPart {
	part := &ResolverPart{}
	part.InitObject(name, rawData)
	return part
}

func (*ResolverPart) SetResolver

func (this *ResolverPart) SetResolver(resolver config.IResolver)

func (*ResolverPart) SetText

func (this *ResolverPart) SetText(text *config.Text)

func (*ResolverPart) SetUnit

func (this *ResolverPart) SetUnit(unit layer.IUnit)

func (*ResolverPart) Setlinks

func (this *ResolverPart) Setlinks(links []*config.Link)

type SpiderManager

// SpiderManager is created by NewSpiderManager from a name and zero or more
// seed links. All of its fields are unexported.
type SpiderManager struct {
	// contains filtered or unexported fields
}

func NewSpiderManager

func NewSpiderManager(name string, seeds ...*config.Link) *SpiderManager

func (*SpiderManager) AddLayerStrategy

func (this *SpiderManager) AddLayerStrategy(layer int, dpptr config.DownloadPartPtr, rpptr config.ResolverPartPtr)

func (*SpiderManager) GetName

func (this *SpiderManager) GetName() string

func (*SpiderManager) Run

func (this *SpiderManager) Run()

type SpiderPipeline

// SpiderPipeline is created by NewSpiderPipeline from a name. It embeds
// layer.Dump, so it carries that type's fields and promoted methods; its
// remaining fields are unexported.
type SpiderPipeline struct {
	layer.Dump
	// contains filtered or unexported fields
}

func NewSpiderPipeline

func NewSpiderPipeline(name string) *SpiderPipeline

func (*SpiderPipeline) AddSeeds

func (this *SpiderPipeline) AddSeeds(seeds ...*config.Link)

func (*SpiderPipeline) Flush

func (this *SpiderPipeline) Flush()

1. Flushes cached Text data to a local file, a database, etc. 2. The number of cached Text items that triggers a flush can be set with {config.NumCachedText}. 3. The framework automatically calls Flush() whenever the number of cached Text items reaches {config.NumCachedText}. 4. Flush runs in a goroutine; do not share data with the outside unless you know how to synchronize it. Many goroutines run this function, but {this.dumpedText} has its own memory in every goroutine and holds exactly the data it needs, because that data is copied in from outside in a thread-safe way.

func (*SpiderPipeline) GetFirstPartProxy

func (this *SpiderPipeline) GetFirstPartProxy() layer.IPartProxy

Called only during initialization, which is single-threaded and therefore thread-safe.

func (*SpiderPipeline) GetName

func (this *SpiderPipeline) GetName() string

func (*SpiderPipeline) GetNextLayer

func (this *SpiderPipeline) GetNextLayer() layer.ILayer

func (*SpiderPipeline) Update

func (this *SpiderPipeline) Update()

Called once all threads have finished; this function runs only in a single thread, so it is thread-safe. It updates the pipeline and moves on to the next layer; everything belonging to the current layer is cleared.

func (*SpiderPipeline) Write

func (this *SpiderPipeline) Write(d interface{})

Many parallel tasks call Write(), so it must synchronize.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL