Documentation ¶
Index ¶
- func AppendManager(r IManagerRunner)
- func DumpData(texts []*config.Text)
- func InitGlobalLog(name string)
- func InitSpiderLog(name string)
- func NewDefaultDownloadPart(name string, link *config.Link) config.IDownloaderPart
- func NewDefaultResolverPart(name string, rawData *config.RawData) config.IResolverPart
- func RunAll()
- type DefaultDownloadPart
- type DefaultResolverPart
- type DownloadPart
- func (this *DownloadPart) GetLink() *config.Link
- func (this *DownloadPart) GetLogName() string
- func (this *DownloadPart) GetName() string
- func (this *DownloadPart) GetPipeline() layer.IPipeline
- func (this *DownloadPart) GetUnit() layer.IUnit
- func (this *DownloadPart) InitObject(name string, link *config.Link)
- func (this *DownloadPart) InitPart()
- func (this *DownloadPart) Run() (layer.IPart, interface{})
- func (this *DownloadPart) SetDownloader(downloader config.IDownloader)
- func (this *DownloadPart) SetPipeline(pipeline layer.IPipeline)
- func (this *DownloadPart) SetRawData(rawData *config.RawData)
- func (this *DownloadPart) SetUnit(unit layer.IUnit)
- type IManagerRunner
- type PartProxy
- func (this *PartProxy) Forward() (layer.IPartProxy, interface{})
- func (this *PartProxy) GetName() string
- func (this *PartProxy) GetUnit() layer.IUnit
- func (this *PartProxy) InitPart()
- func (this *PartProxy) InitPartProxy()
- func (this *PartProxy) Run() (layer.IPart, interface{})
- func (this *PartProxy) SetPipeline(pipeline layer.IPipeline)
- func (this *PartProxy) SetUnit(unit layer.IUnit)
- type ResolverPart
- func (this *ResolverPart) GetLogName() string
- func (this *ResolverPart) GetName() string
- func (this *ResolverPart) GetPipeline() layer.IPipeline
- func (this *ResolverPart) GetRawData() *config.RawData
- func (this *ResolverPart) GetUnit() layer.IUnit
- func (this *ResolverPart) InitObject(name string, rawData *config.RawData)
- func (this *ResolverPart) InitPart()
- func (this *ResolverPart) Run() (layer.IPart, interface{})
- func (this *ResolverPart) SetPipeline(pipeline layer.IPipeline)
- func (this *ResolverPart) SetResolver(resolver config.IResolver)
- func (this *ResolverPart) SetText(text *config.Text)
- func (this *ResolverPart) SetUnit(unit layer.IUnit)
- func (this *ResolverPart) Setlinks(links []*config.Link)
- type SpiderManager
- type SpiderPipeline
- func (this *SpiderPipeline) AddSeeds(seeds ...*config.Link)
- func (this *SpiderPipeline) Flush()
- func (this *SpiderPipeline) GetFirstPartProxy() layer.IPartProxy
- func (this *SpiderPipeline) GetName() string
- func (this *SpiderPipeline) GetNextLayer() layer.ILayer
- func (this *SpiderPipeline) Update()
- func (this *SpiderPipeline) Write(d interface{})
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AppendManager ¶
func AppendManager(r IManagerRunner)
func InitGlobalLog ¶
func InitGlobalLog(name string)
Logs written through the original "log" package are global logs. Generally, the global log is used for framework-level information and serious errors.
func NewDefaultDownloadPart ¶
func NewDefaultDownloadPart(name string, link *config.Link) config.IDownloaderPart
func NewDefaultResolverPart ¶
func NewDefaultResolverPart(name string, rawData *config.RawData) config.IResolverPart
Types ¶
type DefaultDownloadPart ¶
type DefaultDownloadPart struct {
DownloadPart
}
func (*DefaultDownloadPart) GenerateRawData ¶
func (this *DefaultDownloadPart) GenerateRawData() (*config.RawData, error)
type DefaultResolverPart ¶
type DefaultResolverPart struct {
ResolverPart
}
func (*DefaultResolverPart) GenerateLinks ¶
func (this *DefaultResolverPart) GenerateLinks() ([]*config.Link, error)
The default behavior is to resolve the page and collect all links in that page.
func (*DefaultResolverPart) GenerateText ¶
func (this *DefaultResolverPart) GenerateText() (*config.Text, error)
The default behavior is to return the HTML data directly, not the pure text.
type DownloadPart ¶
type DownloadPart struct {
// contains filtered or unexported fields
}
func (*DownloadPart) GetLink ¶
func (this *DownloadPart) GetLink() *config.Link
func (*DownloadPart) GetLogName ¶
func (this *DownloadPart) GetLogName() string
func (*DownloadPart) GetName ¶
func (this *DownloadPart) GetName() string
func NewDownloadPart(name string, link *config.Link) layer.IPart{ d := new(DownloadPart) d.InitObject(name, link) return d }
func (*DownloadPart) GetPipeline ¶
func (this *DownloadPart) GetPipeline() layer.IPipeline
func (*DownloadPart) GetUnit ¶
func (this *DownloadPart) GetUnit() layer.IUnit
func (*DownloadPart) InitObject ¶
func (this *DownloadPart) InitObject(name string, link *config.Link)
func (*DownloadPart) InitPart ¶
func (this *DownloadPart) InitPart()
func (*DownloadPart) Run ¶
func (this *DownloadPart) Run() (layer.IPart, interface{})
func (*DownloadPart) SetDownloader ¶
func (this *DownloadPart) SetDownloader(downloader config.IDownloader)
func (*DownloadPart) SetPipeline ¶
func (this *DownloadPart) SetPipeline(pipeline layer.IPipeline)
func (*DownloadPart) SetRawData ¶
func (this *DownloadPart) SetRawData(rawData *config.RawData)
func (*DownloadPart) SetUnit ¶
func (this *DownloadPart) SetUnit(unit layer.IUnit)
type IManagerRunner ¶
type IManagerRunner interface { Run() GetName() string }
type PartProxy ¶
type PartProxy struct {
// contains filtered or unexported fields
}
func NewPartProxy ¶
func (*PartProxy) Forward ¶
func (this *PartProxy) Forward() (layer.IPartProxy, interface{})
func (*PartProxy) InitPartProxy ¶
func (this *PartProxy) InitPartProxy()
func (*PartProxy) SetPipeline ¶
type ResolverPart ¶
type ResolverPart struct {
// contains filtered or unexported fields
}
func (*ResolverPart) GetLogName ¶
func (this *ResolverPart) GetLogName() string
func (*ResolverPart) GetName ¶
func (this *ResolverPart) GetName() string
func (*ResolverPart) GetPipeline ¶
func (this *ResolverPart) GetPipeline() layer.IPipeline
func (*ResolverPart) GetRawData ¶
func (this *ResolverPart) GetRawData() *config.RawData
func (*ResolverPart) GetUnit ¶
func (this *ResolverPart) GetUnit() layer.IUnit
func (*ResolverPart) InitObject ¶
func (this *ResolverPart) InitObject(name string, rawData *config.RawData)
func (*ResolverPart) InitPart ¶
func (this *ResolverPart) InitPart()
func (*ResolverPart) Run ¶
func (this *ResolverPart) Run() (layer.IPart, interface{})
func (*ResolverPart) SetPipeline ¶
func (this *ResolverPart) SetPipeline(pipeline layer.IPipeline)
func NewResolverPart(name string, rawData *config.RawData) layer.IPart{ r := new(ResolverPart) r.InitObject(name, rawData) return r }
func (*ResolverPart) SetResolver ¶
func (this *ResolverPart) SetResolver(resolver config.IResolver)
func (*ResolverPart) SetText ¶
func (this *ResolverPart) SetText(text *config.Text)
func (*ResolverPart) SetUnit ¶
func (this *ResolverPart) SetUnit(unit layer.IUnit)
func (*ResolverPart) Setlinks ¶
func (this *ResolverPart) Setlinks(links []*config.Link)
type SpiderManager ¶
type SpiderManager struct {
// contains filtered or unexported fields
}
func NewSpiderManager ¶
func NewSpiderManager(name string, seeds ...*config.Link) *SpiderManager
func (*SpiderManager) AddLayerStrategy ¶
func (this *SpiderManager) AddLayerStrategy(layer int, dpptr config.DownloadPartPtr, rpptr config.ResolverPartPtr)
func (*SpiderManager) GetName ¶
func (this *SpiderManager) GetName() string
func (*SpiderManager) Run ¶
func (this *SpiderManager) Run()
type SpiderPipeline ¶
func NewSpiderPipeline ¶
func NewSpiderPipeline(name string) *SpiderPipeline
func (*SpiderPipeline) AddSeeds ¶
func (this *SpiderPipeline) AddSeeds(seeds ...*config.Link)
func (*SpiderPipeline) Flush ¶
func (this *SpiderPipeline) Flush()
1. Flushes cached Text data to a local file, a database, etc. 2. The number of cached Text items per flush can be set with {config.NumCachedText}. 3. The framework calls Flush() automatically as soon as the number of cached Text items reaches {config.NumCachedText}. 4. Runs in a goroutine; do not share data with the outside unless you know how to synchronize it. Many goroutines may run this function, but {this.dumpedText} has its own memory in every goroutine and holds exactly the data it needs, because that data is copied from outside in a thread-safe way.
func (*SpiderPipeline) GetFirstPartProxy ¶
func (this *SpiderPipeline) GetFirstPartProxy() layer.IPartProxy
Called only during initialization, in a single thread, and is thread-safe.
func (*SpiderPipeline) GetName ¶
func (this *SpiderPipeline) GetName() string
func (*SpiderPipeline) GetNextLayer ¶
func (this *SpiderPipeline) GetNextLayer() layer.ILayer
func (*SpiderPipeline) Update ¶
func (this *SpiderPipeline) Update()
Once all threads have finished, this function is executed in a single thread only, so it is thread-safe. It performs the update and turns to the next layer; everything about the current layer is cleared.
func (*SpiderPipeline) Write ¶
func (this *SpiderPipeline) Write(d interface{})
Many parallel tasks call Write(), so synchronization is required.