scrapy

package
v0.0.0-...-a02b7a7 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Dec 28, 2020 License: MIT Imports: 32 Imported by: 0

Documentation

Index

Constants

View Source
// Untyped integer constants numbered with iota: ParserNotFound is 0 and
// RequestsError is 1.
// NOTE(review): presumably condition/error codes; where they are reported is
// not visible in this listing — confirm against their users.
const (
	ParserNotFound = iota
	RequestsError
)
View Source
// GET, POST and POSTJSON are untyped string constants with the values "get",
// "post" and "post-json".
// NOTE(review): presumably the method names accepted by SetMethod — confirm.
//
// DefaultTimeOut is a time.Duration with the raw value 1. A time.Duration
// counts nanoseconds, so as written this constant is one nanosecond.
// NOTE(review): confirm that every user scales it (e.g. by time.Second)
// before handing it to a client; otherwise requests would time out at once.
const (
	GET                          = "get"
	POST                         = "post"
	POSTJSON                     = "post-json"
	DefaultTimeOut time.Duration = 1
)
View Source
// ProxIpMaxExpiredTime is a float64 constant with the value 5.
// NOTE(review): presumably the maximum age of a proxy IP before it counts as
// expired; the unit (seconds, minutes, ...) is not visible here — confirm.
const (
	ProxIpMaxExpiredTime float64 = 5
)
View Source
// TimeFormat is a layout string in Go's reference-time notation: four-digit
// year, month and day separated by dashes, then a space and the 24-hour time
// with seconds (for example "2020-12-28 09:30:00"). It carries no time zone.
const (
	TimeFormat = "2006-01-02 15:04:05"
)

Variables

View Source
// Stop is a package-level channel of os.Signal with a buffer of one.
// NOTE(review): presumably registered with signal.Notify to request
// shutdown — confirm; the registration is not visible in this listing.
var (
	Stop = make(chan os.Signal, 1)
)

Functions

func AutoGetHtmlEncode

func AutoGetHtmlEncode(html string, encode string) string

func DecodeMessage

func DecodeMessage(b []byte) (*nsq.Message, error)

func DefaultPipelines

func DefaultPipelines(i IItem)

func Host

func Host(url string) (string, error)

func NewKafkaConsumer

func NewKafkaConsumer(opt *Consumer) *kafkaConsumer

func NewNsqConsumer

func NewNsqConsumer(opt *Consumer) *nsqConsumer

func NewRedisCluster

func NewRedisCluster(ips []string, password string) *redis.ClusterClient

func Once

func Once(f func())

func Regex

func Regex(html, rex string) pie.Strings

Regex extracts the elements of html that match the regular expression rex.

func Validated

func Validated(s interface{}) bool

Types

type AbuyunProxy

// AbuyunProxy holds three strings: AppID, AppSecret and ProxyServer.
// Its Init, Get, Put and ProxyClient methods are declared on the value
// receiver.
// NOTE(review): presumably the credentials and endpoint of the Abuyun proxy
// service — confirm.
type AbuyunProxy struct {
	AppID       string
	AppSecret   string
	ProxyServer string
}

func NewAbutunProxy

func NewAbutunProxy(appid, secret, proxyServer string) *AbuyunProxy

func (AbuyunProxy) Get

func (p AbuyunProxy) Get() IProxy

func (AbuyunProxy) Init

func (p AbuyunProxy) Init()

func (AbuyunProxy) ProxyClient

func (p AbuyunProxy) ProxyClient() (*http.Client, IProxy)

func (AbuyunProxy) Put

func (p AbuyunProxy) Put(proxy IProxy)

type Auth

// Auth is a defined type whose underlying type is string.
// NOTE(review): its use is not visible in this listing.
type Auth string

type Broker

// Broker bundles one optional (pointer) configuration per backend — NSQ,
// Redis and Kafka — plus one boolean per backend. The pointers are keyed
// "nsq", "redis" and "kafka" in YAML; the booleans use the YAML keys
// "__nsq", "__rds" and "__kafka" and the JSON keys "nsq", "rds" and "kafka".
// NOTE(review): presumably the booleans select the backend that GetBroker
// returns — confirm against the implementation.
type Broker struct {
	NsqBroker   *NsqBroker   `yaml:"nsq" json:"nsq_broker"`
	RedisBroker *RedisBroker `yaml:"redis" json:"redis_broker"`
	KafkaBroker *KafkaBroker `yaml:"kafka" json:"kafka_broker"`
	Nsq         bool         `yaml:"__nsq" json:"nsq"`
	Rds         bool         `yaml:"__rds" json:"rds"`
	Kfk         bool         `yaml:"__kafka" json:"kafka"`
}

func (*Broker) Add

func (b *Broker) Add(item IItem) bool

func (*Broker) GetBroker

func (b *Broker) GetBroker() IBroker

func (*Broker) Init

func (b *Broker) Init()

type BrowserName

// BrowserName is a defined type whose underlying type is String. As a
// distinct defined type it does not inherit the methods declared on String.
type BrowserName String

type Channel

// Channel maps to XML with the child elements title, link, description,
// lastBuildDate and a repeated item element (one per *XmlItem). Feeds
// references it as its "channel" element.
//
// Channel embeds sync.RWMutex, so a Channel value must not be copied after
// first use; pass *Channel, as AddItem and AddLastPubTime do.
type Channel struct {
	Title         string     `xml:"title"`
	Link          string     `xml:"link"`
	Description   string     `xml:"description"`
	LastBuildDate string     `xml:"lastBuildDate"`
	Item          []*XmlItem `xml:"item"`
	sync.RWMutex
}

func (*Channel) AddItem

func (c *Channel) AddItem(item *XmlItem)

func (*Channel) AddLastPubTime

func (c *Channel) AddLastPubTime(pub string)

type Consumer

// Consumer holds one optional (pointer) consumer per backend — Nsq, Redis
// and Kafka — and an integer Limit. It is the option type accepted by
// NewNsqConsumer, NewKafkaConsumer and NewRedisConsumer.
// NOTE(review): the meaning of Limit is not visible in this listing.
type Consumer struct {
	Nsq   *nsqConsumer
	Redis *RedisConsumer
	Kafka *kafkaConsumer
	Limit int
}

type Crawler

type Crawler struct {
	Request *Requests `validate:"required"`
	Cb      func(i IItem)
	Parser  IParser `validate:"required"`
	Item    IItem   `validate:"required"`

	Ip IProxy

	ProxyQueue IProxyQueue
	sync.RWMutex
	// contains filtered or unexported fields
}

func NewCrawler

func NewCrawler(url String, args ...interface{}) *Crawler

func NewProxyCrawler

func NewProxyCrawler(url String, args ...interface{}) *Crawler

func (*Crawler) Do

func (t *Crawler) Do() (*Crawler, error)

func (*Crawler) Html

func (t *Crawler) Html() String

func (*Crawler) SetCookies

func (t *Crawler) SetCookies(cookie *http.Cookie) *Crawler

func (*Crawler) SetHeader

func (t *Crawler) SetHeader(header requests.Header) *Crawler

func (*Crawler) SetMethod

func (t *Crawler) SetMethod(method string) *Crawler

func (*Crawler) SetParser

func (t *Crawler) SetParser(i IParser) *Crawler

func (*Crawler) SetPipelines

func (t *Crawler) SetPipelines(cb func(i IItem)) *Crawler

func (*Crawler) SetPostJson

func (t *Crawler) SetPostJson(json String) *Crawler

func (*Crawler) SetTimeOut

func (t *Crawler) SetTimeOut(duration time.Duration) *Crawler

func (*Crawler) Validate

func (t *Crawler) Validate() (err error)

type DefaultClient

type DefaultClient struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func (*DefaultClient) Get

func (d *DefaultClient) Get(url String, args ...interface{}) (resp *requests.Response, err error)

func (*DefaultClient) PostBinary

func (d *DefaultClient) PostBinary(url, js String, args ...interface{}) (*requests.Response, error)

func (*DefaultClient) PostJson

func (d *DefaultClient) PostJson(url, js String, args ...interface{}) (*requests.Response, error)

func (*DefaultClient) SetHeaders

func (d *DefaultClient) SetHeaders(header requests.Header)

func (*DefaultClient) SetTimeOut

func (d *DefaultClient) SetTimeOut(duration time.Duration)

type DefaultParser

// DefaultParser holds an IItem in Result and declares Load and Validate on
// the pointer receiver. The concrete parsers in this listing embed it.
//
// It embeds sync.RWMutex, so a DefaultParser (and any struct embedding it by
// value) must not be copied after first use.
type DefaultParser struct {
	Result IItem
	sync.RWMutex
}

func (*DefaultParser) Load

func (r *DefaultParser) Load(i IItem)

func (*DefaultParser) Validate

func (r *DefaultParser) Validate() bool

type Feeds

// Feeds is the root of the XML document: XMLName fixes the root element name
// to "rss", Version is written as the "version" attribute of that element,
// and Channel is written as the "channel" child element.
// *Feeds declares Add, Dumps and Empty, the three methods of IItem.
type Feeds struct {
	XMLName xml.Name `xml:"rss"`
	Version string   `xml:"version,attr"`
	Channel *Channel `xml:"channel"`
}

func NewFeeds

func NewFeeds() *Feeds

func (*Feeds) Add

func (f *Feeds) Add(v interface{})

func (*Feeds) Dumps

func (f *Feeds) Dumps() (String, error)

func (*Feeds) Empty

func (f *Feeds) Empty() bool

type G

// G is a defined string type; it is the pattern type accepted by
// NewGoQueryParser and the type of Page.Parser.
type G string

G marks a pattern that is parsed with goquery.

type GoQueryAttribParser

type GoQueryAttribParser struct {
	DefaultParser
	Result *StringList
	// contains filtered or unexported fields
}

func A

func A(pattern _A, attrib string) *GoQueryAttribParser

func (*GoQueryAttribParser) Encode

func (g *GoQueryAttribParser) Encode(s string) IParser

func (*GoQueryAttribParser) Parser

func (g *GoQueryAttribParser) Parser(html String, item IItem, sss ...string) (IItem, bool)

func (*GoQueryAttribParser) Validate

func (g *GoQueryAttribParser) Validate() bool

type GoQueryParser

type GoQueryParser struct {
	Html string

	DefaultParser
	// contains filtered or unexported fields
}

func NewGoQueryParser

func NewGoQueryParser(pattern G) *GoQueryParser

func (*GoQueryParser) Encode

func (g *GoQueryParser) Encode(s string) IParser

func (*GoQueryParser) Parser

func (g *GoQueryParser) Parser(html String, item IItem, sss ...string) (IItem, bool)

type GoQueryTextParser

type GoQueryTextParser struct {
	Html string

	DefaultParser
	// contains filtered or unexported fields
}

func NewGoQueryTextParser

func NewGoQueryTextParser(pattern T) *GoQueryTextParser

func (*GoQueryTextParser) Encode

func (g *GoQueryTextParser) Encode(s string) IParser

func (*GoQueryTextParser) Parser

func (g *GoQueryTextParser) Parser(html String, item IItem, sss ...string) (IItem, bool)

type IBroker

// IBroker is satisfied by any type with Init() and Add(item IItem) bool.
// In this listing *Broker, *KafkaBroker, *NsqBroker and *RedisBroker declare
// both methods.
type IBroker interface {
	Init()
	Add(item IItem) bool
}

type IClient

// IClient is the client abstraction: a Get and a PostJson request method,
// each returning a *requests.Response and an error, plus setters for the
// timeout and the request headers. In this listing *DefaultClient and
// *ProxyClient declare all four methods.
type IClient interface {
	Get(url String, args ...interface{}) (*requests.Response, error)
	PostJson(url, js String, args ...interface{}) (*requests.Response, error)
	SetTimeOut(duration time.Duration)
	SetHeaders(header requests.Header)
}

func NewDefaultClient

func NewDefaultClient() IClient

type IConsumer

// IConsumer is satisfied by any type with Init() and Run(). In this listing
// *RedisConsumer declares both methods.
type IConsumer interface {
	Init()
	Run()
}

type IItem

// IItem is the item abstraction passed to parsers, pipelines and brokers.
// In this listing *Feeds, *Map and *StringList declare all three methods.
type IItem interface {
	// Add adds a property (value) to the item.
	Add(v interface{})
	// Dumps returns the item as a String, or an error.
	// NOTE(review): the output format depends on the implementation.
	Dumps() (String, error)
	// Empty returns a bool; presumably true when nothing has been added —
	// confirm against the implementations.
	Empty() bool
}

type IParser

// IParser is the parser abstraction. DefaultParser declares Validate and
// Load; the concrete parsers in this listing embed DefaultParser and declare
// Parser and Encode themselves.
//
// Parser takes the document as a String, an IItem and optional extra strings,
// and returns an IItem together with a bool. Encode takes a string and
// returns an IParser.
// NOTE(review): presumably the bool reports success and Encode sets the
// character encoding — confirm against the implementations.
type IParser interface {
	Validate() bool
	Load(i IItem)
	Parser(string2 String, i IItem, string3 ...string) (IItem, bool)
	Encode(string) IParser
}

type IProxy

type IProxy interface {
	AddFails()
	AddSucc()
	Expired() bool

	ProxyUrl() string
	// contains filtered or unexported methods
}

type IProxyQueue

type IProxyQueue interface {
	Init()
	Get() IProxy
	Put(IProxy)
	ProxyClient() (*http.Client, IProxy)
	// contains filtered or unexported methods
}

type JsonParser

type JsonParser struct {
	DefaultParser
	Html string
	// contains filtered or unexported fields
}

func NewJsonParser

func NewJsonParser() *JsonParser

func (*JsonParser) Encode

func (r *JsonParser) Encode(s string) IParser

func (*JsonParser) Parser

func (r *JsonParser) Parser(htm String, interfaceI IItem, s ...string) (i IItem, ret bool)

type KafkaBroker

type KafkaBroker struct {
	Addrs []string `json:"addrs"`

	Topic string `json:"topic"`

	sync.WaitGroup
	// contains filtered or unexported fields
}

func NewKafkaBroker

func NewKafkaBroker(addrs []string, topic string) *KafkaBroker

func (*KafkaBroker) Add

func (k *KafkaBroker) Add(item IItem) bool

func (*KafkaBroker) Init

func (k *KafkaBroker) Init()

type Map

type Map struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func NewMap

func NewMap() *Map

func (*Map) Add

func (m *Map) Add(v interface{})

func (*Map) Contains

func (m *Map) Contains(s string) bool

func (*Map) Dumps

func (m *Map) Dumps() (String, error)

func (*Map) Empty

func (m *Map) Empty() bool

func (*Map) Get

func (m *Map) Get(k String) interface{}

func (*Map) Items

func (m *Map) Items() map[String]interface{}

func (*Map) Load

func (m *Map) Load(b []byte) error

func (*Map) Pop

func (m *Map) Pop(s String) interface{}

func (*Map) Size

func (m *Map) Size() int

type MixedParser

type MixedParser struct {
	DefaultParser
	// contains filtered or unexported fields
}

func NewMixdParser

func NewMixdParser(pattern Pattern) *MixedParser

func (*MixedParser) Encode

func (m *MixedParser) Encode(s string) IParser

func (*MixedParser) Parser

func (m *MixedParser) Parser(html String, item IItem, s ...string) (i IItem, ret bool)

type MongoClient

type MongoClient struct {
	// contains filtered or unexported fields
}

func NewMongoClient

func NewMongoClient(config []*bongo.Config) *MongoClient

func (*MongoClient) Add

func (m *MongoClient) Add(doc bongo.Document) error

func (*MongoClient) Collection

func (m *MongoClient) Collection(col string) *MongoClient

func (*MongoClient) Count

func (m *MongoClient) Count(m2 bson.M) (int, error)

func (*MongoClient) Del

func (m *MongoClient) Del(doc bongo.Document)

Del deletes a single doc; if the doc defines BeforeDelete and AfterDeleteHook, they are triggered.

func (*MongoClient) Find

func (m *MongoClient) Find(query interface{}) *bongo.ResultSet

func (*MongoClient) FindOne

func (m *MongoClient) FindOne(query interface{}, result interface{}) error

func (*MongoClient) Init

func (m *MongoClient) Init()

func (*MongoClient) Pipe

func (m *MongoClient) Pipe(args ...bson.M) *mgo.Pipe

func (*MongoClient) Remove

func (m *MongoClient) Remove(query bson.M) bool

type Next

// Next holds four string-keyed maps, serialized under the JSON/BSON keys
// "g", "r", "t" and "a". G, R and T map to strings; A maps to ParserResult
// values.
// NOTE(review): the field names mirror the pattern types G, R and T and the
// constructor A, so each map presumably holds the patterns of that parser
// kind — confirm.
type Next struct {
	G map[string]string       `json:"g" bson:"g"`
	R map[string]string       `json:"r" bson:"r"`
	T map[string]string       `json:"t" bson:"t"`
	A map[string]ParserResult `json:"a" bson:"a"`
}

func NewNext

func NewNext(arg ...interface{}) (*Next, error)

func (*Next) Load

func (n *Next) Load(m map[string]interface{}) error

func (*Next) MergeGr

func (n *Next) MergeGr() (result Pattern)

type NsqBroker

type NsqBroker struct {
	Urls    []string `json:"urls" validate:"required"`
	Topic   string   `json:"topic" validate:"required"`
	Channel string   `json:"channel" validate:"required"`

	sync.RWMutex
	// contains filtered or unexported fields
}

func NewNsqBroker

func NewNsqBroker(urls []string, Topic, Channel string) *NsqBroker

func (*NsqBroker) Add

func (n *NsqBroker) Add(item IItem) bool

func (*NsqBroker) Init

func (n *NsqBroker) Init()

type Options

// Options groups Version, AppName, Pages, Broker and Consumer. NewOptions
// takes a path and returns an *Options.
//
// AppName is the only field with a yaml tag; it is keyed "kind" in YAML and
// "app_name" in JSON. The other fields carry JSON tags only.
// NOTE(review): presumably the top-level configuration loaded from a file —
// confirm.
type Options struct {
	Version  string    `json:"version"`
	AppName  string    `yaml:"kind" json:"app_name"`
	Pages    *Pages    `json:"pages"`
	Broker   *Broker   `json:"broker"`
	Consumer *Consumer `json:"consumer"`
}

func NewOptions

func NewOptions(path string) (*Options, error)

func (*Options) Dumps

func (o *Options) Dumps() (string, error)

func (*Options) Item

func (o *Options) Item() (String, error)

type Page

// Page describes one page entry: its Url, a Parser pattern of type G, a
// string-to-string Meta map and an optional Next, which is keyed
// "next-parser" in both JSON and YAML.
type Page struct {
	Next   *Next             `json:"next-parser" yaml:"next-parser"`
	Url    String            `json:"url"`
	Parser G                 `json:"parser"`
	Meta   map[string]string `json:"meta"`
}

type Pages

// Pages holds the Labels map, which maps a string key to a *Page.
type Pages struct {
	Labels map[string]*Page
}

type ParserResult

// ParserResult is a key/value pair: a string Key and a Value of any type,
// stored in BSON under "key" and "value". NewPr builds one from a key and a
// value.
type ParserResult struct {
	Key   string      `bson:"key"`
	Value interface{} `bson:"value"`
}

func NewPr

func NewPr(key string, value interface{}) *ParserResult

func (*ParserResult) String

func (p *ParserResult) String() string

type Pattern

// Pattern is a map from string keys to values of any type. It is the
// argument of NewMixdParser and the result of (*Next).MergeGr.
type Pattern map[string]interface{}

type ProxyClient

type ProxyClient struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func (*ProxyClient) Get

func (p *ProxyClient) Get(url String, args ...interface{}) (resp *requests.Response, err error)

func (*ProxyClient) PostJson

func (p *ProxyClient) PostJson(url, js String, args ...interface{}) (*requests.Response, error)

func (*ProxyClient) SetHeaders

func (p *ProxyClient) SetHeaders(header requests.Header)

func (*ProxyClient) SetTimeOut

func (p *ProxyClient) SetTimeOut(duration time.Duration)

type ProxyIp

// ProxyIp describes one proxy endpoint: Host and Port, the CreateTime
// timestamp, the Failures and Used counters and the Available flag, each
// with a snake_case JSON key. *ProxyIp declares AddFails, AddSucc, Expired
// and ProxyUrl, the exported methods of IProxy.
//
// It embeds sync.RWMutex, so a ProxyIp must not be copied after first use;
// all of its methods take a pointer receiver.
type ProxyIp struct {
	Host       string    `json:"host"`
	Port       int       `json:"port"`
	CreateTime time.Time `json:"create_time"`
	Failures   int       `json:"failures"`
	Used       int       `json:"used"`
	Available  bool      `json:"available"`
	sync.RWMutex
}

func NewProxyIp

func NewProxyIp(host string, port int) *ProxyIp

func (*ProxyIp) AddFails

func (p *ProxyIp) AddFails()

func (*ProxyIp) AddSucc

func (p *ProxyIp) AddSucc()

func (*ProxyIp) Expired

func (p *ProxyIp) Expired() bool

func (*ProxyIp) ProxyUrl

func (p *ProxyIp) ProxyUrl() string

type ProxyQueue

// ProxyQueue holds a channel of IProxy values (Queue), a PullUrl string, an
// integer MaxCaps and a Sleep duration. *ProxyQueue declares Init, Get, Put
// and ProxyClient.
// NOTE(review): presumably PullUrl is where new proxies are fetched, MaxCaps
// bounds the queue and Sleep is the refill interval — confirm.
//
// It embeds sync.RWMutex, so a ProxyQueue must not be copied after first use.
type ProxyQueue struct {
	Queue   chan IProxy
	PullUrl string
	MaxCaps int
	Sleep   time.Duration
	sync.RWMutex
}

func (*ProxyQueue) Get

func (q *ProxyQueue) Get() IProxy

func (*ProxyQueue) Init

func (q *ProxyQueue) Init()

func (*ProxyQueue) ProxyClient

func (p *ProxyQueue) ProxyClient() (*http.Client, IProxy)

func (*ProxyQueue) Put

func (q *ProxyQueue) Put(ip IProxy)

type R

// R is a defined string type; it is the pattern type accepted by
// NewRegexParser.
type R string

R marks a pattern that is parsed as a regular expression.

type RedisBroker

type RedisBroker struct {
	Host     string `json:"host" validate:"required"`
	Password string `json:"password"`
	Db       int    `json:"db"`
	Topic    string `json:"Topic" validate:"required"`

	WaitGroupWrap
	// contains filtered or unexported fields
}

func NewRedisBroker

func NewRedisBroker(host, password, topic string, db int) *RedisBroker

func (*RedisBroker) Add

func (r *RedisBroker) Add(item IItem) bool

func (*RedisBroker) Init

func (r *RedisBroker) Init()

type RedisClient

type RedisClient struct {
	// contains filtered or unexported fields
}

func NewRedis

func NewRedis(args ...*redis.Options) (*RedisClient, error)

func (*RedisClient) Existed

func (r *RedisClient) Existed(key string) bool

func (*RedisClient) Expire

func (r *RedisClient) Expire(key string, duration time.Duration)

func (*RedisClient) Incr

func (r *RedisClient) Incr(key string) int64

func (*RedisClient) Instance

func (r *RedisClient) Instance() *redis.Client

func (*RedisClient) LLen

func (r *RedisClient) LLen(key string) (int64, error)

func (*RedisClient) Lpop

func (r *RedisClient) Lpop(key string) (string, error)

func (*RedisClient) Lpush

func (r *RedisClient) Lpush(key string, val interface{})

func (*RedisClient) MaxKeyCount

func (r *RedisClient) MaxKeyCount(key string, max int) bool

func (*RedisClient) Pipelines

func (r *RedisClient) Pipelines(fn func(pipeliner redis.Pipeliner) error) error

func (*RedisClient) Publish

func (r *RedisClient) Publish(channels string, msg interface{}) error

func (*RedisClient) SCard

func (r *RedisClient) SCard(key string) (int64, error)

func (*RedisClient) SIsMember

func (r *RedisClient) SIsMember(key, id string) bool

func (*RedisClient) SPopN

func (r *RedisClient) SPopN(key string, count int64) ([]string, error)

func (*RedisClient) Sorted

func (r *RedisClient) Sorted(key string, sort *redis.Sort) *redis.StringSliceCmd

func (*RedisClient) Sub

func (r *RedisClient) Sub(channels ...string) <-chan *redis.Message

type RedisConsumer

type RedisConsumer struct {
	Host     string
	Password string
	Db       int
	Topic    string

	Limit int
	// contains filtered or unexported fields
}

func NewRedisConsumer

func NewRedisConsumer(opt *Consumer) *RedisConsumer

func (*RedisConsumer) HandleMessage

func (r *RedisConsumer) HandleMessage(msg *nsq.Message) (err error)

func (*RedisConsumer) Init

func (r *RedisConsumer) Init()

func (*RedisConsumer) Run

func (r *RedisConsumer) Run()

func (*RedisConsumer) SetHandler

func (r *RedisConsumer) SetHandler(handler nsq.Handler) IConsumer

type RegexItem

type RegexItem struct {
	// contains filtered or unexported fields
}

func (*RegexItem) StringVal

func (r *RegexItem) StringVal() string

type RegexItems

// RegexItems is a slice of RegexItem. It declares First, returning a string,
// and Val, returning pie.Strings, both on the value receiver.
type RegexItems []RegexItem

func (RegexItems) First

func (r RegexItems) First() string

func (RegexItems) Val

func (r RegexItems) Val() pie.Strings

type RegexParser

type RegexParser struct {
	Html    string
	Pattern String
	DefaultParser
	// contains filtered or unexported fields
}

func NewRegexParser

func NewRegexParser(pattern R) *RegexParser

func (*RegexParser) Encode

func (r *RegexParser) Encode(s string) IParser

func (*RegexParser) Parser

func (r *RegexParser) Parser(htm String, interfaceI IItem, s ...string) (i IItem, ret bool)

type Requests

type Requests struct {
	Url String

	sync.RWMutex
	// contains filtered or unexported fields
}

func NewRequest

func NewRequest(url String, args ...interface{}) *Requests

func (*Requests) Do

func (r *Requests) Do() (resp *requests.Response, err error)

func (*Requests) Json

func (r *Requests) Json(js String) *Requests

func (*Requests) SetCookies

func (r *Requests) SetCookies(cookie *http.Cookie) *Requests

func (*Requests) SetHeader

func (r *Requests) SetHeader(headers requests.Header) *Requests

func (*Requests) SetMethod

func (r *Requests) SetMethod(method string) *Requests

func (*Requests) SetTimeOut

func (r *Requests) SetTimeOut(timeout time.Duration) *Requests

type Response

// Response wraps a *requests.Response by embedding it, so the fields and
// methods of the embedded response are promoted to Response. The embedded
// pointer is nil in a zero-value Response.
type Response struct {
	*requests.Response
}

type String

// String is a defined type whose underlying type is string. Its helper
// methods (Decode, Empty, HasPrefix, Hash, Replace and String) are all
// declared on the pointer receiver, so they need an addressable String.
type String string

func (*String) Decode

func (s *String) Decode() []byte

func (*String) Empty

func (s *String) Empty() bool

func (*String) HasPrefix

func (s *String) HasPrefix(pattern string) bool

func (*String) Hash

func (s *String) Hash() string

func (*String) Replace

func (s *String) Replace(pattern string) String

func (*String) String

func (s *String) String() string

type StringList

type StringList struct {
	sync.RWMutex
	// contains filtered or unexported fields
}

func NewStringList

func NewStringList() *StringList

func (*StringList) Add

func (l *StringList) Add(item interface{})

func (*StringList) Contains

func (l *StringList) Contains(s string) bool

func (*StringList) Dumps

func (l *StringList) Dumps() (String, error)

func (*StringList) Empty

func (l *StringList) Empty() bool

func (*StringList) Items

func (l *StringList) Items() pie.Strings

func (*StringList) Load

func (l *StringList) Load(b []byte) error

func (*StringList) Size

func (l *StringList) Size() int

type T

// T is a defined string type; it is the pattern type accepted by
// NewGoQueryTextParser.
type T string

T marks a pattern with which goquery extracts the text of the specified HTML node.

type Url

// Url is a defined type whose underlying type is String. As a distinct
// defined type it does not inherit String's methods and declares its own.
// Empty and Host use a value receiver; AddHttp, Contains, IsHttp, IsHttps
// and String use a pointer receiver.
type Url String

func (*Url) AddHttp

func (u *Url) AddHttp()

func (*Url) Contains

func (u *Url) Contains(s string) bool

func (Url) Empty

func (u Url) Empty() bool

func (Url) Host

func (u Url) Host() (string, error)

func (*Url) IsHttp

func (u *Url) IsHttp() bool

func (*Url) IsHttps

func (u *Url) IsHttps() bool

func (*Url) String

func (u *Url) String() string

type WaitGroupWrap

// WaitGroupWrap embeds sync.WaitGroup, promoting Add, Done and Wait, and
// adds Wrap, which takes a callback.
// NOTE(review): presumably Wrap runs the callback in a goroutine tracked by
// the group — confirm against the implementation.
//
// Because of the embedded WaitGroup, a WaitGroupWrap must not be copied
// after first use.
type WaitGroupWrap struct {
	sync.WaitGroup
}

func (*WaitGroupWrap) Wrap

func (w *WaitGroupWrap) Wrap(cb func())

type XmlItem

// XmlItem is one feed entry. Title, Link and Description carry the
// validate:"required" tag; PubData does not.
//
// NOTE(review): the field PubData is serialized as "pub_data" in JSON and
// "pubDate" in XML, so the name looks like a typo for PubDate. Renaming the
// exported field or the JSON key would break callers and stored data, so it
// is left as is.
type XmlItem struct {
	Title       string `json:"title" xml:"title" validate:"required"`
	Link        string `json:"link" xml:"link" validate:"required"`
	PubData     string `json:"pub_data" xml:"pubDate"`
	Description string `json:"description" xml:"description" validate:"required"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL