crawler

package module
v1.5.6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 6, 2019 License: Apache-2.0 Imports: 10 Imported by: 0

README

go-crawler

Easily extract crawled data into custom structs using struct-tag annotations.

sample:
package main

import (
	"bytes"
	"fmt"
	"github.com/x-armory/go-crawler"
	"github.com/x-armory/go-exception"
	"net/url"
	"time"
)

// main builds the sample Baidu crawler and runs it.
func main() {
	baidu := NewBaiduCrawler()
	baidu.Start()
}

// NewBaiduCrawler assembles the sample crawler from its three parts: the
// BaiduBusiness callbacks, a day-by-day period request generator, and an
// XPath-based unmarshaler.
//
// The request generator is given a last-sync time three days before now.
// NewPeriodRequestGenerator is documented as taking a fifth ignoreWeekend
// argument; false is passed here.
// NOTE(review): false presumably keeps weekend periods in the crawl, which
// matches what the four-argument sample did before the parameter existed — confirm.
//
// NewXpathUnmarshaler's arguments are, in order, httpDelayMillisMin,
// httpDelayMillisMax, varStart and varEnd.
// NOTE(review): varStart=1 / varEnd=-1 presumably mean "start at index 1 and
// continue until nothing matches" — confirm against the unmarshaler.
func NewBaiduCrawler() *crawler.Crawler {
	lastSyncTime := time.Now().AddDate(0, 0, -3)
	return &crawler.Crawler{
		Business:         &BaiduBusiness{},
		RequestGenerator: crawler.NewPeriodRequestGenerator(crawler.Day, 0, lastSyncTime, getRequestParametersFunc(), false),
		DataUnmarshaler:  crawler.NewXpathUnmarshaler(0, 0, 1, -1),
	}
}

// BaiduData is one record of the sample: a title and a description, each
// tagged with the XPath expression it is read from.
// NOTE(review): the %d in each expression is presumably substituted with a
// running index by the XPath unmarshaler (see its varStart/varEnd arguments) — confirm.
type BaiduData struct {
	// Title is tagged with the XPath of the h3/a node under the element whose id is %d.
	Title string `xpath:"//*[@id='%d']/h3/a"`
	// Desc is tagged with the XPath of the text of the first div under the same element.
	Desc  string `xpath:"//*[@id='%d']/div[1]/text()"`
}

// BaiduBusiness carries the sample's state between crawler callbacks.
type BaiduBusiness struct {
	// data is replaced with an empty slice by NewPeriodData and iterated by ProcessPeriodData.
	data   []BaiduData
	// report is the buffer whose contents SendReport returns as a string.
	report bytes.Buffer
	// count is incremented once per record in ProcessPeriodData and printed by SendReport.
	count  int
}

// NewPeriodData discards any previously held records, installs a new empty
// (non-nil) slice on the receiver, and returns a pointer to that slice field.
func (b *BaiduBusiness) NewPeriodData() interface{} {
	fresh := make([]BaiduData, 0)
	b.data = fresh
	return &b.data
}

// ProcessPeriodData walks the currently held records in order, bumping the
// running count and printing each record with its field names.
func (b *BaiduBusiness) ProcessPeriodData() {
	for i := range b.data {
		b.count++
		fmt.Printf("%+v\n", b.data[i])
	}
}

// SendReport prints the total number of processed records and returns the
// text accumulated in the report buffer.
func (b *BaiduBusiness) SendReport() string {
	println("total count", b.count)
	return b.report.String()
}

// Finish is required by the crawler.Business interface. It emits the final
// report: the total count via SendReport, followed by any buffered report text.
// NOTE(review): presumably the crawler calls this once when the crawl ends — confirm.
func (b *BaiduBusiness) Finish() {
	if report := b.SendReport(); report != "" {
		fmt.Print(report)
	}
}

// OnError is required by the crawler.Business interface. It appends the
// received exception to the report buffer so it shows up in the final report.
// NOTE(review): the conditions under which the crawler calls this are not
// visible in the sample — confirm.
func (b *BaiduBusiness) OnError(err *ex.ExceptionClass) {
	fmt.Fprintf(&b.report, "error: %+v\n", err)
}

// getRequestParametersFunc returns the callback the period request generator
// uses to turn a period into request parameters. The callback asserts that
// start is non-zero (using the NoMoreDataException code otherwise), logs the
// date, and builds a GET request to Baidu search whose "wd" query parameter
// is the start date formatted as YYYY-MM-DD. No headers or values are returned.
func getRequestParametersFunc() crawler.PeriodRequestParametersFunc {
	return func(start time.Time, end time.Time) (method string, urlStr string, headers map[string][]string, values map[string][]string) {
		ex.Assert(!start.IsZero(), ex.Exception(crawler.NoMoreDataException, "", nil))
		date := start.Format("2006-01-02")
		println("sync date", date)
		// url.Values is already a map[string][]string, so a literal suffices.
		encode := url.Values{"wd": {date}}.Encode()
		return "GET", "https://www.baidu.com/s?" + encode, nil, nil
	}
}

Documentation

Index

Constants

View Source
// NoMoreDataException is the exception code the README sample passes to
// ex.Exception in its assertion that the period start time is non-zero.
// NOTE(review): presumably the crawler treats this code as the signal to stop — confirm.
const NoMoreDataException = "CRAWLER_NO_MORE_DATA_EXCEPTION"

Variables

View Source
// DefaultHttpClient is a package-level *http.Client.
// NOTE(review): presumably the client used to execute crawl requests; its
// initialization is not shown here — confirm.
var DefaultHttpClient *http.Client

Functions

func NewPeriodRequestGenerator

func NewPeriodRequestGenerator(syncDuration SyncDuration, offset int, lastSyncTIme time.Time, parametersFunc PeriodRequestParametersFunc, ignoreWeekend bool) *periodRequestGenerator

func NewXpathUnmarshaler

func NewXpathUnmarshaler(httpDelayMillisMin int, httpDelayMillisMax int, varStart int, varEnd int) *xpathUnmarshaler

Types

type Business

// Business is the set of callbacks a Crawler is configured with through its
// embedded Business field.
// NOTE(review): when and how often the crawler invokes each method is not
// visible here — confirm against the implementation.
type Business interface {
	// NewPeriodData returns a value of any type; the README sample returns a
	// pointer to a fresh slice.
	// NOTE(review): presumably this is the target passed to DataUnmarshaler.Unmarshal — confirm.
	NewPeriodData() interface{}
	// ProcessPeriodData takes no arguments; the README sample uses it to
	// consume the data it last handed out from NewPeriodData.
	ProcessPeriodData()
	// Finish takes no arguments and returns nothing.
	// NOTE(review): presumably called once when crawling ends — confirm.
	Finish()
	// OnError receives an *ex.ExceptionClass.
	// NOTE(review): presumably called when a crawl step raises an exception — confirm.
	OnError(err *ex.ExceptionClass)
}

type Crawler

// Crawler embeds a RequestGenerator, a DataUnmarshaler and a Business, so the
// methods of all three interfaces are promoted onto it. The README sample
// fills all three fields before calling Start.
type Crawler struct {
	RequestGenerator
	DataUnmarshaler
	Business
}

func (Crawler) Start

func (c Crawler) Start()

type DataUnmarshaler

// DataUnmarshaler is implemented by types that accept an HTTP request and a
// target value. NewXpathUnmarshaler's result is assigned to the
// DataUnmarshaler field in the README sample.
type DataUnmarshaler interface {
	// Unmarshal takes a request and a target and returns nothing.
	// NOTE(review): presumably it executes req and fills target from the
	// response; failures are not reported through a return value — confirm how they surface.
	Unmarshal(req *http.Request, target interface{})
}

type PeriodRequestParametersFunc

// PeriodRequestParametersFunc maps a period, given as start and end times, to
// the parts of an HTTP request: method, URL string, headers and values.
// NOTE(review): values presumably carries form or query parameters — confirm.
type PeriodRequestParametersFunc func(start time.Time, end time.Time) (method string, urlStr string, headers map[string][]string, values map[string][]string)

type RequestGenerator

// RequestGenerator is implemented by types that produce HTTP requests.
// NewPeriodRequestGenerator's result is assigned to the RequestGenerator field
// in the README sample.
type RequestGenerator interface {
	// GenRequest returns an *http.Request.
	// NOTE(review): presumably each call yields the request for the next period — confirm.
	GenRequest() (req *http.Request)
}

type SyncDuration

// SyncDuration is the integer type of the first argument to
// NewPeriodRequestGenerator; the constants below are its defined values.
type SyncDuration int
// The values are assigned with iota: Year = 0, Month = 1, Day = 2.
// NOTE(review): presumably each selects the length of one sync period — confirm.
const (
	// Year is 0.
	Year SyncDuration = iota
	// Month is 1.
	Month
	// Day is 2; the README sample uses this value.
	Day
)

Directories

Path Synopsis
sample

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL