article_spider

package module
v4.0.35 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 8, 2023 License: Apache-2.0 Imports: 25 Imported by: 0

README

article-spider是一个用Go编写的文章爬取工具。支持常规爬取和浏览器自动化两类爬取方式,共提供四种模式:常规模式(Normal)、api模式(Api)、自动化模式(Auto)和详情页链接模式(Url)。

中文文档


声明:该爬虫仅供学习使用,如产生任何法律后果,本人概不负责

安装

go get github.com/PeterYangs/article-spider/v4

v1版本

v2版本

快速开始

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main is the quick-start example: it crawls list pages of www.925g.com in
// Normal mode and extracts a title, the first body image and the article HTML
// from each detail page.
func main() {
	// Image path prefixes; both callbacks ignore their arguments and return a
	// constant string.
	imgPrefix := func(form *articleSpider.Form, path string) string {
		return "app"
	}
	contentPrefix := func(form *articleSpider.Form, path string) string {
		return "/api"
	}

	// Fields extracted from every detail page. ExcelHeader is the target
	// column, which is honoured because CustomExcelHeader is enabled below.
	detailFields := map[string]articleSpider.Field{
		"title": {
			ExcelHeader: "J",
			Types:       articleSpider.Text,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.hd > h1",
		},
		"img": {
			ExcelHeader: "H",
			Types:       articleSpider.Image,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd img:nth-child(1)",
			ImageDir:    "app",
			ImagePrefix: imgPrefix,
		},
		"content": {
			ExcelHeader: "I",
			Types:       articleSpider.HtmlWithImage,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd",
			ImagePrefix: contentPrefix,
		},
	}

	cfg := articleSpider.Form{
		Host:                  "https://www.925g.com/",
		Channel:               "/zixun_page[PAGE].html/",
		ListSelector:          "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.bdDiv > div > ul > li",
		HrefSelector:          " a",
		PageStart:             1,
		Length:                2,
		DetailFields:          detailFields,
		ListFields:            map[string]articleSpider.Field{},
		CustomExcelHeader:     true,
		DetailCoroutineNumber: 5,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Normal, context.Background())
	spider.Start()
}

一些例子

常用属性

	Host                       string                                   //网站域名
	Channel                    string                                   //栏目链接,页码用[PAGE]替换
	PageStart                  int                                      //页码起始页
	Length                     int                                      //爬取页码长度
	ListSelector               string                                   //列表选择器
	HrefSelector               string                                   //a链接选择器,相对于列表选择器
	DisableAutoCoding          bool                                     //是否禁用自动转码
	DetailFields               map[string]Field                         //详情页面字段选择器
	ListFields                 map[string]Field                         //列表页面字段选择器,暂不支持api爬取
	HttpTimeout                time.Duration                            //请求超时时间
	HttpHeader                 map[string]string                        //header
	HttpProxy                  string                                   //代理
	ChannelFunc                func(form *Form) []string                //自定义栏目链接
	DetailCoroutineNumber      int                                      //爬取详情页协程数
	LazyImageAttrName          string                                   //懒加载图片属性,默认为data-original
	DisableImageExtensionCheck bool                                     //禁用图片拓展名检查,禁用后所有图片拓展名强制为png
	AllowImageExtension        []string                                 //允许下载的图片拓展名
	DefaultImg                 func(form *Form, item Field) string      //图片出错时,设置默认图片
	MiddleSelector             []string                                 //中间层选择器(a链接选择器),当详情页有多层时使用,暂不支持自动模式
	CustomExcelHeader          bool                                     //自定义Excel表格头部
	ResultCallback             func(item map[string]string, form *Form) //自定义获取爬取结果回调
	ApiConversion              func(html string, form *Form) []string   //api获取链接
	AutoPrefixEvent            func(chromedpCtx context.Context)        //自动爬取模式前置事件
	AutoListWaitSelector       string                                   //列表等待选择器(用于自动化爬取)
	AutoNextPageMode           NextPageMode                             //下一页模式(用于自动化爬取,目前支持常规分页和加载更多)
	AutoDetailForceNewTab      bool                                     //自动模式详情页强制打开新窗口(必须是a链接)
	AutoDetailWaitSelector     string                                   //详情等待选择器(用于自动化爬取)
	AutoNextSelector           string                                   //下一页选择器(用于自动化爬取)
	FilterError                bool                                     //过滤错误的行
	DetailUrls                 []string                                 //详情页列表


设置header(包含cookie)

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main repeats the quick-start crawl of www.925g.com and additionally sends
// custom request headers (a cookie and a user agent) via Form.HttpHeader.
func main() {
	// Image path prefixes; both callbacks ignore their arguments and return a
	// constant string.
	imgPrefix := func(form *articleSpider.Form, path string) string {
		return "app"
	}
	contentPrefix := func(form *articleSpider.Form, path string) string {
		return "/api"
	}

	// Request headers; the cookie value is a placeholder.
	headers := map[string]string{
		"cookie":     "xx",
		"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
	}

	// Fields extracted from every detail page.
	detailFields := map[string]articleSpider.Field{
		"title": {
			ExcelHeader: "J",
			Types:       articleSpider.Text,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.hd > h1",
		},
		"img": {
			ExcelHeader: "H",
			Types:       articleSpider.Image,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd img:nth-child(1)",
			ImageDir:    "app",
			ImagePrefix: imgPrefix,
		},
		"content": {
			ExcelHeader: "I",
			Types:       articleSpider.HtmlWithImage,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd",
			ImagePrefix: contentPrefix,
		},
	}

	cfg := articleSpider.Form{
		Host:                  "https://www.925g.com/",
		Channel:               "/zixun_page[PAGE].html/",
		ListSelector:          "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.bdDiv > div > ul > li",
		HrefSelector:          " a",
		PageStart:             1,
		Length:                2,
		DetailFields:          detailFields,
		ListFields:            map[string]articleSpider.Field{},
		CustomExcelHeader:     true,
		DetailCoroutineNumber: 5,
		HttpHeader:            headers,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Normal, context.Background())
	spider.Start()
}

自定义分页链接

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main crawls www.925g.com using an explicit list of list-page URLs supplied
// through Form.ChannelFunc instead of a [PAGE] template in Form.Channel.
func main() {
	// listPages returns the nine list-page paths to crawl. A fresh slice is
	// built on every call.
	listPages := func(form *articleSpider.Form) []string {
		return []string{
			"/zixun_page1.html/",
			"/zixun_page2.html/",
			"/zixun_page3.html/",
			"/zixun_page4.html/",
			"/zixun_page5.html/",
			"/zixun_page6.html/",
			"/zixun_page7.html/",
			"/zixun_page8.html/",
			"/zixun_page9.html/",
		}
	}

	// Image path prefixes; both callbacks ignore their arguments and return a
	// constant string.
	imgPrefix := func(form *articleSpider.Form, path string) string {
		return "app"
	}
	contentPrefix := func(form *articleSpider.Form, path string) string {
		return "/api"
	}

	// Fields extracted from every detail page.
	detailFields := map[string]articleSpider.Field{
		"title": {
			ExcelHeader: "J",
			Types:       articleSpider.Text,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.hd > h1",
		},
		"img": {
			ExcelHeader: "H",
			Types:       articleSpider.Image,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd img:nth-child(1)",
			ImageDir:    "app",
			ImagePrefix: imgPrefix,
		},
		"content": {
			ExcelHeader: "I",
			Types:       articleSpider.HtmlWithImage,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd",
			ImagePrefix: contentPrefix,
		},
	}

	cfg := articleSpider.Form{
		Host:                  "https://www.925g.com",
		ChannelFunc:           listPages,
		ListSelector:          "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.bdDiv > div > ul > li",
		HrefSelector:          " a",
		PageStart:             1,
		Length:                2,
		DetailFields:          detailFields,
		ListFields:            map[string]articleSpider.Field{},
		CustomExcelHeader:     true,
		DetailCoroutineNumber: 5,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Normal, context.Background())
	spider.Start()
}

详情页中间层

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main demonstrates Form.MiddleSelector: the link found on each list item
// leads to an intermediate page, and the selector given in MiddleSelector
// picks the link on that page that leads on to the detail page. The single
// detail field is extracted with a regular expression.
func main() {
	// Link selectors for the intermediate layer between list and detail page.
	middle := []string{
		"body > div:nth-child(3) > div > div.col-lg-wide-75.col-xs-1.padding-0 > div:nth-child(1) > div > div:nth-child(2) > div.stui-content__thumb > a",
	}

	// "url" takes capture group 1 of the pattern.
	detailFields := map[string]articleSpider.Field{
		"url": {
			Types:        articleSpider.Regular,
			Selector:     `"url":"([0-9A-Za-z/\\._:]+)","url_next"`,
			RegularIndex: 1,
		},
	}

	// Request headers sent with every request.
	headers := map[string]string{
		"cookie":     "Hm_lvt_66246be1ec92d6574526bda37cf445cc=1633767654; Hm_lvt_56a5b64a8f7a92a018377c693e064bdf=1633767654; recente=%5B%7B%22vod_name%22%3A%22%E4%B8%80%E7%BA%A7%E6%8C%87%E6%8E%A7%22%2C%22vod_url%22%3A%22https%3A%2F%2Fwww.ahjingcheng.com%2Fplay%2F119516-1-1%2F%22%2C%22vod_part%22%3A%22%E6%AD%A3%E7%89%87%22%7D%2C%7B%22vod_name%22%3A%22%E5%85%BB%E8%80%81%E5%BA%84%E5%9B%AD%22%2C%22vod_url%22%3A%22https%3A%2F%2Fwww.ahjingcheng.com%2Fplay%2F119506-1-1%2F%22%2C%22vod_part%22%3A%221080P%22%7D%2C%7B%22vod_name%22%3A%22%E4%B8%96%E7%95%8C%E4%B8%8A%E6%9C%80%E7%BE%8E%E4%B8%BD%E7%9A%84%E6%88%91%E7%9A%84%E5%A5%B3%22%2C%22vod_url%22%3A%22https%3A%2F%2Fwww.ahjingcheng.com%2Fplay%2F59426-1-1%2F%22%2C%22vod_part%22%3A%22%E5%85%A8%E9%9B%86%22%7D%2C%7B%22vod_name%22%3A%22%E6%9C%BA%E6%A2%B0%E5%B8%882%EF%BC%9A%E5%A4%8D%E6%B4%BB%E8%8B%B1%E6%96%87%E7%89%88%22%2C%22vod_url%22%3A%22https%3A%2F%2Fwww.ahjingcheng.com%2Fplay%2F91322-1-1%2F%22%2C%22vod_part%22%3A%22%E9%AB%98%E6%B8%85%22%7D%5D; Hm_lvt_66246be1ec92d6574526bda37cf445cc=1633767654; Hm_lvt_56a5b64a8f7a92a018377c693e064bdf=1633767654; PHPSESSID=7sfu1ui3crco1a817vocccl2u1; Hm_lpvt_66246be1ec92d6574526bda37cf445cc=1633914645; Hm_lpvt_56a5b64a8f7a92a018377c693e064bdf=1633914645",
		"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
	}

	cfg := articleSpider.Form{
		Host:                  "https://www.ahjingcheng.com",
		Channel:               "/show/dongzuo--------[PAGE]---/",
		ListSelector:          "body > div:nth-child(5) > div > div.col-lg-wide-75.col-xs-1.padding-0 > div:nth-child(2) > div > div.stui-pannel_bd > ul > li",
		HrefSelector:          " div > a",
		PageStart:             1,
		Length:                2,
		MiddleSelector:        middle,
		DetailFields:          detailFields,
		DetailCoroutineNumber: 1,
		HttpHeader:            headers,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Normal, context.Background())
	spider.Start()
}

自行处理爬取结果

package main

import (
	"fmt"
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main crawls ten list pages of www.925g.com, reads a title from each list
// item and hands every result row to a custom ResultCallback that prints it.
//
// The error returned by Start is printed rather than silently dropped.
func main() {

	f := articleSpider.Form{
		Host:         "https://www.925g.com",
		Channel:      "/zixun_page[PAGE].html/",
		ListSelector: "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.bdDiv > div > ul > li",
		HrefSelector: " a",
		PageStart:    1,
		Length:       10,
		// Fields read from the list page itself (selectors are relative to
		// each list item).
		ListFields: map[string]articleSpider.Field{
			"title": {ExcelHeader: "K", Types: articleSpider.Text, Selector: " a > div > span"},
		},
		CustomExcelHeader:     true,
		DetailCoroutineNumber: 2,
		// ResultCallback receives each crawled row as a field-name -> value
		// map. Map iteration order is unspecified, so the print order of the
		// fields may vary between rows.
		ResultCallback: func(item map[string]string, form *articleSpider.Form) {
			for name, value := range item {
				fmt.Println(name, ":", value)
			}
		},
	}

	s := articleSpider.NewSpider(f, articleSpider.Normal, context.Background())

	// Start returns an error; report it instead of discarding it.
	if err := s.Start(); err != nil {
		fmt.Println(err)
	}
}

爬取列表是api的网页

package main

import (
	"context"
	"encoding/json"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main crawls a site whose list pages are JSON API responses (Api mode).
// ApiConversion turns each response body into the list of detail-page links;
// title and content are then extracted from every detail page.
func main() {

	f := articleSpider.Form{
		Host:      "http://www.tiyuxiu.com",
		Channel:   "/data/list_0_[PAGE].json?__t=16339338",
		PageStart: 1,
		Length:    10,
		DetailFields: map[string]articleSpider.Field{
			"title":   {Types: articleSpider.Text, Selector: "h1"},
			"content": {Types: articleSpider.HtmlWithImage, Selector: "#main-content"},
		},
		//CustomExcelHeader:     true,
		DetailCoroutineNumber: 2,
		// ApiConversion decodes the response body as a JSON array of objects
		// and returns the Url of each element.
		ApiConversion: func(html string, form *articleSpider.Form) []string {
			// entry mirrors one element of the array; only Url is read
			// (encoding/json matches the key case-insensitively).
			type entry struct {
				Url string
			}

			var entries []entry

			// A malformed payload yields no links at all, rather than
			// whatever prefix happened to decode before the error.
			if err := json.Unmarshal([]byte(html), &entries); err != nil {
				return nil
			}

			var links []string

			for _, e := range entries {
				links = append(links, e.Url)
			}

			return links
		},
	}

	s := articleSpider.NewSpider(f, articleSpider.Api, context.Background()).Debug()

	// NOTE(review): Start returns an error that this example does not inspect.
	s.Start()
}

自动化模式

package main

import (
	"context"
	"fmt"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main crawls www.925g.com in Auto (browser automation) mode: pages are
// advanced through the next-page link, and each detail page is considered
// ready once its title element is present. Any error from Start is printed.
func main() {
	// The content field stores its images under "app" and prefixes their
	// result paths with "app".
	contentPrefix := func(form *articleSpider.Form, path string) string {
		return "app"
	}

	// Fields extracted from every detail page.
	detailFields := map[string]articleSpider.Field{
		"title": {
			ExcelHeader: "J",
			Types:       articleSpider.Text,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.hd > h1",
		},
		"content": {
			ExcelHeader: "H",
			Types:       articleSpider.HtmlWithImage,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd",
			ImageDir:    "app",
			ImagePrefix: contentPrefix,
		},
	}

	// Request headers; only a cookie is sent.
	headers := map[string]string{
		"cookie": "user_cookie=Vmod7XlkHN; UM_distinctid=17b805b421c1e0-0005d3dc1ac8ea-c343365-1fa400-17b805b421dda7; url_data=https://www.925g.com/zixun/,https://www.925g.com/; PHPSESSID=3m0ee50ba4r40jq3fleob2n71i; CNZZDATA1278942394=1852940385-1600066493-%7C1635143024; Hm_lvt_46233f03c62deb1e98a07bf1e1708415=1634807167,1634887947,1634955841,1635153418; Hm_lpvt_46233f03c62deb1e98a07bf1e1708415=1635153430",
	}

	// AutoNextSelector is the next-page link and AutoDetailWaitSelector the
	// element awaited on a detail page. An optional list wait selector is not
	// set here; it would look like:
	//   AutoListWaitSelector: "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.bdDiv > div > ul > li:nth-child(1)",
	cfg := articleSpider.Form{
		Host:                   "https://www.925g.com",
		Channel:                "/zixun/",
		ListSelector:           "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.bdDiv > div > ul > li",
		HrefSelector:           "  a",
		AutoNextSelector:       "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.bdDiv > ul > li:nth-child(11) > a",
		AutoDetailWaitSelector: "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.hd > h1",
		Length:                 3,
		DetailFields:           detailFields,
		HttpHeader:             headers,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Auto, context.Background())

	if err := spider.Start(); err != nil {
		fmt.Println(err)
	}
}

自动化模式爬取加载更多页面

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"

	"github.com/chromedp/chromedp"
)

func main() {

	f := articleSpider.Form{
		Host:         "https://www.btcfans.com",
		Channel:      "/zh-cn/wallet",
		ListSelector: "body > div.page-width.page-content > div.main-content > div > div.module-content > ul > li",
		HrefSelector: " a",
		//下一页选择器
		AutoNextSelector: "body > div.page-width.page-content > div.main-content > div > div.module-content > a",
		//列表等待选择器
		AutoListWaitSelector: "body > div.page-width.page-content > div.main-content > div > div.module-content > ul > li:nth-child(1)",
		//详情等待选择器
		AutoDetailWaitSelector: "body > div.page-width.page-content > div.main-content > div.wallet-detail-page > div.info_1 > div.name > div.name-ch",
		Length:                 4,
		DetailFields: map[string]articleSpider.Field{
			"title": {ExcelHeader: "G", Types: articleSpider.Text, Selector: "body > div.page-width.page-content > div.main-content > div.wallet-detail-page > div.info_1 > div.name > div.name-ch"},
			"content": {Types: articleSpider.HtmlWithImage, Selector: "body > div.page-width.page-content > div.main-content > div.wallet-detail-page > div.wallet-des > div > p", ExcelHeader: "E", ImagePrefix: func(form *articleSpider.Form, imageName string) string {

				return "/api/uploads"
			}, ImageDir: "game[date:md]/[random:1-100]"},
			"desc":    {Types: articleSpider.Attr, Selector: "meta[name=\"description\"]", AttrKey: "content", ExcelHeader: "H"},
			"keyword": {Types: articleSpider.Attr, Selector: "meta[name=\"keywords\"]", AttrKey: "content", ExcelHeader: "K"},
			"img":     {Types: articleSpider.Image, Selector: "body > div.page-width.page-content > div.main-content > div.wallet-detail-page > div.info_1 > div.cover > img", ExcelHeader: "F", ImageDir: "game[date:md]/[random:1-100]"},
			"type":    {Types: articleSpider.Fixed, Selector: "2", ExcelHeader: "L"},
			//"size":    {Types: fileTypes.SingleField, Selector: "#dinfo > p.base > i:nth-child(3)", ExcelHeader: "M"},
		},

		//cookie
		HttpHeader: map[string]string{
			"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
			"cookie":     "lang=zh-CN; lang=zh-CN; lang=zh-CN; _ga=GA1.1.1532009431.1641283813; UM_distinctid=17e24238a22739-0fc0995e9cfdad-c343365-1fa400-17e24238a2352e; guid=cff3a072d6ca30b80ee729f0884a8596f65d9a28; CNZZDATA5291371=cnzz_eid%3D1358048227-1641278212-%26ntime%3D1641338428; CNZZDATA1278599438=848177868-1641279863-%7C1641340242; Hm_lvt_ddaa34551214df42d1e5f11974f9f744=1641283822,1641346329; _csrf=3f62bc78510faa5fecfbf404cbee0ec56d1c4f3a; s_a=1; _ga_76F07DJEB4=GS1.1.1641346328.3.1.1641346978.0; Hm_lpvt_ddaa34551214df42d1e5f11974f9f744=1641346980",
		},
		//下一页模式
		AutoNextPageMode:  articleSpider.LoadMore,
		CustomExcelHeader: true,
		//爬取前置事件
		AutoPrefixEvent: func(chromedpCtx context.Context) {

			//关闭弹窗
			chromedp.Run(
				chromedpCtx,

				chromedp.Click("#Alert > div > div.sure_btn", chromedp.ByQuery),
			)

		},
	}

	s := articleSpider.NewSpider(f, articleSpider.Auto,context.Background())

	s.Start()

}

代理

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main crawls the product list of www.cgcosplay.jp through an HTTP proxy
// (Form.HttpProxy). Only list-page fields are collected; product images are
// lazy-loaded, so the image URL is read from the data-src attribute.
func main() {
	// Prefix placed in front of the image path of the img field.
	imgPrefix := func(form *articleSpider.Form, path string) string {
		return "cgcosplay_image"
	}

	// Fields read from each list item.
	listFields := map[string]articleSpider.Field{
		"title": {
			ExcelHeader: "A",
			Types:       articleSpider.Text,
			Selector:    "div > a > div > div.list_item_data > p.item_name > span.goods_name",
		},
		"price": {
			ExcelHeader: "B",
			Types:       articleSpider.Text,
			Selector:    "div > a > div > div.list_item_data > div > div > p.selling_price > span.figure",
		},
		"img": {
			ExcelHeader: "C",
			Types:       articleSpider.Image,
			Selector:    "  div > a > div > div.list_item_photo > div > div",
			ImageDir:    "cgcosplay_image",
			ImagePrefix: imgPrefix,
		},
	}

	cfg := articleSpider.Form{
		Host:                  "https://www.cgcosplay.jp",
		Channel:               "/product-list?page=[PAGE]",
		ListSelector:          "#inner_main_container > section > div > div.page_contents.clearfix.alllist_contents > div > div.itemlist_box.tiled_list_box.layout_photo > div > ul > li",
		HrefSelector:          " div > a",
		PageStart:             1,
		Length:                10,
		ListFields:            listFields,
		CustomExcelHeader:     true,
		DetailCoroutineNumber: 10,
		LazyImageAttrName:     "data-src",
		HttpProxy:             "http://127.0.0.1:4780",
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Normal, context.Background())
	spider.Start()
}

排除不需要的元素

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
	
)

// main shows Field.NotSelector: the article body of each detail page on
// www.3h3.com is extracted, and the nodes matched by NotSelector are removed
// from it.
func main() {
	// Nodes to strip from the extracted content.
	unwanted := []string{
		"body > div.main > div > div > div.col-l > div.art-body > div",
	}

	detailFields := map[string]articleSpider.Field{
		"content": {
			Types:       articleSpider.HtmlWithImage,
			Selector:    "body > div.main > div > div > div.col-l > div.art-body",
			NotSelector: unwanted,
		},
	}

	// A single list page is crawled, starting at page 2.
	cfg := articleSpider.Form{
		Host:         "http://www.3h3.com",
		Channel:      "/news/g_38_[PAGE].html",
		ListSelector: "body > div.main > div > div > div.col-l > ul.ul-info > li",
		HrefSelector: "  div.pic > a",
		PageStart:    2,
		Length:       1,
		DetailFields: detailFields,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Normal, context.Background())
	spider.Start()
}

根据详情页链接爬取

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
)

// main crawls a fixed set of detail pages in Url mode: no list page is
// involved, the pages named in Form.DetailUrls are fetched directly. Rows
// with errors are filtered out (FilterError).
func main() {
	// Detail pages to fetch.
	pages := []string{
		"https://www.925g.com/gonglue/138499.html",
		"https://www.925g.com/gonglue/138498.html",
		"https://www.925g.com/gonglue/138497.html",
		"https://www.925g.com/gonglue/138496.html",
		"https://www.925g.com/gonglue/138495.html",
		"https://www.925g.com/gonglue/138494.html",
	}

	// Prefix placed in front of the image paths of the content field.
	contentPrefix := func(form *articleSpider.Form, path string) string {
		return "/api"
	}

	// Fields extracted from every detail page. ImageDir uses the dynamic
	// [date:...] and [random:...] placeholders.
	detailFields := map[string]articleSpider.Field{
		"title": {
			Types:    articleSpider.Text,
			Selector: "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.hd > h1",
		},
		"img": {
			Types:    articleSpider.Image,
			Selector: "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd img:nth-child(1)",
			ImageDir: "[date:md]/[random:1-100]",
		},
		"content": {
			Types:       articleSpider.HtmlWithImage,
			Selector:    "body > div.ny-container.uk-background-default > div.wrap > div > div.commonLeftDiv.uk-float-left > div > div.articleDiv > div.bd",
			ImagePrefix: contentPrefix,
			ImageDir:    "[date:md]/[random:1-100]",
		},
	}

	cfg := articleSpider.Form{
		Host:                  "https://www.925g.com/",
		DetailUrls:            pages,
		DetailFields:          detailFields,
		DetailCoroutineNumber: 3,
		FilterError:           true,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Url, context.Background())
	spider.Start()
}

结果过滤

package main

import (
	"context"
	articleSpider "github.com/PeterYangs/article-spider/v4"
	"strings"
	"time"
)

// main crawls list pages of www.xyzs.com and keeps only the rows whose title
// contains "直播": Form.Filter returns false for every other row, which
// discards it.
func main() {
	// keepLive decides whether a row is kept. It pauses for 100ms after the
	// decision has been computed and before returning it.
	keepLive := func(m map[string]string) bool {
		keep := strings.Contains(m["title"], "直播")
		time.Sleep(100 * time.Millisecond)
		return keep
	}

	// Fields read from each list item.
	listFields := map[string]articleSpider.Field{
		"title": {
			Types:    articleSpider.Text,
			Selector: " a > p.name",
		},
	}

	// 100 list pages are crawled, starting at page 51.
	cfg := articleSpider.Form{
		Host:                  "https://www.xyzs.com",
		Channel:               "/app/soft/index_[PAGE].html",
		ListSelector:          "body > div.wrapper > section.aplist > ul > li",
		PageStart:             51,
		Length:                100,
		ListFields:            listFields,
		DetailCoroutineNumber: 1,
		FilterError:           true,
		Filter:                keepLive,
	}

	spider := articleSpider.NewSpider(cfg, articleSpider.Normal, context.Background())
	spider.Start()
}

关于图片保存路径说明

Field中的图片路径设置

ImageDir:图片生成路径,该路径会出现在结果中,支持动态变量;ImagePrefix:图片路径前缀,只会拼接到结果路径前面,不会生成文件夹

全局设置

SetImageDir(path),图片保存前缀,不会出现在结果中,默认是image

SetSavePath(path),图片保存文件夹,不会出现在结果中

图片保存路径拼接顺序:savePath+imageDir(全局)+imageDir(field)+文件名;图片结果路径拼接顺序:imagePrefix+imageDir(field)+文件名

Documentation

Index

Constants

View Source
// Message kinds, numbered consecutively from 0.
// NOTE(review): presumably these correspond to the like-named Notice methods
// (Info, Debug, Error, Log, Process, Finish) — confirm against the source.
const (
	Info = iota
	Debug
	Error
	Log
	Process
	Finish
)

Variables

This section is empty.

Functions

func If

func If(condition bool, trueVal, falseVal interface{}) interface{}

If 伪三元运算

func NewApi

func NewApi(s *Spider) *api

func NewAuto

func NewAuto(s *Spider) *auto

func NewExcel

func NewExcel(s *Spider) *excel

func NewNormal

func NewNormal(s *Spider) *normal

func NewResult

func NewResult(s *Spider) *result

func NewUrl added in v4.0.5

func NewUrl(s *Spider) *url

Types

type Field

// Field configures the extraction of one named value; Form.DetailFields and
// Form.ListFields map field names to Field values.
type Field struct {
	Types              FieldTypes
	Selector           string                                              // field selector
	NotSelector        []string                                            // exclusion selectors applied afterwards: the item's document is obtained first, then the matching nodes are removed
	PrefixNotSelector  []string                                            // exclusion selectors applied up front: the matching nodes are removed from the HTML first
	AttrKey            string                                              // name of the attribute to read
	ImagePrefix        func(form *Form, imageName string) string           // prefix added in front of the image path; it does not create a folder
	ImageDir           string                                              // image sub-folder; supports the variables 1.[date:Y-m-d] 2.[random:1-100] 3.[singleField:title]
	ExcelHeader        string                                              // Excel column header; requires CustomExcelHeader to be true, e.g. A
	RegularIndex       int                                                 // index of the capture group taken from the regex match; documented default is 1
	ConversionFunc     func(data string, resList map[string]string) string // conversion hook: the first argument is this field's data, the second is all collected data (similar to an accessor in a web framework)
	LazyImageAttrName  string                                              // attribute holding the lazy-loaded image URL; documented default is data-original
	ImageResizePercent int                                                 // image resize percentage

}

type FieldTypes

// FieldTypes selects how the value of a Field is obtained.
type FieldTypes int

// Supported field kinds. The numbering is not contiguous: 1 is unused, and
// Attrs is 16 (hexadecimal 0x10), not 10.
const (
	Text           FieldTypes = 0  // single text field
	Image          FieldTypes = 2  // single image
	OnlyHtml       FieldTypes = 3  // plain HTML (images excluded)
	HtmlWithImage  FieldTypes = 4  // HTML including images
	MultipleImages FieldTypes = 5  // multiple images
	Attr           FieldTypes = 6  // tag attribute selector
	Fixed          FieldTypes = 7  // fixed data: the selector itself is returned as the value
	Regular        FieldTypes = 8  // regular expression (FindStringSubmatch, returns one result)
	File           FieldTypes = 9  // file
	Attrs          FieldTypes = 16 // attribute list, e.g. every image URL of an image list
)

type Form

// Form is the crawl configuration passed to NewSpider.
type Form struct {
	Host                       string                                   // site domain
	Channel                    string                                   // list (channel) URL; the page number replaces [PAGE]
	PageStart                  int                                      // first page number
	PageCurrent                int                                      // current page number
	ListUrlCurrent             string                                   // current list URL
	Length                     int                                      // number of pages to crawl
	ListSelector               string                                   // list selector
	HrefSelector               string                                   // <a> link selector, relative to the list selector
	DisableAutoCoding          bool                                     // disable automatic charset conversion
	DetailFields               map[string]Field                         // field selectors for the detail page
	ListFields                 map[string]Field                         // field selectors for the list page; not yet supported for API crawling
	HttpTimeout                time.Duration                            // request timeout
	HttpHeader                 map[string]string                        // request headers
	HttpProxy                  string                                   // proxy
	ProxyFunc                  func() (Proxy, error)                    // obtains a proxy IP dynamically; do not combine with HttpProxy (the IP is switched on every page turn)
	ProxyFinishFunc            func(Proxy)                              // callback invoked once a dynamic proxy IP is no longer in use
	ChannelFunc                func(form *Form) []string                // custom list URLs
	DetailCoroutineNumber      int                                      // number of goroutines crawling detail pages
	LazyImageAttrName          string                                   // attribute holding the lazy-loaded image URL; documented default is data-original
	DisableImageExtensionCheck bool                                     // disable the image extension check; when disabled every image extension is forced to png
	AllowImageExtension        []string                                 // image extensions allowed for download
	DefaultImg                 func(form *Form, item Field) string      // supplies a default image when an image fails
	MiddleSelector             []string                                 // intermediate-layer selectors (<a> link selectors), used when the detail page is several levels deep; not yet supported in auto mode
	CustomExcelHeader          bool                                     // use custom Excel column headers
	ResultCallback             func(item map[string]string, form *Form) // custom callback that receives each crawl result
	ApiConversion              func(html string, form *Form) []string   // extracts the links from an API response
	AutoPrefixEvent            func(chromedpCtx context.Context)        // hook run before crawling in auto mode
	AutoListWaitSelector       string                                   // selector to wait for on the list page (auto mode)
	AutoNextPageMode           NextPageMode                             // next-page mode (auto mode; regular pagination and load-more are currently supported)
	AutoDetailForceNewTab      bool                                     // in auto mode, force detail pages to open in a new window (must be an <a> link)
	AutoDetailWaitSelector     string                                   // selector to wait for on the detail page (auto mode)
	AutoNextSelector           string                                   // next-page selector (auto mode)
	FilterError                bool                                     // filter out rows that had errors
	DetailUrls                 []string                                 // list of detail page URLs
	Filter                     func(map[string]string) bool             // data filter; returning false discards the row
	ListSleep                  time.Duration                            // pause after each list page has been crawled
	DetailSleep                time.Duration                            // pause after each detail page has been crawled (mind multiple goroutines)
	DisableMessage             bool                                     // disable terminal output
	ListGetErrorFunc           func(form *Form, err error)              // callback when fetching a list fails (list length is 0 or status code is not 200)
	// contains filtered or unexported fields
}

func (*Form) DealCoding

func (f *Form) DealCoding(html string, header http2.Header) (string, error)

DealCoding 解决编码问题

func (*Form) DownImg

func (f *Form) DownImg(url string, item Field, res *sync.Map) (string, error)

DownImg 下载图片(包括生成文件夹)

func (*Form) GetCharsetByContentType

func (f *Form) GetCharsetByContentType(contentType string) string

GetCharsetByContentType 从contentType中获取编码

func (*Form) GetDir

func (f *Form) GetDir(path string, res *sync.Map) string

func (*Form) GetHref

func (f *Form) GetHref(href string) string

GetHref 获取完整a链接

func (*Form) GetHtml

func (f *Form) GetHtml(url string) (string, error)

GetHtml 从链接中获取html

func (*Form) ResolveSelector

func (f *Form) ResolveSelector(html string, selector map[string]Field, originUrl string) (*Rows, error)

ResolveSelector 解析选择器

type Mode

// Mode selects the crawl mode passed to NewSpider.
type Mode int

// Crawl modes, numbered consecutively from 0.
const (
	Normal Mode = iota // regular mode
	Api                // API mode
	Auto               // automation mode
	Url                // detail-page URL mode
)

func (Mode) ToString added in v4.0.6

func (m Mode) ToString() string

type NextPageMode

// NextPageMode is the type of Form.AutoNextPageMode.
type NextPageMode int

// Next-page modes, numbered consecutively from 0.
const (
	Pagination NextPageMode = iota // regular pagination
	LoadMore                       // load more
)

type Notice

type Notice struct {
	// contains filtered or unexported fields
}

func NewNotice

func NewNotice(s *Spider) *Notice

func (*Notice) Debug

func (n *Notice) Debug(content ...interface{})

func (*Notice) Error

func (n *Notice) Error(content ...interface{})

func (*Notice) Finish

func (n *Notice) Finish(content ...interface{})

func (*Notice) Info

func (n *Notice) Info(content ...interface{})

func (*Notice) Log

func (n *Notice) Log(content ...interface{})

func (*Notice) Process

func (n *Notice) Process(content ...interface{})

func (*Notice) Service

func (n *Notice) Service()

func (*Notice) Stop

func (n *Notice) Stop()

type Proxy added in v4.0.31

// Proxy describes a proxy endpoint; it is the value returned by
// Form.ProxyFunc and passed to Form.ProxyFinishFunc.
type Proxy struct {
	Scheme string // protocol (e.g. http, https, sock)
	Host   string
	Port   int
	Expire int64 // expiry time, as a timestamp
}

type Rows added in v4.0.3

type Rows struct {
	// contains filtered or unexported fields
}

func NewRows added in v4.0.3

func NewRows(m map[string]string) *Rows

func (*Rows) Add added in v4.0.3

func (r *Rows) Add(rr *Rows)

Add 合并两个结果

type Spider

// Spider is the crawler handle returned by NewSpider.
type Spider struct {
	CustomDownloadFun func(imgUrl string, imgPath string, f *Form, item Field) error // custom image download implementation
	// contains filtered or unexported fields
}

func NewSpider

func NewSpider(f Form, mode Mode, cxt context.Context) *Spider

func (*Spider) CustomDownloadImage added in v4.0.7

func (s *Spider) CustomDownloadImage(fun func(imgUrl string, imgPath string, f *Form, item Field) error) *Spider

CustomDownloadImage 自实现图片下载

func (*Spider) Debug

func (s *Spider) Debug() *Spider

func (*Spider) SetImageDir added in v4.0.1

func (s *Spider) SetImageDir(path string)

SetImageDir 设置图片文件夹

func (*Spider) SetSavePath added in v4.0.1

func (s *Spider) SetSavePath(path string)

SetSavePath 图片保存文件夹,不会出现在图片路径中,为空则为当前运行路径

func (*Spider) Start

func (s *Spider) Start() error

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL