Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var TaobaoSearch = &Spider{ Name: "淘宝天猫搜索", Description: "淘宝天猫搜索结果 [s.taobao.com]", Keyin: KEYIN, Limit: LIMIT, EnableCookie: false, RuleTree: &RuleTree{ Root: func(ctx *Context) { ctx.Aid(map[string]interface{}{"loop": [2]int{0, 1}, "Rule": "生成请求"}, "生成请求") }, Trunk: map[string]*Rule{ "生成请求": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&request.Request{ Url: "http://s.taobao.com/search?q=" + ctx.GetKeyin() + "&ie=utf8&cps=yes&app=vproduct&cd=false&v=auction&tab=all&vlist=1&bcoffset=1&s=" + strconv.Itoa(loop[0]*44), Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() src := query.Find("script").Text() if strings.Contains(src, "抱歉!没有找到与") { logs.Log.Critical(" ********************** 淘宝关键词 [%v] 的搜索结果不存在! ********************** ", ctx.GetKeyin()) return } re, _ := regexp.Compile(`(?U)"totalCount":[\d]+}`) total := re.FindString(src) re, _ = regexp.Compile(`[\d]+`) total = re.FindString(total) totalCount, _ := strconv.Atoi(total) maxPage := (totalCount - 4) / 44 if (totalCount-4)%44 > 0 { maxPage++ } if ctx.GetLimit() > maxPage || ctx.GetLimit() == 0 { ctx.SetLimit(maxPage) } else if ctx.GetLimit() == 0 { logs.Log.Critical("[消息提示:| 任务:%v | KEYIN:%v | 规则:%v] 没有抓取到任何数据!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) return } logs.Log.Critical(" ********************** 淘宝关键词 [%v] 的搜索结果共有 %v 页,计划抓取 %v 页 **********************", ctx.GetKeyin(), maxPage, ctx.GetLimit()) ctx.Aid(map[string]interface{}{"loop": [2]int{1, ctx.GetLimit()}, "Rule": "搜索结果"}) ctx.Parse("搜索结果") }, }, "搜索结果": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() src := query.Find("script").Text() re, _ := regexp.Compile(`"auctions".*,"recommendAuctions"`) src = re.FindString(src) re, _ = regexp.Compile(`"auctions":`) src = re.ReplaceAllString(src, "") re, _ = regexp.Compile(`,"recommendAuctions"`) src = re.ReplaceAllString(src, "") re, _ = regexp.Compile("\\<[\\S\\s]+?\\>") src = re.ReplaceAllString(src, " ") src = strings.Trim(src, " \t\n") infos := []map[string]interface{}{} err := json.Unmarshal([]byte(src), &infos) if err != nil { logs.Log.Error("error is %v\n", err) return } else { for _, info := range infos { ctx.AddQueue(&request.Request{ Url: "http:" + info["detail_url"].(string), Rule: "商品详情", Temp: ctx.CreatItem(map[int]interface{}{ 0: info["raw_title"], 1: info["view_price"], 2: info["view_sales"], 3: info["nick"], 4: info["item_loc"], }, "商品详情"), Priority: 1, }) } } }, }, "商品详情": { ItemFields: []string{ "标题", "价格", "销量", "店铺", "发货地", }, ParseFunc: func(ctx *Context) { r := ctx.CopyTemps() re := regexp.MustCompile(`"newProGroup":.*,"progressiveSupport"`) d := re.FindString(ctx.GetText()) if d == "" { h, _ := ctx.GetDom().Find(".attributes-list").Html() d = UnicodeToUTF8(h) d = strings.Replace(d, " ", " ", -1) d = CleanHtml(d, 5) d = strings.Replace(d, "产品参数:\n", "", -1) for _, v := range strings.Split(d, "\n") { if v == "" { continue } feild := strings.Split(v, ":") feild[0] = strings.Trim(feild[0], " ") feild[1] = strings.Trim(feild[1], " ") if feild[0] == "" || feild[1] == "" { continue } ctx.UpsertItemField(feild[0]) r[feild[0]] = feild[1] } } else { d = strings.Replace(d, `"newProGroup":`, "", -1) d = strings.Replace(d, `,"progressiveSupport"`, "", -1) infos := []map[string]interface{}{} err := json.Unmarshal([]byte(d), &infos) if err != nil { logs.Log.Error("error is %v\n", err) return } else { for _, info := range infos { for _, attr := range info["attrs"].([]interface{}) { a := attr.(map[string]interface{}) ctx.UpsertItemField(a["name"].(string)) r[a["name"].(string)] = a["value"] } } } } ctx.Output(r) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.