Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var GoogleSearch = &Spider{ Name: "Google search", Description: "Crawls pages from [www.google.com]", Keyin: KEYIN, Limit: LIMIT, EnableCookie: false, RuleTree: &RuleTree{ Root: func(ctx *Context) { var url string var success bool logs.Log.Informational("Running google spider,this may take some time...") for _, ip := range googleIp { url = "http://" + ip + "/?gws_rd=ssl#q=" + ctx.GetKeyin() logs.Log.Informational("测试 " + ip) if _, err := goquery.NewDocument(url); err == nil { success = true break } } if !success { logs.Log.Critical("Could not reach any of the Google mirrors") return } logs.Log.Critical("Starting Google search ...") ctx.AddQueue(&request.Request{ Url: url, Rule: "total_pages", Temp: map[string]interface{}{ "baseUrl": url, }, }) }, Trunk: map[string]*Rule{ "total_pages": { AidFunc: func(ctx *Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&request.Request{ Url: aid["urlBase"].(string) + "&start=" + strconv.Itoa(10*loop[0]), Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() txt := query.Find("#resultStats").Text() re, _ := regexp.Compile(`,+`) txt = re.ReplaceAllString(txt, "") re, _ = regexp.Compile(`[\d]+`) txt = re.FindString(txt) num, _ := strconv.Atoi(txt) total := int(math.Ceil(float64(num) / 10)) if total > ctx.GetLimit() { total = ctx.GetLimit() } else if total == 0 { logs.Log.Critical("[ERROR:| Spider:%v | KEYIN:%v | Rule:%v] Did not fetch any data!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) return } ctx.Aid(map[string]interface{}{ "loop": [2]int{1, total}, "urlBase": ctx.GetTemp("baseUrl", ""), "Rule": "search_results", }) ctx.Parse("search_results") }, }, "search_results": { ItemFields: []string{ "title", "content", "href", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() query.Find("#ires .g").Each(func(i int, s *goquery.Selection) { t := s.Find(".r > a") href, _ := t.Attr("href") href = strings.TrimLeft(href, "/url?q=") logs.Log.Informational(href) title := t.Text() content := s.Find(".st").Text() ctx.Output(map[int]interface{}{ 0: title, 1: content, 2: href, }) }) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.