Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var FileTest = &Spider{ Name: "中国新闻网", Description: "测试 [http://www.chinanews.com/scroll-news/news1.html]", EnableCookie: false, RuleTree: &RuleTree{ Root: func(ctx *Context) { ctx.AddQueue(&request.Request{ Url: "http://www.chinanews.com/scroll-news/news1.html", Rule: "滚动新闻", }) }, Trunk: map[string]*Rule{ "滚动新闻": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() navBox := query.Find(".pagebox a") navBox.Each(func(i int, s *goquery.Selection) { if url, ok := s.Attr("href"); ok { ctx.AddQueue(&request.Request{ Url: "http://www.chinanews.com" + url, Rule: "新闻列表", }) } }) }, }, "新闻列表": { ParseFunc: func(ctx *Context) { query := ctx.GetDom() newList := query.Find(".content_list li") newList.Each(func(i int, s *goquery.Selection) { newsType := s.Find(".dd_lm a").Text() newsTitle := s.Find(".dd_bt a").Text() newsTime := s.Find(".dd_time").Text() if url, ok := s.Find(".dd_bt a").Attr("href"); ok { ctx.AddQueue(&request.Request{ Url: "http://" + url[2:len(url)], Rule: "新闻内容", Temp: map[string]interface{}{ "newsType": newsType, "newsTitle": newsTitle, "newsTime": newsTime, }, }) } }) }, }, "新闻内容": { ItemFields: []string{ "类别", "来源", "标题", "内容", "时间", }, ParseFunc: func(ctx *Context) { query := ctx.GetDom() content := query.Find(".left_zw").Text() from := query.Find(".left-t").Text() i := strings.LastIndex(from, "来源") if i == -1 { from = "未知" } else { from = from[i+9 : len(from)] from = strings.Replace(from, "参与互动", "", 1) if from == "" { from = query.Find(".left-t").Eq(2).Text() from = strings.Replace(from, "参与互动", "", 1) } } ctx.Output(map[int]interface{}{ 0: ctx.GetTemp("newsType", ""), 1: from, 2: ctx.GetTemp("newsTitle", ""), 3: content, 4: ctx.GetTemp("newsTime", ""), }) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.