Documentation ¶
Index ¶
- func Crawl(ctx context.Context, concurrencyConfig ConcurrencyConfig, links []string, ...)
- func CrawlByConcurrentHandler(ctx context.Context, concurrencyConfig ConcurrencyConfig, ...)
- func HandleLink(ctx context.Context, threadID int, link string, ...) []string
- func HandleLinks(ctx context.Context, threadID int, links chan string, ...)
- func HandleLinksConcurrently(ctx context.Context, concurrencyFactor int, links chan string, ...)
- type ConcurrencyConfig
- type CrawlDependencies
- type HandleLinkDependencies
- type LinkChecker
- type LinkExtractor
- type LinkHandler
- type Logger
- type Waiter
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func Crawl ¶
func Crawl( ctx context.Context, concurrencyConfig ConcurrencyConfig, links []string, dependencies CrawlDependencies, )
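Crawl crawls pages starting from the passed links and returns when the crawling is finished. Judging by the examples below, links are extracted from each page by the LinkExtractor dependency, every extracted link is passed to the LinkHandler dependency, and only the links accepted by the LinkChecker dependency are crawled further.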
Example ¶
package main

import (
	"compress/gzip"
	"context"
	"fmt"
	"html/template"
	"io"
	stdlog "log"
	"net/http"
	"net/http/httptest"
	"os"
	"path"
	"runtime"
	"strings"

	"github.com/go-log/log/print"
	crawler "github.com/thewizardplusplus/go-crawler"
	"github.com/thewizardplusplus/go-crawler/checkers"
	"github.com/thewizardplusplus/go-crawler/extractors"
	"github.com/thewizardplusplus/go-crawler/extractors/transformers"
	"github.com/thewizardplusplus/go-crawler/models"
	urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
	htmlselector "github.com/thewizardplusplus/go-html-selector"
)

// LinkHandler prints every link passed to it together with the page
// the link was found on.
type LinkHandler struct {
	// Name is an optional label; when it is not empty, it is printed
	// in square brackets before each message.
	Name string
	// ServerURL is the URL of the test server; it is replaced
	// in the printed links.
	ServerURL string
}

// HandleLink prints the link and its source page to the standard output.
func (handler LinkHandler) HandleLink(
	ctx context.Context,
	link models.SourcedLink,
) {
	var prefix string
	if handler.Name != "" {
		prefix = fmt.Sprintf("[%s] ", handler.Name)
	}

	fmt.Printf(
		"%sreceived link %q from page %q\n",
		prefix,
		handler.replaceServerURL(link.Link),
		handler.replaceServerURL(link.SourceLink),
	)
}

// replace the test server URL for reproducibility of the example
func (handler LinkHandler) replaceServerURL(link string) string {
	return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
}

// RunServer starts a test HTTP server that serves a robots.txt file,
// gzip-compressed Sitemap files and HTML pages with links.
//
// nolint: gocyclo
func RunServer() *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(
		writer http.ResponseWriter,
		request *http.Request,
	) {
		if request.URL.Path == "/robots.txt" {
			sitemapLink :=
				completeLinkWithHost("/sitemap_from_robots_txt.xml", request.Host)
			// each robots.txt directive has to be placed on its own line
			fmt.Fprintf(
				writer,
				"User-agent: go-crawler\nDisallow: /2\nSitemap: %s\n",
				sitemapLink,
			)

			return
		}

		var links []string
		switch request.URL.Path {
		case "/sitemap.xml":
			links = []string{"/1", "/2", "/hidden/1", "/hidden/2"}
		case "/sitemap_from_robots_txt.xml":
			links = []string{"/hidden/3", "/hidden/4"}
		case "/hidden/1/sitemap.xml":
			links = []string{"/hidden/5", "/hidden/6"}
		case "/1/sitemap.xml", "/2/sitemap.xml", "/hidden/sitemap.xml":
			links = []string{}
		}
		for index := range links {
			links[index] = completeLinkWithHost(links[index], request.Host)
		}
		// a non-nil slice (even an empty one) means that a Sitemap file
		// was requested
		if links != nil {
			writer.Header().Set("Content-Encoding", "gzip")

			compressingWriter := gzip.NewWriter(writer)
			defer func() {
				if err := compressingWriter.Close(); err != nil {
					stdlog.Printf("unable to close the compressing writer: %v", err)
				}
			}()

			renderTemplate(compressingWriter, links, `
				<?xml version="1.0" encoding="UTF-8" ?>
				<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
					{{ range $link := . }}
						<url>
							<loc>{{ $link }}</loc>
						</url>
					{{ end }}
				</urlset>
			`)

			return
		}

		switch request.URL.Path {
		case "/":
			links = []string{"/1", "/2", "/2", "https://golang.org/"}
		case "/1":
			links = []string{"/1/1", "/1/2"}
		case "/2":
			links = []string{"/2/1", "/2/2"}
		case "/hidden/1":
			links = []string{"/hidden/1/test"}
		}

		renderTemplate(writer, links, `
			<ul>
				{{ range $link := . }}
					<li>
						<a href="{{ $link }}">{{ $link }}</a>
					</li>
				{{ end }}
			</ul>
		`)
	}))
}

// completeLinkWithHost builds an absolute HTTP link from the host
// and the path of the link.
func completeLinkWithHost(link string, host string) string {
	return "http://" + path.Join(host, link)
}

// renderTemplate parses the text as a template and executes it with the data.
// It panics on an incorrect template text, since that is a mistake
// in the example itself, and logs an execution error.
func renderTemplate(writer io.Writer, data interface{}, text string) {
	parsedTemplate := template.Must(template.New("").Parse(text))
	if err := parsedTemplate.Execute(writer, data); err != nil {
		stdlog.Printf("unable to render the template: %v", err)
	}
}

// main crawls the test server with the default extractor, the host checker
// and a single link handler.
func main() {
	server := RunServer()
	defer server.Close()

	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
	// wrap the standard logger via the github.com/go-log/log package
	wrappedLogger := print.New(logger)

	crawler.Crawl(
		context.Background(),
		crawler.ConcurrencyConfig{
			ConcurrencyFactor: runtime.NumCPU(),
			BufferSize:        1000,
		},
		[]string{server.URL},
		crawler.CrawlDependencies{
			LinkExtractor: extractors.DefaultExtractor{
				HTTPClient: http.DefaultClient,
				Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
					"a": {"href"},
				}),
				LinkTransformer: transformers.ResolvingTransformer{
					BaseTagSelection: transformers.SelectFirstBaseTag,
					BaseTagFilters:   transformers.DefaultBaseTagFilters,
					BaseHeaderNames:  urlutils.DefaultBaseHeaderNames,
					Logger:           wrappedLogger,
				},
			},
			LinkChecker: checkers.HostChecker{
				ComparisonResult: urlutils.Same,
				Logger:           wrappedLogger,
			},
			LinkHandler: LinkHandler{
				ServerURL: server.URL,
			},
			Logger: wrappedLogger,
		},
	)
}
Output: received link "http://example.com/1" from page "http://example.com" received link "http://example.com/1/1" from page "http://example.com/1" received link "http://example.com/1/2" from page "http://example.com/1" received link "http://example.com/2" from page "http://example.com" received link "http://example.com/2" from page "http://example.com" received link "http://example.com/2/1" from page "http://example.com/2" received link "http://example.com/2/1" from page "http://example.com/2" received link "http://example.com/2/2" from page "http://example.com/2" received link "http://example.com/2/2" from page "http://example.com/2" received link "https://golang.org/" from page "http://example.com"
Example (WithAllFeatures) ¶
package main

import (
	"compress/gzip"
	"context"
	"fmt"
	"html/template"
	"io"
	stdlog "log"
	"net/http"
	"net/http/httptest"
	"os"
	"path"
	"runtime"
	"strings"
	"time"

	"github.com/go-log/log/print"
	crawler "github.com/thewizardplusplus/go-crawler"
	"github.com/thewizardplusplus/go-crawler/checkers"
	"github.com/thewizardplusplus/go-crawler/extractors"
	"github.com/thewizardplusplus/go-crawler/extractors/transformers"
	"github.com/thewizardplusplus/go-crawler/handlers"
	"github.com/thewizardplusplus/go-crawler/models"
	"github.com/thewizardplusplus/go-crawler/registers"
	"github.com/thewizardplusplus/go-crawler/registers/sitemap"
	urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
	htmlselector "github.com/thewizardplusplus/go-html-selector"
)

// LinkHandler prints every link passed to it together with the page
// the link was found on.
type LinkHandler struct {
	// Name is an optional label; when it is not empty, it is printed
	// in square brackets before each message.
	Name string
	// ServerURL is the URL of the test server; it is replaced
	// in the printed links.
	ServerURL string
}

// HandleLink prints the link and its source page to the standard output.
func (handler LinkHandler) HandleLink(
	ctx context.Context,
	link models.SourcedLink,
) {
	var prefix string
	if handler.Name != "" {
		prefix = fmt.Sprintf("[%s] ", handler.Name)
	}

	fmt.Printf(
		"%sreceived link %q from page %q\n",
		prefix,
		handler.replaceServerURL(link.Link),
		handler.replaceServerURL(link.SourceLink),
	)
}

// replace the test server URL for reproducibility of the example
func (handler LinkHandler) replaceServerURL(link string) string {
	return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
}

// RunServer starts a test HTTP server that serves a robots.txt file,
// gzip-compressed Sitemap files and HTML pages with links.
//
// nolint: gocyclo
func RunServer() *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(
		writer http.ResponseWriter,
		request *http.Request,
	) {
		if request.URL.Path == "/robots.txt" {
			sitemapLink :=
				completeLinkWithHost("/sitemap_from_robots_txt.xml", request.Host)
			// each robots.txt directive has to be placed on its own line
			fmt.Fprintf(
				writer,
				"User-agent: go-crawler\nDisallow: /2\nSitemap: %s\n",
				sitemapLink,
			)

			return
		}

		var links []string
		switch request.URL.Path {
		case "/sitemap.xml":
			links = []string{"/1", "/2", "/hidden/1", "/hidden/2"}
		case "/sitemap_from_robots_txt.xml":
			links = []string{"/hidden/3", "/hidden/4"}
		case "/hidden/1/sitemap.xml":
			links = []string{"/hidden/5", "/hidden/6"}
		case "/1/sitemap.xml", "/2/sitemap.xml", "/hidden/sitemap.xml":
			links = []string{}
		}
		for index := range links {
			links[index] = completeLinkWithHost(links[index], request.Host)
		}
		// a non-nil slice (even an empty one) means that a Sitemap file
		// was requested
		if links != nil {
			writer.Header().Set("Content-Encoding", "gzip")

			compressingWriter := gzip.NewWriter(writer)
			defer func() {
				if err := compressingWriter.Close(); err != nil {
					stdlog.Printf("unable to close the compressing writer: %v", err)
				}
			}()

			renderTemplate(compressingWriter, links, `
				<?xml version="1.0" encoding="UTF-8" ?>
				<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
					{{ range $link := . }}
						<url>
							<loc>{{ $link }}</loc>
						</url>
					{{ end }}
				</urlset>
			`)

			return
		}

		switch request.URL.Path {
		case "/":
			links = []string{"/1", "/2", "/2", "https://golang.org/"}
		case "/1":
			links = []string{"/1/1", "/1/2"}
		case "/2":
			links = []string{"/2/1", "/2/2"}
		case "/hidden/1":
			links = []string{"/hidden/1/test"}
		}

		renderTemplate(writer, links, `
			<ul>
				{{ range $link := . }}
					<li>
						<a href="{{ $link }}">{{ $link }}</a>
					</li>
				{{ end }}
			</ul>
		`)
	}))
}

// completeLinkWithHost builds an absolute HTTP link from the host
// and the path of the link.
func completeLinkWithHost(link string, host string) string {
	return "http://" + path.Join(host, link)
}

// renderTemplate parses the text as a template and executes it with the data.
// It panics on an incorrect template text, since that is a mistake
// in the example itself, and logs an execution error.
func renderTemplate(writer io.Writer, data interface{}, text string) {
	parsedTemplate := template.Must(template.New("").Parse(text))
	if err := parsedTemplate.Execute(writer, data); err != nil {
		stdlog.Printf("unable to render the template: %v", err)
	}
}

// main crawls the test server via crawler.CrawlByConcurrentHandler and
// combines delaying, repeating, trimming, Sitemap and robots.txt features
// with duplicate checking and two host-specific handlers.
func main() {
	server := RunServer()
	defer server.Close()

	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
	// wrap the standard logger via the github.com/go-log/log package
	wrappedLogger := print.New(logger)

	// the same register is shared by the Sitemap generator
	// and the robots.txt checker below
	robotsTXTRegister := registers.NewRobotsTXTRegister(http.DefaultClient)
	crawler.CrawlByConcurrentHandler(
		context.Background(),
		crawler.ConcurrencyConfig{
			ConcurrencyFactor: runtime.NumCPU(),
			BufferSize:        1000,
		},
		crawler.ConcurrencyConfig{
			ConcurrencyFactor: runtime.NumCPU(),
			BufferSize:        1000,
		},
		[]string{server.URL},
		crawler.CrawlDependencies{
			LinkExtractor: extractors.NewDelayingExtractor(
				time.Second,
				time.Sleep,
				extractors.ExtractorGroup{
					Name: "main extractors",
					LinkExtractors: []models.LinkExtractor{
						extractors.RepeatingExtractor{
							LinkExtractor: extractors.DefaultExtractor{
								HTTPClient: http.DefaultClient,
								Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
									"a": {"href"},
								}),
								LinkTransformer: transformers.TransformerGroup{
									transformers.TrimmingTransformer{
										TrimLink: urlutils.TrimLink,
									},
									transformers.ResolvingTransformer{
										BaseTagSelection: transformers.SelectFirstBaseTag,
										BaseTagFilters:   transformers.DefaultBaseTagFilters,
										BaseHeaderNames:  urlutils.DefaultBaseHeaderNames,
										Logger:           wrappedLogger,
									},
								},
							},
							RepeatCount:  5,
							RepeatDelay:  time.Second,
							Logger:       wrappedLogger,
							SleepHandler: time.Sleep,
						},
						extractors.RepeatingExtractor{
							LinkExtractor: extractors.TrimmingExtractor{
								TrimLink: urlutils.TrimLink,
								LinkExtractor: extractors.SitemapExtractor{
									SitemapRegister: registers.NewSitemapRegister(
										time.Second,
										extractors.ExtractorGroup{
											Name: "extractors of Sitemap links",
											LinkExtractors: []models.LinkExtractor{
												sitemap.HierarchicalGenerator{
													SanitizeLink: urlutils.SanitizeLink,
													MaximalDepth: -1,
												},
												sitemap.RobotsTXTGenerator{
													RobotsTXTRegister: robotsTXTRegister,
												},
											},
											Logger: wrappedLogger,
										},
										wrappedLogger,
										sitemap.Loader{HTTPClient: http.DefaultClient}.LoadLink,
									),
									Logger: wrappedLogger,
								},
							},
							RepeatCount:  5,
							RepeatDelay:  time.Second,
							Logger:       wrappedLogger,
							SleepHandler: time.Sleep,
						},
					},
					Logger: wrappedLogger,
				},
			),
			LinkChecker: checkers.CheckerGroup{
				// NOTE(review): ComparisonResult is not set here, unlike
				// in the other examples; presumably its zero value means
				// urlutils.Same - confirm
				checkers.HostChecker{
					Logger: wrappedLogger,
				},
				checkers.DuplicateChecker{
					LinkRegister: registers.NewLinkRegister(urlutils.SanitizeLink),
					Logger:       wrappedLogger,
				},
				checkers.RobotsTXTChecker{
					UserAgent:         "go-crawler",
					RobotsTXTRegister: robotsTXTRegister,
					Logger:            wrappedLogger,
				},
			},
			LinkHandler: handlers.CheckedHandler{
				LinkChecker: checkers.DuplicateChecker{
					// don't use here the link register from the duplicate checker above
					LinkRegister: registers.NewLinkRegister(urlutils.SanitizeLink),
					Logger:       wrappedLogger,
				},
				LinkHandler: handlers.HandlerGroup{
					handlers.CheckedHandler{
						LinkChecker: checkers.HostChecker{
							ComparisonResult: urlutils.Same,
							Logger:           wrappedLogger,
						},
						LinkHandler: LinkHandler{
							Name:      "inner",
							ServerURL: server.URL,
						},
					},
					handlers.CheckedHandler{
						LinkChecker: checkers.HostChecker{
							ComparisonResult: urlutils.Different,
							Logger:           wrappedLogger,
						},
						LinkHandler: LinkHandler{
							Name:      "outer",
							ServerURL: server.URL,
						},
					},
				},
			},
			Logger: wrappedLogger,
		},
	)
}
Output: [inner] received link "http://example.com/1" from page "http://example.com" [inner] received link "http://example.com/1/1" from page "http://example.com/1" [inner] received link "http://example.com/1/2" from page "http://example.com/1" [inner] received link "http://example.com/2" from page "http://example.com" [inner] received link "http://example.com/hidden/1" from page "http://example.com" [inner] received link "http://example.com/hidden/1/test" from page "http://example.com/hidden/1" [inner] received link "http://example.com/hidden/2" from page "http://example.com" [inner] received link "http://example.com/hidden/3" from page "http://example.com" [inner] received link "http://example.com/hidden/4" from page "http://example.com" [inner] received link "http://example.com/hidden/5" from page "http://example.com/hidden/1/test" [inner] received link "http://example.com/hidden/6" from page "http://example.com/hidden/1/test" [outer] received link "https://golang.org/" from page "http://example.com"
Example (WithFewHandlers) ¶
package main

import (
	"compress/gzip"
	"context"
	"fmt"
	"html/template"
	"io"
	stdlog "log"
	"net/http"
	"net/http/httptest"
	"os"
	"path"
	"runtime"
	"strings"

	"github.com/go-log/log/print"
	crawler "github.com/thewizardplusplus/go-crawler"
	"github.com/thewizardplusplus/go-crawler/checkers"
	"github.com/thewizardplusplus/go-crawler/extractors"
	"github.com/thewizardplusplus/go-crawler/extractors/transformers"
	"github.com/thewizardplusplus/go-crawler/handlers"
	"github.com/thewizardplusplus/go-crawler/models"
	urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
	htmlselector "github.com/thewizardplusplus/go-html-selector"
)

// LinkHandler prints every link passed to it together with the page
// the link was found on.
type LinkHandler struct {
	// Name is an optional label; when it is not empty, it is printed
	// in square brackets before each message.
	Name string
	// ServerURL is the URL of the test server; it is replaced
	// in the printed links.
	ServerURL string
}

// HandleLink prints the link and its source page to the standard output.
func (handler LinkHandler) HandleLink(
	ctx context.Context,
	link models.SourcedLink,
) {
	var prefix string
	if handler.Name != "" {
		prefix = fmt.Sprintf("[%s] ", handler.Name)
	}

	fmt.Printf(
		"%sreceived link %q from page %q\n",
		prefix,
		handler.replaceServerURL(link.Link),
		handler.replaceServerURL(link.SourceLink),
	)
}

// replace the test server URL for reproducibility of the example
func (handler LinkHandler) replaceServerURL(link string) string {
	return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
}

// RunServer starts a test HTTP server that serves a robots.txt file,
// gzip-compressed Sitemap files and HTML pages with links.
//
// nolint: gocyclo
func RunServer() *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(
		writer http.ResponseWriter,
		request *http.Request,
	) {
		if request.URL.Path == "/robots.txt" {
			sitemapLink :=
				completeLinkWithHost("/sitemap_from_robots_txt.xml", request.Host)
			// each robots.txt directive has to be placed on its own line
			fmt.Fprintf(
				writer,
				"User-agent: go-crawler\nDisallow: /2\nSitemap: %s\n",
				sitemapLink,
			)

			return
		}

		var links []string
		switch request.URL.Path {
		case "/sitemap.xml":
			links = []string{"/1", "/2", "/hidden/1", "/hidden/2"}
		case "/sitemap_from_robots_txt.xml":
			links = []string{"/hidden/3", "/hidden/4"}
		case "/hidden/1/sitemap.xml":
			links = []string{"/hidden/5", "/hidden/6"}
		case "/1/sitemap.xml", "/2/sitemap.xml", "/hidden/sitemap.xml":
			links = []string{}
		}
		for index := range links {
			links[index] = completeLinkWithHost(links[index], request.Host)
		}
		// a non-nil slice (even an empty one) means that a Sitemap file
		// was requested
		if links != nil {
			writer.Header().Set("Content-Encoding", "gzip")

			compressingWriter := gzip.NewWriter(writer)
			defer func() {
				if err := compressingWriter.Close(); err != nil {
					stdlog.Printf("unable to close the compressing writer: %v", err)
				}
			}()

			renderTemplate(compressingWriter, links, `
				<?xml version="1.0" encoding="UTF-8" ?>
				<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
					{{ range $link := . }}
						<url>
							<loc>{{ $link }}</loc>
						</url>
					{{ end }}
				</urlset>
			`)

			return
		}

		switch request.URL.Path {
		case "/":
			links = []string{"/1", "/2", "/2", "https://golang.org/"}
		case "/1":
			links = []string{"/1/1", "/1/2"}
		case "/2":
			links = []string{"/2/1", "/2/2"}
		case "/hidden/1":
			links = []string{"/hidden/1/test"}
		}

		renderTemplate(writer, links, `
			<ul>
				{{ range $link := . }}
					<li>
						<a href="{{ $link }}">{{ $link }}</a>
					</li>
				{{ end }}
			</ul>
		`)
	}))
}

// completeLinkWithHost builds an absolute HTTP link from the host
// and the path of the link.
func completeLinkWithHost(link string, host string) string {
	return "http://" + path.Join(host, link)
}

// renderTemplate parses the text as a template and executes it with the data.
// It panics on an incorrect template text, since that is a mistake
// in the example itself, and logs an execution error.
func renderTemplate(writer io.Writer, data interface{}, text string) {
	parsedTemplate := template.Must(template.New("").Parse(text))
	if err := parsedTemplate.Execute(writer, data); err != nil {
		stdlog.Printf("unable to render the template: %v", err)
	}
}

// main crawls the test server and passes the found links to a group of two
// handlers: the "inner" one is guarded by a same-host check and the "outer"
// one by a different-host check.
func main() {
	server := RunServer()
	defer server.Close()

	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
	// wrap the standard logger via the github.com/go-log/log package
	wrappedLogger := print.New(logger)

	crawler.Crawl(
		context.Background(),
		crawler.ConcurrencyConfig{
			ConcurrencyFactor: runtime.NumCPU(),
			BufferSize:        1000,
		},
		[]string{server.URL},
		crawler.CrawlDependencies{
			LinkExtractor: extractors.DefaultExtractor{
				HTTPClient: http.DefaultClient,
				Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
					"a": {"href"},
				}),
				LinkTransformer: transformers.ResolvingTransformer{
					BaseTagSelection: transformers.SelectFirstBaseTag,
					BaseTagFilters:   transformers.DefaultBaseTagFilters,
					BaseHeaderNames:  urlutils.DefaultBaseHeaderNames,
					Logger:           wrappedLogger,
				},
			},
			LinkChecker: checkers.HostChecker{
				ComparisonResult: urlutils.Same,
				Logger:           wrappedLogger,
			},
			LinkHandler: handlers.HandlerGroup{
				handlers.CheckedHandler{
					LinkChecker: checkers.HostChecker{
						ComparisonResult: urlutils.Same,
						Logger:           wrappedLogger,
					},
					LinkHandler: LinkHandler{
						Name:      "inner",
						ServerURL: server.URL,
					},
				},
				handlers.CheckedHandler{
					LinkChecker: checkers.HostChecker{
						ComparisonResult: urlutils.Different,
						Logger:           wrappedLogger,
					},
					LinkHandler: LinkHandler{
						Name:      "outer",
						ServerURL: server.URL,
					},
				},
			},
			Logger: wrappedLogger,
		},
	)
}
Output: [inner] received link "http://example.com/1" from page "http://example.com" [inner] received link "http://example.com/1/1" from page "http://example.com/1" [inner] received link "http://example.com/1/2" from page "http://example.com/1" [inner] received link "http://example.com/2" from page "http://example.com" [inner] received link "http://example.com/2" from page "http://example.com" [inner] received link "http://example.com/2/1" from page "http://example.com/2" [inner] received link "http://example.com/2/1" from page "http://example.com/2" [inner] received link "http://example.com/2/2" from page "http://example.com/2" [inner] received link "http://example.com/2/2" from page "http://example.com/2" [outer] received link "https://golang.org/" from page "http://example.com"
Example (WithRobotsTXT) ¶
package main

import (
	"compress/gzip"
	"context"
	"fmt"
	"html/template"
	"io"
	stdlog "log"
	"net/http"
	"net/http/httptest"
	"os"
	"path"
	"runtime"
	"strings"

	"github.com/go-log/log/print"
	crawler "github.com/thewizardplusplus/go-crawler"
	"github.com/thewizardplusplus/go-crawler/checkers"
	"github.com/thewizardplusplus/go-crawler/extractors"
	"github.com/thewizardplusplus/go-crawler/extractors/transformers"
	"github.com/thewizardplusplus/go-crawler/models"
	"github.com/thewizardplusplus/go-crawler/registers"
	urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
	htmlselector "github.com/thewizardplusplus/go-html-selector"
)

// LinkHandler prints every link passed to it together with the page
// the link was found on.
type LinkHandler struct {
	// Name is an optional label; when it is not empty, it is printed
	// in square brackets before each message.
	Name string
	// ServerURL is the URL of the test server; it is replaced
	// in the printed links.
	ServerURL string
}

// HandleLink prints the link and its source page to the standard output.
func (handler LinkHandler) HandleLink(
	ctx context.Context,
	link models.SourcedLink,
) {
	var prefix string
	if handler.Name != "" {
		prefix = fmt.Sprintf("[%s] ", handler.Name)
	}

	fmt.Printf(
		"%sreceived link %q from page %q\n",
		prefix,
		handler.replaceServerURL(link.Link),
		handler.replaceServerURL(link.SourceLink),
	)
}

// replace the test server URL for reproducibility of the example
func (handler LinkHandler) replaceServerURL(link string) string {
	return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
}

// RunServer starts a test HTTP server that serves a robots.txt file,
// gzip-compressed Sitemap files and HTML pages with links.
//
// nolint: gocyclo
func RunServer() *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(
		writer http.ResponseWriter,
		request *http.Request,
	) {
		if request.URL.Path == "/robots.txt" {
			sitemapLink :=
				completeLinkWithHost("/sitemap_from_robots_txt.xml", request.Host)
			// each robots.txt directive has to be placed on its own line
			fmt.Fprintf(
				writer,
				"User-agent: go-crawler\nDisallow: /2\nSitemap: %s\n",
				sitemapLink,
			)

			return
		}

		var links []string
		switch request.URL.Path {
		case "/sitemap.xml":
			links = []string{"/1", "/2", "/hidden/1", "/hidden/2"}
		case "/sitemap_from_robots_txt.xml":
			links = []string{"/hidden/3", "/hidden/4"}
		case "/hidden/1/sitemap.xml":
			links = []string{"/hidden/5", "/hidden/6"}
		case "/1/sitemap.xml", "/2/sitemap.xml", "/hidden/sitemap.xml":
			links = []string{}
		}
		for index := range links {
			links[index] = completeLinkWithHost(links[index], request.Host)
		}
		// a non-nil slice (even an empty one) means that a Sitemap file
		// was requested
		if links != nil {
			writer.Header().Set("Content-Encoding", "gzip")

			compressingWriter := gzip.NewWriter(writer)
			defer func() {
				if err := compressingWriter.Close(); err != nil {
					stdlog.Printf("unable to close the compressing writer: %v", err)
				}
			}()

			renderTemplate(compressingWriter, links, `
				<?xml version="1.0" encoding="UTF-8" ?>
				<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
					{{ range $link := . }}
						<url>
							<loc>{{ $link }}</loc>
						</url>
					{{ end }}
				</urlset>
			`)

			return
		}

		switch request.URL.Path {
		case "/":
			links = []string{"/1", "/2", "/2", "https://golang.org/"}
		case "/1":
			links = []string{"/1/1", "/1/2"}
		case "/2":
			links = []string{"/2/1", "/2/2"}
		case "/hidden/1":
			links = []string{"/hidden/1/test"}
		}

		renderTemplate(writer, links, `
			<ul>
				{{ range $link := . }}
					<li>
						<a href="{{ $link }}">{{ $link }}</a>
					</li>
				{{ end }}
			</ul>
		`)
	}))
}

// completeLinkWithHost builds an absolute HTTP link from the host
// and the path of the link.
func completeLinkWithHost(link string, host string) string {
	return "http://" + path.Join(host, link)
}

// renderTemplate parses the text as a template and executes it with the data.
// It panics on an incorrect template text, since that is a mistake
// in the example itself, and logs an execution error.
func renderTemplate(writer io.Writer, data interface{}, text string) {
	parsedTemplate := template.Must(template.New("").Parse(text))
	if err := parsedTemplate.Execute(writer, data); err != nil {
		stdlog.Printf("unable to render the template: %v", err)
	}
}

// main crawls the test server with a checker group that combines
// the same-host check with the robots.txt check for the "go-crawler"
// user agent.
func main() {
	server := RunServer()
	defer server.Close()

	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
	// wrap the standard logger via the github.com/go-log/log package
	wrappedLogger := print.New(logger)

	crawler.Crawl(
		context.Background(),
		crawler.ConcurrencyConfig{
			ConcurrencyFactor: runtime.NumCPU(),
			BufferSize:        1000,
		},
		[]string{server.URL},
		crawler.CrawlDependencies{
			LinkExtractor: extractors.DefaultExtractor{
				HTTPClient: http.DefaultClient,
				Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
					"a": {"href"},
				}),
				LinkTransformer: transformers.ResolvingTransformer{
					BaseTagSelection: transformers.SelectFirstBaseTag,
					BaseTagFilters:   transformers.DefaultBaseTagFilters,
					BaseHeaderNames:  urlutils.DefaultBaseHeaderNames,
					Logger:           wrappedLogger,
				},
			},
			LinkChecker: checkers.CheckerGroup{
				checkers.HostChecker{
					ComparisonResult: urlutils.Same,
					Logger:           wrappedLogger,
				},
				checkers.RobotsTXTChecker{
					UserAgent:         "go-crawler",
					RobotsTXTRegister: registers.NewRobotsTXTRegister(http.DefaultClient),
					Logger:            wrappedLogger,
				},
			},
			LinkHandler: LinkHandler{
				ServerURL: server.URL,
			},
			Logger: wrappedLogger,
		},
	)
}
Output: received link "http://example.com/1" from page "http://example.com" received link "http://example.com/1/1" from page "http://example.com/1" received link "http://example.com/1/2" from page "http://example.com/1" received link "http://example.com/2" from page "http://example.com" received link "http://example.com/2" from page "http://example.com" received link "https://golang.org/" from page "http://example.com"
Example (WithSitemap) ¶
package main

import (
	"compress/gzip"
	"context"
	"fmt"
	"html/template"
	"io"
	stdlog "log"
	"net/http"
	"net/http/httptest"
	"os"
	"path"
	"runtime"
	"strings"
	"time"

	"github.com/go-log/log/print"
	crawler "github.com/thewizardplusplus/go-crawler"
	"github.com/thewizardplusplus/go-crawler/checkers"
	"github.com/thewizardplusplus/go-crawler/extractors"
	"github.com/thewizardplusplus/go-crawler/extractors/transformers"
	"github.com/thewizardplusplus/go-crawler/handlers"
	"github.com/thewizardplusplus/go-crawler/models"
	"github.com/thewizardplusplus/go-crawler/registers"
	"github.com/thewizardplusplus/go-crawler/registers/sitemap"
	urlutils "github.com/thewizardplusplus/go-crawler/url-utils"
	htmlselector "github.com/thewizardplusplus/go-html-selector"
)

// LinkHandler prints every link passed to it together with the page
// the link was found on.
type LinkHandler struct {
	// Name is an optional label; when it is not empty, it is printed
	// in square brackets before each message.
	Name string
	// ServerURL is the URL of the test server; it is replaced
	// in the printed links.
	ServerURL string
}

// HandleLink prints the link and its source page to the standard output.
func (handler LinkHandler) HandleLink(
	ctx context.Context,
	link models.SourcedLink,
) {
	var prefix string
	if handler.Name != "" {
		prefix = fmt.Sprintf("[%s] ", handler.Name)
	}

	fmt.Printf(
		"%sreceived link %q from page %q\n",
		prefix,
		handler.replaceServerURL(link.Link),
		handler.replaceServerURL(link.SourceLink),
	)
}

// replace the test server URL for reproducibility of the example
func (handler LinkHandler) replaceServerURL(link string) string {
	return strings.Replace(link, handler.ServerURL, "http://example.com", -1)
}

// RunServer starts a test HTTP server that serves a robots.txt file,
// gzip-compressed Sitemap files and HTML pages with links.
//
// nolint: gocyclo
func RunServer() *httptest.Server {
	return httptest.NewServer(http.HandlerFunc(func(
		writer http.ResponseWriter,
		request *http.Request,
	) {
		if request.URL.Path == "/robots.txt" {
			sitemapLink :=
				completeLinkWithHost("/sitemap_from_robots_txt.xml", request.Host)
			// each robots.txt directive has to be placed on its own line
			fmt.Fprintf(
				writer,
				"User-agent: go-crawler\nDisallow: /2\nSitemap: %s\n",
				sitemapLink,
			)

			return
		}

		var links []string
		switch request.URL.Path {
		case "/sitemap.xml":
			links = []string{"/1", "/2", "/hidden/1", "/hidden/2"}
		case "/sitemap_from_robots_txt.xml":
			links = []string{"/hidden/3", "/hidden/4"}
		case "/hidden/1/sitemap.xml":
			links = []string{"/hidden/5", "/hidden/6"}
		case "/1/sitemap.xml", "/2/sitemap.xml", "/hidden/sitemap.xml":
			links = []string{}
		}
		for index := range links {
			links[index] = completeLinkWithHost(links[index], request.Host)
		}
		// a non-nil slice (even an empty one) means that a Sitemap file
		// was requested
		if links != nil {
			writer.Header().Set("Content-Encoding", "gzip")

			compressingWriter := gzip.NewWriter(writer)
			defer func() {
				if err := compressingWriter.Close(); err != nil {
					stdlog.Printf("unable to close the compressing writer: %v", err)
				}
			}()

			renderTemplate(compressingWriter, links, `
				<?xml version="1.0" encoding="UTF-8" ?>
				<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
					{{ range $link := . }}
						<url>
							<loc>{{ $link }}</loc>
						</url>
					{{ end }}
				</urlset>
			`)

			return
		}

		switch request.URL.Path {
		case "/":
			links = []string{"/1", "/2", "/2", "https://golang.org/"}
		case "/1":
			links = []string{"/1/1", "/1/2"}
		case "/2":
			links = []string{"/2/1", "/2/2"}
		case "/hidden/1":
			links = []string{"/hidden/1/test"}
		}

		renderTemplate(writer, links, `
			<ul>
				{{ range $link := . }}
					<li>
						<a href="{{ $link }}">{{ $link }}</a>
					</li>
				{{ end }}
			</ul>
		`)
	}))
}

// completeLinkWithHost builds an absolute HTTP link from the host
// and the path of the link.
func completeLinkWithHost(link string, host string) string {
	return "http://" + path.Join(host, link)
}

// renderTemplate parses the text as a template and executes it with the data.
// It panics on an incorrect template text, since that is a mistake
// in the example itself, and logs an execution error.
func renderTemplate(writer io.Writer, data interface{}, text string) {
	parsedTemplate := template.Must(template.New("").Parse(text))
	if err := parsedTemplate.Execute(writer, data); err != nil {
		stdlog.Printf("unable to render the template: %v", err)
	}
}

// main crawls the test server with a group of two extractors (the default one
// and the Sitemap one) and filters duplicates both on checking and
// on handling of links.
func main() {
	server := RunServer()
	defer server.Close()

	logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds)
	// wrap the standard logger via the github.com/go-log/log package
	wrappedLogger := print.New(logger)

	crawler.Crawl(
		context.Background(),
		crawler.ConcurrencyConfig{
			ConcurrencyFactor: runtime.NumCPU(),
			BufferSize:        1000,
		},
		[]string{server.URL},
		crawler.CrawlDependencies{
			LinkExtractor: extractors.ExtractorGroup{
				Name: "main extractors",
				LinkExtractors: []models.LinkExtractor{
					extractors.DefaultExtractor{
						HTTPClient: http.DefaultClient,
						Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{
							"a": {"href"},
						}),
						LinkTransformer: transformers.ResolvingTransformer{
							BaseTagSelection: transformers.SelectFirstBaseTag,
							BaseTagFilters:   transformers.DefaultBaseTagFilters,
							BaseHeaderNames:  urlutils.DefaultBaseHeaderNames,
							Logger:           wrappedLogger,
						},
					},
					extractors.SitemapExtractor{
						SitemapRegister: registers.NewSitemapRegister(
							time.Second,
							extractors.ExtractorGroup{
								Name: "extractors of Sitemap links",
								LinkExtractors: []models.LinkExtractor{
									sitemap.HierarchicalGenerator{
										SanitizeLink: urlutils.SanitizeLink,
										MaximalDepth: -1,
									},
									sitemap.RobotsTXTGenerator{
										RobotsTXTRegister: registers.NewRobotsTXTRegister(
											http.DefaultClient,
										),
									},
								},
								Logger: wrappedLogger,
							},
							wrappedLogger,
							sitemap.Loader{HTTPClient: http.DefaultClient}.LoadLink,
						),
						Logger: wrappedLogger,
					},
				},
				Logger: wrappedLogger,
			},
			LinkChecker: checkers.CheckerGroup{
				checkers.HostChecker{
					ComparisonResult: urlutils.Same,
					Logger:           wrappedLogger,
				},
				checkers.DuplicateChecker{
					LinkRegister: registers.NewLinkRegister(urlutils.SanitizeLink),
					Logger:       wrappedLogger,
				},
			},
			LinkHandler: handlers.CheckedHandler{
				LinkChecker: checkers.DuplicateChecker{
					// don't use here the link register from the duplicate checker above
					LinkRegister: registers.NewLinkRegister(urlutils.SanitizeLink),
					Logger:       wrappedLogger,
				},
				LinkHandler: LinkHandler{
					ServerURL: server.URL,
				},
			},
			Logger: wrappedLogger,
		},
	)
}
Output: received link "http://example.com/1" from page "http://example.com" received link "http://example.com/1/1" from page "http://example.com/1" received link "http://example.com/1/2" from page "http://example.com/1" received link "http://example.com/2" from page "http://example.com" received link "http://example.com/2/1" from page "http://example.com/2" received link "http://example.com/2/2" from page "http://example.com/2" received link "http://example.com/hidden/1" from page "http://example.com" received link "http://example.com/hidden/1/test" from page "http://example.com/hidden/1" received link "http://example.com/hidden/2" from page "http://example.com" received link "http://example.com/hidden/3" from page "http://example.com" received link "http://example.com/hidden/4" from page "http://example.com" received link "http://example.com/hidden/5" from page "http://example.com/hidden/1/test" received link "http://example.com/hidden/6" from page "http://example.com/hidden/1/test" received link "https://golang.org/" from page "http://example.com"
Example (WithoutDuplicatesOnExtracting) ¶
package main import ( "compress/gzip" "context" "fmt" "html/template" "io" stdlog "log" "net/http" "net/http/httptest" "os" "path" "runtime" "strings" crawler "github.com/thewizardplusplus/go-crawler" "github.com/thewizardplusplus/go-crawler/checkers" "github.com/thewizardplusplus/go-crawler/extractors" "github.com/thewizardplusplus/go-crawler/extractors/transformers" "github.com/thewizardplusplus/go-crawler/models" "github.com/thewizardplusplus/go-crawler/registers" urlutils "github.com/thewizardplusplus/go-crawler/url-utils" htmlselector "github.com/thewizardplusplus/go-html-selector" ) type LinkHandler struct { Name string ServerURL string } func (handler LinkHandler) HandleLink( ctx context.Context, link models.SourcedLink, ) { var prefix string if handler.Name != "" { prefix = fmt.Sprintf("[%s] ", handler.Name) } fmt.Printf( "%sreceived link %q from page %q\n", prefix, handler.replaceServerURL(link.Link), handler.replaceServerURL(link.SourceLink), ) } // replace the test server URL for reproducibility of the example func (handler LinkHandler) replaceServerURL(link string) string { return strings.Replace(link, handler.ServerURL, "http://example.com", -1) } // nolint: gocyclo func RunServer() *httptest.Server { return httptest.NewServer(http.HandlerFunc(func( writer http.ResponseWriter, request *http.Request, ) { if request.URL.Path == "/robots.txt" { sitemapLink := completeLinkWithHost("/sitemap_from_robots_txt.xml", request.Host) fmt.Fprintf( writer, ` User-agent: go-crawler Disallow: /2 Sitemap: %s `, sitemapLink, ) return } var links []string switch request.URL.Path { case "/sitemap.xml": links = []string{"/1", "/2", "/hidden/1", "/hidden/2"} case "/sitemap_from_robots_txt.xml": links = []string{"/hidden/3", "/hidden/4"} case "/hidden/1/sitemap.xml": links = []string{"/hidden/5", "/hidden/6"} case "/1/sitemap.xml", "/2/sitemap.xml", "/hidden/sitemap.xml": links = []string{} } for index := range links { links[index] = completeLinkWithHost(links[index], 
request.Host) } if links != nil { writer.Header().Set("Content-Encoding", "gzip") compressingWriter := gzip.NewWriter(writer) defer compressingWriter.Close() renderTemplate(compressingWriter, links, ` <?xml version="1.0" encoding="UTF-8" ?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> {{ range $link := . }} <url> <loc>{{ $link }}</loc> </url> {{ end }} </urlset> `) return } switch request.URL.Path { case "/": links = []string{"/1", "/2", "/2", "https://golang.org/"} case "/1": links = []string{"/1/1", "/1/2"} case "/2": links = []string{"/2/1", "/2/2"} case "/hidden/1": links = []string{"/hidden/1/test"} } renderTemplate(writer, links, ` <ul> {{ range $link := . }} <li> <a href="{{ $link }}">{{ $link }}</a> </li> {{ end }} </ul> `) })) } func completeLinkWithHost(link string, host string) string { return "http://" + path.Join(host, link) } func renderTemplate(writer io.Writer, data interface{}, text string) { template, _ := template.New("").Parse(text) template.Execute(writer, data) } func main() { server := RunServer() defer server.Close() logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds) // wrap the standard logger via the github.com/go-log/log package wrappedLogger := print.New(logger) crawler.Crawl( context.Background(), crawler.ConcurrencyConfig{ ConcurrencyFactor: runtime.NumCPU(), BufferSize: 1000, }, []string{server.URL}, crawler.CrawlDependencies{ LinkExtractor: extractors.DefaultExtractor{ HTTPClient: http.DefaultClient, Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{ "a": {"href"}, }), LinkTransformer: transformers.ResolvingTransformer{ BaseTagSelection: transformers.SelectFirstBaseTag, BaseTagFilters: transformers.DefaultBaseTagFilters, BaseHeaderNames: urlutils.DefaultBaseHeaderNames, Logger: wrappedLogger, }, }, LinkChecker: checkers.CheckerGroup{ checkers.HostChecker{ ComparisonResult: urlutils.Same, Logger: wrappedLogger, }, checkers.DuplicateChecker{ LinkRegister: 
registers.NewLinkRegister(urlutils.SanitizeLink), Logger: wrappedLogger, }, }, LinkHandler: LinkHandler{ ServerURL: server.URL, }, Logger: wrappedLogger, }, ) }
Output: received link "http://example.com/1" from page "http://example.com" received link "http://example.com/1/1" from page "http://example.com/1" received link "http://example.com/1/2" from page "http://example.com/1" received link "http://example.com/2" from page "http://example.com" received link "http://example.com/2" from page "http://example.com" received link "http://example.com/2/1" from page "http://example.com/2" received link "http://example.com/2/2" from page "http://example.com/2" received link "https://golang.org/" from page "http://example.com"
Example (WithoutDuplicatesOnHandling) ¶
package main import ( "compress/gzip" "context" "fmt" "html/template" "io" stdlog "log" "net/http" "net/http/httptest" "os" "path" "runtime" "strings" crawler "github.com/thewizardplusplus/go-crawler" "github.com/thewizardplusplus/go-crawler/checkers" "github.com/thewizardplusplus/go-crawler/extractors" "github.com/thewizardplusplus/go-crawler/extractors/transformers" "github.com/thewizardplusplus/go-crawler/handlers" "github.com/thewizardplusplus/go-crawler/models" "github.com/thewizardplusplus/go-crawler/registers" urlutils "github.com/thewizardplusplus/go-crawler/url-utils" htmlselector "github.com/thewizardplusplus/go-html-selector" ) type LinkHandler struct { Name string ServerURL string } func (handler LinkHandler) HandleLink( ctx context.Context, link models.SourcedLink, ) { var prefix string if handler.Name != "" { prefix = fmt.Sprintf("[%s] ", handler.Name) } fmt.Printf( "%sreceived link %q from page %q\n", prefix, handler.replaceServerURL(link.Link), handler.replaceServerURL(link.SourceLink), ) } // replace the test server URL for reproducibility of the example func (handler LinkHandler) replaceServerURL(link string) string { return strings.Replace(link, handler.ServerURL, "http://example.com", -1) } // nolint: gocyclo func RunServer() *httptest.Server { return httptest.NewServer(http.HandlerFunc(func( writer http.ResponseWriter, request *http.Request, ) { if request.URL.Path == "/robots.txt" { sitemapLink := completeLinkWithHost("/sitemap_from_robots_txt.xml", request.Host) fmt.Fprintf( writer, ` User-agent: go-crawler Disallow: /2 Sitemap: %s `, sitemapLink, ) return } var links []string switch request.URL.Path { case "/sitemap.xml": links = []string{"/1", "/2", "/hidden/1", "/hidden/2"} case "/sitemap_from_robots_txt.xml": links = []string{"/hidden/3", "/hidden/4"} case "/hidden/1/sitemap.xml": links = []string{"/hidden/5", "/hidden/6"} case "/1/sitemap.xml", "/2/sitemap.xml", "/hidden/sitemap.xml": links = []string{} } for index := range links { 
links[index] = completeLinkWithHost(links[index], request.Host) } if links != nil { writer.Header().Set("Content-Encoding", "gzip") compressingWriter := gzip.NewWriter(writer) defer compressingWriter.Close() renderTemplate(compressingWriter, links, ` <?xml version="1.0" encoding="UTF-8" ?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> {{ range $link := . }} <url> <loc>{{ $link }}</loc> </url> {{ end }} </urlset> `) return } switch request.URL.Path { case "/": links = []string{"/1", "/2", "/2", "https://golang.org/"} case "/1": links = []string{"/1/1", "/1/2"} case "/2": links = []string{"/2/1", "/2/2"} case "/hidden/1": links = []string{"/hidden/1/test"} } renderTemplate(writer, links, ` <ul> {{ range $link := . }} <li> <a href="{{ $link }}">{{ $link }}</a> </li> {{ end }} </ul> `) })) } func completeLinkWithHost(link string, host string) string { return "http://" + path.Join(host, link) } func renderTemplate(writer io.Writer, data interface{}, text string) { template, _ := template.New("").Parse(text) template.Execute(writer, data) } func main() { server := RunServer() defer server.Close() logger := stdlog.New(os.Stderr, "", stdlog.LstdFlags|stdlog.Lmicroseconds) // wrap the standard logger via the github.com/go-log/log package wrappedLogger := print.New(logger) crawler.Crawl( context.Background(), crawler.ConcurrencyConfig{ ConcurrencyFactor: runtime.NumCPU(), BufferSize: 1000, }, []string{server.URL}, crawler.CrawlDependencies{ LinkExtractor: extractors.DefaultExtractor{ HTTPClient: http.DefaultClient, Filters: htmlselector.OptimizeFilters(htmlselector.FilterGroup{ "a": {"href"}, }), LinkTransformer: transformers.ResolvingTransformer{ BaseTagSelection: transformers.SelectFirstBaseTag, BaseTagFilters: transformers.DefaultBaseTagFilters, BaseHeaderNames: urlutils.DefaultBaseHeaderNames, Logger: wrappedLogger, }, }, LinkChecker: checkers.CheckerGroup{ checkers.HostChecker{ ComparisonResult: urlutils.Same, Logger: wrappedLogger, }, 
checkers.DuplicateChecker{ LinkRegister: registers.NewLinkRegister(urlutils.SanitizeLink), Logger: wrappedLogger, }, }, LinkHandler: handlers.CheckedHandler{ LinkChecker: checkers.DuplicateChecker{ // don't use here the link register from the duplicate checker above LinkRegister: registers.NewLinkRegister(urlutils.SanitizeLink), Logger: wrappedLogger, }, LinkHandler: LinkHandler{ ServerURL: server.URL, }, }, Logger: wrappedLogger, }, ) }
Output: received link "http://example.com/1" from page "http://example.com" received link "http://example.com/1/1" from page "http://example.com/1" received link "http://example.com/1/2" from page "http://example.com/1" received link "http://example.com/2" from page "http://example.com" received link "http://example.com/2/1" from page "http://example.com/2" received link "http://example.com/2/2" from page "http://example.com/2" received link "https://golang.org/" from page "http://example.com"
func CrawlByConcurrentHandler ¶ added in v1.7.1
func CrawlByConcurrentHandler( ctx context.Context, concurrencyConfig ConcurrencyConfig, handlerConcurrencyConfig ConcurrencyConfig, links []string, dependencies CrawlDependencies, )
CrawlByConcurrentHandler ...
func HandleLink ¶
func HandleLink( ctx context.Context, threadID int, link string, dependencies HandleLinkDependencies, ) []string
HandleLink ...
func HandleLinks ¶
func HandleLinks( ctx context.Context, threadID int, links chan string, dependencies HandleLinkDependencies, )
HandleLinks ...
func HandleLinksConcurrently ¶
func HandleLinksConcurrently( ctx context.Context, concurrencyFactor int, links chan string, dependencies HandleLinkDependencies, )
HandleLinksConcurrently ...
Types ¶
type ConcurrencyConfig ¶ added in v1.7.1
ConcurrencyConfig ...
type CrawlDependencies ¶
// CrawlDependencies groups the link extractor, the link checker, the link
// handler and the logger that Crawl and CrawlByConcurrentHandler receive via
// their dependencies parameter.
type CrawlDependencies struct {
	LinkExtractor models.LinkExtractor
	LinkChecker   models.LinkChecker
	LinkHandler   models.LinkHandler
	Logger        log.Logger
}
CrawlDependencies ...
type HandleLinkDependencies ¶
// HandleLinkDependencies embeds CrawlDependencies and adds a Waiter; it is
// the type of the dependencies parameter of HandleLink, HandleLinks and
// HandleLinksConcurrently.
type HandleLinkDependencies struct {
	CrawlDependencies

	Waiter syncutils.WaitGroup
}
HandleLinkDependencies ...
type LinkChecker ¶
// LinkChecker embeds models.LinkChecker without adding any methods; it is
// declared only so that a mock can be generated for it.
type LinkChecker interface {
	models.LinkChecker
}
LinkChecker ...
It is used only for generating mocks.
type LinkExtractor ¶
// LinkExtractor embeds models.LinkExtractor without adding any methods; it
// is declared only so that a mock can be generated for it.
type LinkExtractor interface {
	models.LinkExtractor
}
LinkExtractor ...
It is used only for generating mocks.
type LinkHandler ¶
// LinkHandler embeds models.LinkHandler without adding any methods; it is
// declared only so that a mock can be generated for it.
type LinkHandler interface {
	models.LinkHandler
}
LinkHandler ...
It is used only for generating mocks.
Click to show internal directories.
Click to hide internal directories.