collymongo

package module
v1.0.10 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 3, 2020 License: MIT Imports: 9 Imported by: 0

README

collymongo

GoDoc Go Report Card

A MongoDB storage backend for the Colly framework.

Example:

package main

import (
	"fmt"
	"log"

	"github.com/gocolly/colly/v2"

	"gitlab.com/MicahParks/collymongo"
)

// main crawls the Go Colly website, following every link it finds, with a
// CollyMongo value installed as the collector's storage backend.
func main() {
	collector := colly.NewCollector()

	// Hand the collector a MongoDB-backed storage implementation.
	backend := &collymongo.CollyMongo{Uri: "mongodb://localhost:27017"}
	if err := collector.SetStorage(backend); err != nil {
		log.Fatalln(err)
	}

	// Follow the target of every anchor that carries an href attribute.
	collector.OnHTML("a[href]", func(element *colly.HTMLElement) {
		err := element.Request.Visit(element.Attr("href"))
		if err == nil {
			return
		}

		switch err.Error() {
		case "URL already visited", "Missing URL":
			// The link was seen before or has no URL, so it is skipped.
		default:
			log.Fatalln(err)
		}
	})

	// Print each URL as it is requested.
	collector.OnRequest(func(request *colly.Request) {
		fmt.Println("Visiting: " + request.URL.String())
	})

	// Kick off the crawl at the Go Colly website.
	if err := collector.Visit("http://go-colly.org/"); err != nil {
		log.Fatalln(err)
	}
}

Documentation

Overview

Package collymongo is a MongoDB storage backend for the Colly framework.

Example:

 package main

import (
	"fmt"
	"log"

	"github.com/gocolly/colly/v2"

	"gitlab.com/MicahParks/collymongo"
)

// main crawls the Go Colly website, following every link it finds, with a
// CollyMongo value installed as the collector's storage backend.
func main() {
	collector := colly.NewCollector()

	// Hand the collector a MongoDB-backed storage implementation.
	backend := &collymongo.CollyMongo{Uri: "mongodb://localhost:27017"}
	if err := collector.SetStorage(backend); err != nil {
		log.Fatalln(err)
	}

	// Follow the target of every anchor that carries an href attribute.
	collector.OnHTML("a[href]", func(element *colly.HTMLElement) {
		err := element.Request.Visit(element.Attr("href"))
		if err == nil {
			return
		}

		switch err.Error() {
		case "URL already visited", "Missing URL":
			// The link was seen before or has no URL, so it is skipped.
		default:
			log.Fatalln(err)
		}
	})

	// Print each URL as it is requested.
	collector.OnRequest(func(request *colly.Request) {
		fmt.Println("Visiting: " + request.URL.String())
	})

	// Kick off the crawl at the Go Colly website.
	if err := collector.Visit("http://go-colly.org/"); err != nil {
		log.Fatalln(err)
	}
}

Index

Constants

View Source
// Package-level defaults.
const (
	// DefaultCookie names the MongoDB collection that holds the hostname to cookie relationships by default.
	DefaultCookie = "cookie"
	// DefaultDatabase names the database that holds Colly's persistent data by default.
	DefaultDatabase = "colly"
	// DefaultRequest names the MongoDB collection that holds request IDs by default.
	DefaultRequest = "request"
	// DefaultWait is how long to wait for a call to MongoDB when the package user did not give a duration.
	DefaultWait = 10 * time.Second
)

Variables

This section is empty.

Functions

This section is empty.

Types

type CollyMongo

// CollyMongo implements the Storage interface from github.com/gocolly/colly/storage, using MongoDB to hold Colly's
// host to cookie relationships and previous request IDs. Only Uri is mandatory; every other field can be left blank.
type CollyMongo struct {

	// ClientOptions are the options used to create the client for MongoDB. Can safely be left blank.
	ClientOptions *options.ClientOptions

	// CookieCol is the name of the collection in MongoDB used to store host to cookie relationships. It defaults to
	// "cookie". Can safely be left blank.
	CookieCol string

	// CookieOpts are the collection options to pass to the mongo package when creating the cookie collection struct.
	// Can safely be left blank.
	CookieOpts []*options.CollectionOptions

	// DatabaseName is the name of the database used to store Colly's host to cookie relationships and the request IDs
	// used to determine whether a request has already been made. It defaults to "colly". Can safely be left blank.
	DatabaseName string

	// DatabaseOpts are the database options to pass to the mongo package when creating a database struct. Can safely be
	// left blank.
	DatabaseOpts []*options.DatabaseOptions

	// LogFatal means that if an error occurs when inserting/finding cookies, collymongo will use Logger.Fatalln to
	// report the error. collymongo will create a logger if none was given. Can safely be left blank.
	LogFatal bool

	// Logger is the *log.Logger that collymongo will print MongoDB errors to if one happens when finding cookies. By
	// default no logging will occur. Can safely be left blank.
	Logger *log.Logger

	// FindCookieOpts are the options to pass to the mongo package when finding one cookie by hostname. Can safely be
	// left blank.
	FindCookieOpts []*options.FindOneOptions

	// FindCtxTime is the amount of time to put in the context timeout that is passed to the mongo package for finding
	// documents. Defaults to 10 seconds. Can safely be left blank.
	FindCtxTime time.Duration

	// FindRequestOpts are the options to pass to the mongo package when finding request IDs that have already been
	// performed. Can safely be left blank.
	FindRequestOpts []*options.FindOneOptions

	// InitCtxTime is the amount of time to put in the context timeout that is passed to the mongo package when
	// initializing the database connection. Defaults to 10 seconds. Can safely be left blank.
	InitCtxTime time.Duration

	// ReplaceCookieOpts are the options to pass to the mongo package when inserting a cookie into the cookie
	// collection. Can safely be left blank.
	ReplaceCookieOpts []*options.FindOneAndReplaceOptions

	// InsertCtxTime is the amount of time to put in the context timeout that is passed to the mongo package when
	// inserting a document. Defaults to 10 seconds. Can safely be left blank.
	InsertCtxTime time.Duration

	// InsertRequestOpts are the options to pass to the mongo package when inserting a request ID into the request
	// collection. Can safely be left blank.
	InsertRequestOpts []*options.InsertOneOptions

	// Uri is the MongoDB URI string. Mandatory.
	Uri string

	// RequestCol is the name of the collection in MongoDB used to store request IDs. It defaults to "request".
	// Can safely be left blank.
	RequestCol string

	// RequestOpts are the options to pass to the mongo package when creating the request collection. Can safely be
	// left blank.
	RequestOpts []*options.CollectionOptions
	// contains filtered or unexported fields
}

CollyMongo implements the Storage interface from github.com/gocolly/colly/storage. It allows for the use of MongoDB as a storage backend for Colly's host to cookie relationships and previous request IDs.

func (*CollyMongo) Cookies

func (m *CollyMongo) Cookies(u *url.URL) (cookies string)

Cookies follows the implementation of the storage.Storage interface. It accepts a URL and returns the cookies associated with the host, if any.

func (*CollyMongo) Init

func (m *CollyMongo) Init() (err error)

Init follows the implementation of the storage.Storage interface. It will create a client and connect to MongoDB. It will also create the database, cookie, and request collections if they do not already exist.

func (*CollyMongo) IsVisited

func (m *CollyMongo) IsVisited(requestID uint64) (visited bool, err error)

IsVisited follows the implementation of the storage.Storage interface. It checks whether a request has already been performed.

func (*CollyMongo) SetCookies

func (m *CollyMongo) SetCookies(u *url.URL, cookies string)

SetCookies follows the implementation of the storage.Storage interface. It takes in cookies and their URL, then stores the URL's hostname and cookies in a document in MongoDB.

func (*CollyMongo) Visited

func (m *CollyMongo) Visited(requestID uint64) (err error)

Visited follows the implementation of the storage.Storage interface. It tells MongoDB the ID of a request that has already been performed.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL