mycelium

package module
v0.0.0-...-fce1600 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 5, 2013 License: MIT Imports: 13 Imported by: 1

README

Mycelium

An experimental parallelized web crawler written in Go.

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func InfiniteCrawl

func InfiniteCrawl(linksIn <-chan string, linksOut chan<- string, wantMore chan<- bool, pages chan<- *Page)

func Rank

func Rank(url string) float64

func RankLength

func RankLength(url string) float64

func RankProtocol

func RankProtocol(rawURL string) float64

Types

type DataStore

type DataStore interface {
	Save(*Page) error
	Listen(<-chan *Page)
}

type Page

type Page struct {
	URL  string
	Body string
	// contains filtered or unexported fields
}

func NewPageFromResponse

func NewPageFromResponse(resp *http.Response) *Page

Creates a new Page using an http.Response.

func StagedCrawl

func StagedCrawl(seedUrls []string, stages int) []*Page

func (*Page) GetLinks

func (self *Page) GetLinks() []string

Extracts all links (<a> tags with href attributes) from a Page.

type RedisDataStore

type RedisDataStore struct {
	// contains filtered or unexported fields
}

func NewDefaultRedisDataStore

func NewDefaultRedisDataStore() *RedisDataStore

func NewRedisDataStore

func NewRedisDataStore(conn redis.Conn) *RedisDataStore

func (*RedisDataStore) Listen

func (self *RedisDataStore) Listen(pages <-chan *Page)

func (*RedisDataStore) Save

func (self *RedisDataStore) Save(page *Page) error

func (*RedisDataStore) Stop

func (self *RedisDataStore) Stop()

type RedisTaskQueue

type RedisTaskQueue struct {
	// contains filtered or unexported fields
}

func NewDefaultRedisTaskQueue

func NewDefaultRedisTaskQueue() *RedisTaskQueue

func NewRedisTaskQueue

func NewRedisTaskQueue(conn redis.Conn) *RedisTaskQueue

func (*RedisTaskQueue) Listen

func (self *RedisTaskQueue) Listen(incoming <-chan string, outgoing chan<- string, wantMore <-chan bool)

func (*RedisTaskQueue) Pop

func (self *RedisTaskQueue) Pop(numTasks int) ([]string, error)

func (*RedisTaskQueue) Push

func (self *RedisTaskQueue) Push(link string) error

func (*RedisTaskQueue) Stop

func (self *RedisTaskQueue) Stop()

type RobotFilter

type RobotFilter struct {
	UserAgent string
	// contains filtered or unexported fields
}

func NewRobotFilter

func NewRobotFilter() *RobotFilter

func (*RobotFilter) Allowed

func (self *RobotFilter) Allowed(rawUrl string) bool

Checks if the given URL is allowed to be crawled using github.com/temoto/robotstxt-go.

func (*RobotFilter) PoliteGet

func (self *RobotFilter) PoliteGet(url string) (*http.Response, error)

Checks if the given URL is allowed to be crawled by robots and retrieves the page using http.Get() if it is.

type TaskQueue

type TaskQueue interface {

	// pushes a task onto the queue
	// in this case, the task is a url to be crawled
	Push(string) error

	// pops tasks from the queue
	Pop(int) ([]string, error)

	Listen(incoming <-chan string, outgoing chan<- string, wantMore <-chan bool)
}

type Worker

type Worker struct {
	// contains filtered or unexported fields
}

func NewWorker

func NewWorker() *Worker

func (*Worker) GetPage

func (self *Worker) GetPage(url string) (*Page, error)

func (*Worker) GetPages

func (self *Worker) GetPages(urls []string, timeout time.Duration) []*Page

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL