Documentation ¶
Index ¶
- func ReplaceSQL(old, searchPattern string) string
- type Link
- type LinkBatcher
- func (lb *LinkBatcher) AddLink(link *Link) error
- func (lb *LinkBatcher) KillWorkers()
- func (lb *LinkBatcher) ResilientBatchAddLinks(links []*Link) error
- func (lb *LinkBatcher) SpawnWorkers(nWorkers int)
- func (lb *LinkBatcher) WaitUntilEmpty() <-chan bool
- func (lb *LinkBatcher) Worker(endSignal <-chan bool, doneChan chan<- bool)
- type Page
- type PageBatcher
- type Storage
- func (s *Storage) AddLink(link *Link) error
- func (s *Storage) AddPage(page *Page) error
- func (s *Storage) BatchAddLinks(links []*Link) error
- func (s *Storage) BatchAddPages(pages []*Page) error
- func (s *Storage) CheckLinkExists(fromU *url.URL, toU *url.URL) (bool, error)
- func (s *Storage) CheckPageExists(u *url.URL) (bool, error)
- func (s *Storage) Close() error
- func (s *Storage) CountLinks() (int, error)
- func (s *Storage) CountPages() (int, error)
- func (s *Storage) GetLinksFrom(pageHash string, limit int) ([]string, error)
- func (s *Storage) GetLinksTo(pageHash string, limit int) ([]string, error)
- func (s *Storage) GetPage(pageHash string) (*Page, error)
- func (s *Storage) GetPageHashesFromHost(host string, limit int) ([]string, error)
- func (s *Storage) Init() error
- func (s *Storage) KeepPingingOn(d time.Duration) chan<- bool
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ReplaceSQL ¶
ReplaceSQL replaces each occurrence of the search pattern in the given string with the next placeholder in an increasing $n-based sequence (e.g. $1, $2, ...).
Types ¶
type LinkBatcher ¶
type LinkBatcher struct {
// contains filtered or unexported fields
}
LinkBatcher is a simple batching system for recording links to the db
func NewLinkBatcher ¶
func NewLinkBatcher(maxBatch int, s *Storage) *LinkBatcher
NewLinkBatcher is a helper function for constructing a LinkBatcher object
func (*LinkBatcher) AddLink ¶
func (lb *LinkBatcher) AddLink(link *Link) error
AddLink is a lightweight function to just whack that link into the channel
func (*LinkBatcher) KillWorkers ¶
func (lb *LinkBatcher) KillWorkers()
KillWorkers simply kills all previously spawned workers
func (*LinkBatcher) ResilientBatchAddLinks ¶
func (lb *LinkBatcher) ResilientBatchAddLinks(links []*Link) error
ResilientBatchAddLinks shrinks the batch sizes until it eventually works :shrug:
func (*LinkBatcher) SpawnWorkers ¶
func (lb *LinkBatcher) SpawnWorkers(nWorkers int)
SpawnWorkers spawns nWorkers workers. It returns nothing; use KillWorkers to stop them.
func (*LinkBatcher) WaitUntilEmpty ¶
func (lb *LinkBatcher) WaitUntilEmpty() <-chan bool
WaitUntilEmpty returns a channel that receives input once the buffered channel is empty.
func (*LinkBatcher) Worker ¶
func (lb *LinkBatcher) Worker(endSignal <-chan bool, doneChan chan<- bool)
Worker is the worker process for the link batcher. This is straight up nicked from https://blog.drkaka.com/batch-get-from-golangs-buffered-channel-9638573f0c6e
type PageBatcher ¶
PageBatcher is a simple batching system for recording pages to the db
func NewPageBatcher ¶
func NewPageBatcher(maxBatch int, s *Storage) (*PageBatcher, error)
NewPageBatcher is a helper function for constructing a PageBatcher object
func (*PageBatcher) AddPage ¶
func (pb *PageBatcher) AddPage(page *Page) bool
AddPage is a lightweight function to just whack that page into the channel. Returns true if it added the page (hadn't been added previously).
func (*PageBatcher) KillWorkers ¶
func (pb *PageBatcher) KillWorkers()
KillWorkers simply kills all previously spawned workers
func (*PageBatcher) SpawnWorkers ¶
func (pb *PageBatcher) SpawnWorkers(nWorkers int)
SpawnWorkers spawns nWorkers workers. It returns nothing; use KillWorkers to stop them.
func (*PageBatcher) WaitUntilEmpty ¶
func (pb *PageBatcher) WaitUntilEmpty() <-chan bool
WaitUntilEmpty returns a channel that receives input once the buffered channel is empty.
func (*PageBatcher) Worker ¶
func (pb *PageBatcher) Worker(endSignal <-chan bool, doneChan chan<- bool)
Worker is the worker process for the page batcher. This is straight up nicked from https://blog.drkaka.com/batch-get-from-golangs-buffered-channel-9638573f0c6e
type Storage ¶
type Storage struct {
	URI       string
	PageTable string
	LinkTable string
	// contains filtered or unexported fields
}
Storage implements a PostgreSQL storage backend for colly
func NewStorage ¶
NewStorage is a wrapper for easily creating a storage object.
func (*Storage) BatchAddLinks ¶
BatchAddLinks takes a batch of links and inserts them, regardless of whether or not they clash
func (*Storage) BatchAddPages ¶
BatchAddPages takes a batch of pages and inserts them, regardless of whether or not they clash
func (*Storage) CheckLinkExists ¶
CheckLinkExists checks that the link exists in the visited database
func (*Storage) CheckPageExists ¶
CheckPageExists checks that the page exists in the visited database
func (*Storage) CountLinks ¶
CountLinks retrieves an estimate of the number of links scraped.
func (*Storage) CountPages ¶
CountPages retrieves an estimate of the number of pages scraped.
func (*Storage) GetLinksFrom ¶
GetLinksFrom retrieves the links from this page hash.
func (*Storage) GetLinksTo ¶
GetLinksTo retrieves the links to this page hash.
func (*Storage) GetPageHashesFromHost ¶
GetPageHashesFromHost retrieves the page hashes of all pages with this host.