scraper

package v0.0.0-...-730425d
Published: Dec 3, 2014 License: Apache-2.0 Imports: 19 Imported by: 0

Documentation

Constants

const (
	SelectorTypeList   = "list"
	SelectorTypeDetail = "detail"
	SelectorIdFromUrl  = "IdFromUrl"
	SelectorIdFromCSS  = "IdFromCSS"
	SelectorIdFromLink = "IdFromLink"
)

Variables

var (
	ErrSelectorNotFound = errors.New("Selector not found")
	ErrJobNotFound      = errors.New("Scrap job not found")
)

var (
	ErrNoBaseSelector  = fmt.Errorf("No Base selector for the scraping")
	ErrInvalidSelector = fmt.Errorf("InvalidSelector it can not be Recursive for a Detail type")
)

Functions

func DocumentScrap

func DocumentScrap(jobId string, selector ScrapSelector, doc *goquery.Document, items chan ItemResult)

DocumentScrap runs the scraping logic over the given goquery document and sends the scraped items to the items channel.

func ExtractIdFromURL

func ExtractIdFromURL(u string, pathIndex int, split string, splitIndex int) (string, error)

func GenerateStringKey

func GenerateStringKey(selector ScrapSelector) string

func SanitizeURL

func SanitizeURL(scrapUrl, url string, linkLimit int) string

func SnippetBase

func SnippetBase(selector ScrapSelector) (string, error)

func UseHttpClient

func UseHttpClient(client *http.Client)

UseHttpClient sets a custom http.Client for the scraper; call it before starting any scraping.

func UseHttpClientWithTimeout

func UseHttpClientWithTimeout(timeout time.Duration)

UseHttpClientWithTimeout sets the timeout of the standard http.Client; call it before starting any scraping.

func UseMaxConnections

func UseMaxConnections(max int)

UseMaxConnections limits the maximum number of HTTP connections used.

func UseUserAgent

func UseUserAgent(ua string)

UseUserAgent sets a custom User-Agent header for the scraper.
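
A minimal configuration sketch combining these options. The timeout, connection limit, and User-Agent below are arbitrary example values, not package defaults, and imports are omitted:

func init() {
	// Tune the standard client before any scraping starts.
	scraper.UseHttpClientWithTimeout(30 * time.Second)
	scraper.UseMaxConnections(10)
	scraper.UseUserAgent("my-scraper/1.0")

	// Or supply a fully custom client instead:
	// scraper.UseHttpClient(&http.Client{Timeout: 30 * time.Second})
}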

func WriteJsonToDisk

func WriteJsonToDisk(baseDir string, it model.Item)

Types

type DefaultScrapAndStore

type DefaultScrapAndStore struct {
	// contains filtered or unexported fields
}

DefaultScrapAndStore scraps items and stores them in the configured storages.

func (DefaultScrapAndStore) ScrapAndStore

func (ss DefaultScrapAndStore) ScrapAndStore(selector ScrapSelector) (string, error)

func (DefaultScrapAndStore) Store

func (ss DefaultScrapAndStore) Store(items chan ItemResult)

type DefaultScrapper

type DefaultScrapper struct {
}

func (DefaultScrapper) Scrap

func (d DefaultScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)

DefaultScrapper scraps a web page looking for items; if the selector spans multiple pages, it scraps all the pages concurrently.
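
A usage sketch, assuming the hypothetical URL and CSS expressions below and that the items channel is closed when the job finishes (imports omitted):

sel := scraper.ScrapSelector{
	Url:   "http://example.com/catalog", // hypothetical list page
	Base:  "div.product",                // hypothetical base CSS selector
	Stype: scraper.SelectorTypeList,
	Title: scraper.Selector{Exp: "h2 a"},
	Link:  scraper.Selector{Exp: "h2 a", Attr: "href"},
}

jobId, items, err := scraper.NewScrapper().Scrap(sel)
if err != nil {
	log.Fatal(err)
}
for it := range items {
	if it.Err != nil {
		log.Println(it.Err)
		continue
	}
	fmt.Printf("job %s scraped %+v\n", jobId, it.Item)
}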

type ElasticStorage

type ElasticStorage struct {
	// contains filtered or unexported fields
}

ElasticStorage stores scraped items in Elasticsearch.

func (ElasticStorage) StoreItem

func (sto ElasticStorage) StoreItem(it ItemResult)

type ExtractId

type ExtractId struct {
	UrlPathIndex int    `json:"urlPathIndex"`
	SplitString  string `json:"splitString"`
	SplitIndex   int    `json:"splitIndex"`
}

type FileStorage

type FileStorage struct {
	// contains filtered or unexported fields
}

FileStorage stores scraped items as files on the local disk.

func (FileStorage) StoreItem

func (sto FileStorage) StoreItem(it ItemResult)

type FromReaderScrapper

type FromReaderScrapper struct {
	// contains filtered or unexported fields
}

FromReaderScrapper scraps from an io.Reader instead of an HTTP response; useful for testing (see the sketch below).

func (FromReaderScrapper) Scrap

func (s FromReaderScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)
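
A testing sketch feeding a static HTML fragment through ScrapperFromReader; the markup, selector, and test name are hypothetical, and imports are omitted:

func TestListSelector(t *testing.T) {
	html := `<div class="item"><h2><a href="/p/1">First</a></h2></div>`

	_, items, err := scraper.ScrapperFromReader(strings.NewReader(html)).Scrap(scraper.ScrapSelector{
		Base:  "div.item",
		Stype: scraper.SelectorTypeList,
		Title: scraper.Selector{Exp: "h2 a"},
	})
	if err != nil {
		t.Fatal(err)
	}
	for it := range items {
		t.Logf("scraped %+v", it.Item)
	}
}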

type ItemResult

type ItemResult struct {
	JobId string
	Item  model.Item
	Err   error
}

type RecursiveScrapper

type RecursiveScrapper struct {
	// contains filtered or unexported fields
}

func (RecursiveScrapper) Scrap

func (rs RecursiveScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)

RecursiveScrapper can dig into detail pages and do a recursive scrap. The normal flow is:

1) Scrap a list page, which yields multiple items on that page
2) For each item, follow its link
3) Get the detail selector related to the scraped item from Redis
4) Scrap the detail page
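
A sketch of that flow. How the stored detail selector is matched to a scraped item is handled by the package; the URLs and CSS expressions below are made up for illustration, and imports are omitted:

// Store the detail selector so it can be looked up from Redis later.
detail := scraper.ScrapSelector{
	Url:         "http://example.com/product", // hypothetical detail page
	Base:        "div.detail",
	Stype:       scraper.SelectorTypeDetail,
	Description: scraper.Selector{Exp: "p.description"},
}
if err := scraper.NewRedisScrapdata().SaveSelector(detail); err != nil {
	log.Fatal(err)
}

// Scrap the list page; Recursive tells the scrapper to follow each item link.
list := scraper.ScrapSelector{
	Url:       "http://example.com/catalog", // hypothetical list page
	Base:      "div.product",
	Stype:     scraper.SelectorTypeList,
	Recursive: true,
	Link:      scraper.Selector{Exp: "a", Attr: "href"},
}
jobId, items, err := scraper.NewRecursiveScrapper().Scrap(list)
if err != nil {
	log.Fatal(err)
}
for it := range items {
	log.Printf("job %s: %+v", jobId, it.Item)
}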

func (RecursiveScrapper) ScrapAllRecursiveItems

func (rs RecursiveScrapper) ScrapAllRecursiveItems(jobId string, selector ScrapSelector, inItems chan ItemResult, outItems chan ItemResult, wg *sync.WaitGroup)

type RedisScrapdata

type RedisScrapdata struct {
	// contains filtered or unexported fields
}

func NewRedisScrapdata

func NewRedisScrapdata() *RedisScrapdata

func (*RedisScrapdata) FinishJob

func (r *RedisScrapdata) FinishJob(jobId string) error

func (*RedisScrapdata) SaveSelector

func (r *RedisScrapdata) SaveSelector(s ScrapSelector) error

func (*RedisScrapdata) ScrapJob

func (r *RedisScrapdata) ScrapJob(jobId string) (map[string]interface{}, error)

func (*RedisScrapdata) ScrapLog

func (r *RedisScrapdata) ScrapLog() []string

func (*RedisScrapdata) ScrapLogTrim

func (r *RedisScrapdata) ScrapLogTrim()

func (*RedisScrapdata) ScrapLogWrite

func (r *RedisScrapdata) ScrapLogWrite(line string)

func (*RedisScrapdata) Selector

func (r *RedisScrapdata) Selector(scrapUrl, stype string) (ScrapSelector, error)

func (*RedisScrapdata) StartJob

func (r *RedisScrapdata) StartJob(jobId string, s ScrapSelector) error
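
A small sketch of inspecting job state kept in Redis; the job id is hypothetical and imports are omitted:

rd := scraper.NewRedisScrapdata()

status, err := rd.ScrapJob("job-42") // hypothetical job id returned by Scrap
if err != nil {
	log.Fatal(err)
}
fmt.Println("job status:", status)

for _, line := range rd.ScrapLog() {
	fmt.Println(line)
}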

type RedisStorage

type RedisStorage struct {
	// contains filtered or unexported fields
}

RedisStorage stores scraped items in Redis.

func (RedisStorage) StoreItem

func (sto RedisStorage) StoreItem(it ItemResult)

type ScrapAndStoreItems

type ScrapAndStoreItems interface {
	ScrapAndStore(selector ScrapSelector) (string, error)
}

func NewElasticScrapAndStore

func NewElasticScrapAndStore(index string) ScrapAndStoreItems

func NewScrapAndStore

func NewScrapAndStore(sc ScrapperItems, storages []StorageItems) ScrapAndStoreItems
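
A wiring sketch combining a scrapper with several storages; the selector below is hypothetical and imports are omitted:

ss := scraper.NewScrapAndStore(
	scraper.NewScrapper(),
	[]scraper.StorageItems{
		scraper.NewFileStorage(),
		scraper.NewRedisStorage(),
	},
)

jobId, err := ss.ScrapAndStore(scraper.ScrapSelector{
	Url:   "http://example.com/catalog", // hypothetical list page
	Base:  "div.product",                // hypothetical base CSS selector
	Stype: scraper.SelectorTypeList,
	Title: scraper.Selector{Exp: "h2 a"},
})
if err != nil {
	log.Fatal(err)
}
log.Println("started job", jobId)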

type ScrapSelector

type ScrapSelector struct {
	Url           string `json:"url"`
	Base          string `json:"base"`
	Stype         string `json:"stype,omitempty"`
	Recursive     bool   `json:"recursive,omitempty"`
	PageParam     string `json:"pageParam"`
	PageStart     int    `json:"pageStart"`
	PageIncr      int    `json:"pageIncr"`
	PageLimit     int    `json:"pageLimit"`
	IdFrom        string
	IdPrefix      string
	IdExtractor   ExtractId `json:"IdExtractor"`
	Id            Selector  `json:"id"`
	Link          Selector  `json:"link,omitempty"`
	LinkPathLimit int       `json:"linkPathLimit,omitempty"`
	Image         Selector  `json:"image,omitempty"`
	Title         Selector  `json:"title,omitempty"`
	Description   Selector  `json:"description,omitempty"`
	Price         Selector  `json:"price,omitempty"`
	Categories    Selector  `json:"categories,omitempty"`
	Stars         Selector  `json:"starts,omitempty"`

	// comma separated fixed tags
	ScrapTags string `json:"scrapTags,omitempty"`
}

ScrapSelector is a goquery-based selector describing what to scrap and how items are extracted.
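
A sketch of the JSON form such a selector might take, decoded with encoding/json; the values are illustrative only and imports are omitted:

raw := `{
	"url": "http://example.com/catalog",
	"base": "div.product",
	"stype": "list",
	"pageParam": "page",
	"pageStart": 1,
	"pageIncr": 1,
	"pageLimit": 3,
	"title": {"exp": "h2 a"},
	"link": {"exp": "h2 a", "attr": "href"}
}`

var sel scraper.ScrapSelector
if err := json.Unmarshal([]byte(raw), &sel); err != nil {
	log.Fatal(err)
}
log.Printf("decoded selector: %+v", sel)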

type ScrapperItems

type ScrapperItems interface {
	Scrap(selector ScrapSelector) (string, chan ItemResult, error)
}

ScrapperItems scraps a website looking for items based on the CSS selector, returning the jobId, a channel with the scraped items, or an error.

func NewRecursiveScrapper

func NewRecursiveScrapper() ScrapperItems

func NewScrapper

func NewScrapper() ScrapperItems

func ScrapperFromReader

func ScrapperFromReader(r io.Reader) ScrapperItems

type Selector

type Selector struct {
	Exp  string `json:"exp"`
	Attr string `json:"attr,omitempty"`
}

type StorageItems

type StorageItems interface {
	StoreItem(it ItemResult)
}

func NewElasticStorage

func NewElasticStorage(index string) StorageItems

func NewFileStorage

func NewFileStorage() StorageItems

func NewRedisStorage

func NewRedisStorage() StorageItems
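
Any type with a StoreItem method satisfies StorageItems. A minimal hypothetical implementation that only logs items (imports omitted):

type logStorage struct{}

func (logStorage) StoreItem(it scraper.ItemResult) {
	if it.Err != nil {
		log.Printf("job %s: %v", it.JobId, it.Err)
		return
	}
	log.Printf("job %s: stored %+v", it.JobId, it.Item)
}

Such a storage can be passed to NewScrapAndStore alongside the built-in ones.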
