scraper

package v0.0.0-...-730425d
Published: Dec 3, 2014 License: Apache-2.0 Imports: 19 Imported by: 0

Documentation

Constants

const (
	SelectorTypeList   = "list"
	SelectorTypeDetail = "detail"
	SelectorIdFromUrl  = "IdFromUrl"
	SelectorIdFromCSS  = "IdFromCSS"
	SelectorIdFromLink = "IdFromLink"
)

Variables

var (
	ErrSelectorNotFound = errors.New("Selector not found")
	ErrJobNotFound      = errors.New("Scrap job not found")
)

var (
	ErrNoBaseSelector  = fmt.Errorf("No Base selector for the scraping")
	ErrInvalidSelector = fmt.Errorf("InvalidSelector it can not be Recursive for a Detail type")
)

Functions

func DocumentScrap

func DocumentScrap(jobId string, selector ScrapSelector, doc *goquery.Document, items chan ItemResult)

DocumentScrap runs the scraping logic over the given goquery document and sends the scraped items to the items channel.

func ExtractIdFromURL

func ExtractIdFromURL(u string, pathIndex int, split string, splitIndex int) (string, error)

func GenerateStringKey

func GenerateStringKey(selector ScrapSelector) string

func SanitizeURL

func SanitizeURL(scrapUrl, url string, linkLimit int) string

func SnippetBase

func SnippetBase(selector ScrapSelector) (string, error)

func UseHttpClient

func UseHttpClient(client *http.Client)

UseHttpClient sets a custom http.Client for the scraper; call it before starting any scraping.

func UseHttpClientWithTimeout

func UseHttpClientWithTimeout(timeout time.Duration)

UseHttpClientWithTimeout sets the timeout of the standard http.Client; call it before starting any scraping.

func UseMaxConnections

func UseMaxConnections(max int)

UseMaxConnections limits the maximum number of HTTP connections used.

func UseUserAgent

func UseUserAgent(ua string)

UseUserAgent sets a custom User-Agent header for the scraper.
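
A minimal configuration sketch combining these options. The timeout, connection limit, and User-Agent below are arbitrary example values, not package defaults, and imports are omitted:

func init() {
	// Tune the standard client before any scraping starts.
	scraper.UseHttpClientWithTimeout(30 * time.Second)
	scraper.UseMaxConnections(10)
	scraper.UseUserAgent("my-scraper/1.0")

	// Or supply a fully custom client instead:
	// scraper.UseHttpClient(&http.Client{Timeout: 30 * time.Second})
}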

func WriteJsonToDisk

func WriteJsonToDisk(baseDir string, it model.Item)

Types

type DefaultScrapAndStore

type DefaultScrapAndStore struct {
	// contains filtered or unexported fields
}

DefaultScrapAndStore scraps items and stores them in the configured storages.

func (DefaultScrapAndStore) ScrapAndStore

func (ss DefaultScrapAndStore) ScrapAndStore(selector ScrapSelector) (string, error)

func (DefaultScrapAndStore) Store

func (ss DefaultScrapAndStore) Store(items chan ItemResult)

type DefaultScrapper

type DefaultScrapper struct {
}

func (DefaultScrapper) Scrap

func (d DefaultScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)

DefaultScrapper scraps a web page looking for items; if the selector spans multiple pages, it scraps all the pages concurrently.
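
A usage sketch, assuming the hypothetical URL and CSS expressions below and that the items channel is closed when the job finishes (imports omitted):

sel := scraper.ScrapSelector{
	Url:   "http://example.com/catalog", // hypothetical list page
	Base:  "div.product",                // hypothetical base CSS selector
	Stype: scraper.SelectorTypeList,
	Title: scraper.Selector{Exp: "h2 a"},
	Link:  scraper.Selector{Exp: "h2 a", Attr: "href"},
}

jobId, items, err := scraper.NewScrapper().Scrap(sel)
if err != nil {
	log.Fatal(err)
}
for it := range items {
	if it.Err != nil {
		log.Println(it.Err)
		continue
	}
	fmt.Printf("job %s scraped %+v\n", jobId, it.Item)
}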

type ElasticStorage

type ElasticStorage struct {
	// contains filtered or unexported fields
}

ElasticStorage stores scraped items in Elasticsearch.

func (ElasticStorage) StoreItem

func (sto ElasticStorage) StoreItem(it ItemResult)

type ExtractId

type ExtractId struct {
	UrlPathIndex int    `json:"urlPathIndex"`
	SplitString  string `json:"splitString"`
	SplitIndex   int    `json:"splitIndex"`
}

type FileStorage

type FileStorage struct {
	// contains filtered or unexported fields
}

FileStorage stores scraped items as files on the local disk.

func (FileStorage) StoreItem

func (sto FileStorage) StoreItem(it ItemResult)

type FromReaderScrapper

type FromReaderScrapper struct {
	// contains filtered or unexported fields
}

FromReaderScrapper scraps from an io.Reader instead of an HTTP response; useful for testing (see the sketch below).

func (FromReaderScrapper) Scrap

func (s FromReaderScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)
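
A testing sketch feeding a static HTML fragment through ScrapperFromReader; the markup, selector, and test name are hypothetical, and imports are omitted:

func TestListSelector(t *testing.T) {
	html := `<div class="item"><h2><a href="/p/1">First</a></h2></div>`

	_, items, err := scraper.ScrapperFromReader(strings.NewReader(html)).Scrap(scraper.ScrapSelector{
		Base:  "div.item",
		Stype: scraper.SelectorTypeList,
		Title: scraper.Selector{Exp: "h2 a"},
	})
	if err != nil {
		t.Fatal(err)
	}
	for it := range items {
		t.Logf("scraped %+v", it.Item)
	}
}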

type ItemResult

type ItemResult struct {
	JobId string
	Item  model.Item
	Err   error
}

type RecursiveScrapper

type RecursiveScrapper struct {
	// contains filtered or unexported fields
}

func (RecursiveScrapper) Scrap

func (rs RecursiveScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)

RecursiveScrapper can dig into detail pages and do a recursive scrap. The normal flow is:

1) Scrap a list page, which yields multiple items on that page
2) For each item, follow its link
3) Get the detail selector related to the scraped item from Redis
4) Scrap the detail page
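
A sketch of that flow. How the stored detail selector is matched to a scraped item is handled by the package; the URLs and CSS expressions below are made up for illustration, and imports are omitted:

// Store the detail selector so it can be looked up from Redis later.
detail := scraper.ScrapSelector{
	Url:         "http://example.com/product", // hypothetical detail page
	Base:        "div.detail",
	Stype:       scraper.SelectorTypeDetail,
	Description: scraper.Selector{Exp: "p.description"},
}
if err := scraper.NewRedisScrapdata().SaveSelector(detail); err != nil {
	log.Fatal(err)
}

// Scrap the list page; Recursive tells the scrapper to follow each item link.
list := scraper.ScrapSelector{
	Url:       "http://example.com/catalog", // hypothetical list page
	Base:      "div.product",
	Stype:     scraper.SelectorTypeList,
	Recursive: true,
	Link:      scraper.Selector{Exp: "a", Attr: "href"},
}
jobId, items, err := scraper.NewRecursiveScrapper().Scrap(list)
if err != nil {
	log.Fatal(err)
}
for it := range items {
	log.Printf("job %s: %+v", jobId, it.Item)
}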

func (RecursiveScrapper) ScrapAllRecursiveItems

func (rs RecursiveScrapper) ScrapAllRecursiveItems(jobId string, selector ScrapSelector, inItems chan ItemResult, outItems chan ItemResult, wg *sync.WaitGroup)

type RedisScrapdata

type RedisScrapdata struct {
	// contains filtered or unexported fields
}

func NewRedisScrapdata

func NewRedisScrapdata() *RedisScrapdata

func (*RedisScrapdata) FinishJob

func (r *RedisScrapdata) FinishJob(jobId string) error

func (*RedisScrapdata) SaveSelector

func (r *RedisScrapdata) SaveSelector(s ScrapSelector) error

func (*RedisScrapdata) ScrapJob

func (r *RedisScrapdata) ScrapJob(jobId string) (map[string]interface{}, error)

func (*RedisScrapdata) ScrapLog

func (r *RedisScrapdata) ScrapLog() []string

func (*RedisScrapdata) ScrapLogTrim

func (r *RedisScrapdata) ScrapLogTrim()

func (*RedisScrapdata) ScrapLogWrite

func (r *RedisScrapdata) ScrapLogWrite(line string)

func (*RedisScrapdata) Selector

func (r *RedisScrapdata) Selector(scrapUrl, stype string) (ScrapSelector, error)

func (*RedisScrapdata) StartJob

func (r *RedisScrapdata) StartJob(jobId string, s ScrapSelector) error
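
A small sketch of inspecting job state kept in Redis; the job id is hypothetical and imports are omitted:

rd := scraper.NewRedisScrapdata()

status, err := rd.ScrapJob("job-42") // hypothetical job id returned by Scrap
if err != nil {
	log.Fatal(err)
}
fmt.Println("job status:", status)

for _, line := range rd.ScrapLog() {
	fmt.Println(line)
}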

type RedisStorage

type RedisStorage struct {
	// contains filtered or unexported fields
}

RedisStorage stores scraped items in Redis.

func (RedisStorage) StoreItem

func (sto RedisStorage) StoreItem(it ItemResult)

type ScrapAndStoreItems

type ScrapAndStoreItems interface {
	ScrapAndStore(selector ScrapSelector) (string, error)
}

func NewElasticScrapAndStore

func NewElasticScrapAndStore(index string) ScrapAndStoreItems

func NewScrapAndStore

func NewScrapAndStore(sc ScrapperItems, storages []StorageItems) ScrapAndStoreItems
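
A wiring sketch combining a scrapper with several storages; the selector below is hypothetical and imports are omitted:

ss := scraper.NewScrapAndStore(
	scraper.NewScrapper(),
	[]scraper.StorageItems{
		scraper.NewFileStorage(),
		scraper.NewRedisStorage(),
	},
)

jobId, err := ss.ScrapAndStore(scraper.ScrapSelector{
	Url:   "http://example.com/catalog", // hypothetical list page
	Base:  "div.product",                // hypothetical base CSS selector
	Stype: scraper.SelectorTypeList,
	Title: scraper.Selector{Exp: "h2 a"},
})
if err != nil {
	log.Fatal(err)
}
log.Println("started job", jobId)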

type ScrapSelector

type ScrapSelector struct {
	Url           string `json:"url"`
	Base          string `json:"base"`
	Stype         string `json:"stype,omitempty"`
	Recursive     bool   `json:"recursive,omitempty"`
	PageParam     string `json:"pageParam"`
	PageStart     int    `json:"pageStart"`
	PageIncr      int    `json:"pageIncr"`
	PageLimit     int    `json:"pageLimit"`
	IdFrom        string
	IdPrefix      string
	IdExtractor   ExtractId `json:"IdExtractor"`
	Id            Selector  `json:"id"`
	Link          Selector  `json:"link,omitempty"`
	LinkPathLimit int       `json:"linkPathLimit,omitempty"`
	Image         Selector  `json:"image,omitempty"`
	Title         Selector  `json:"title,omitempty"`
	Description   Selector  `json:"description,omitempty"`
	Price         Selector  `json:"price,omitempty"`
	Categories    Selector  `json:"categories,omitempty"`
	Stars         Selector  `json:"starts,omitempty"`

	// comma separated fixed tags
	ScrapTags string `json:"scrapTags,omitempty"`
}

ScrapSelector is a goquery-based selector describing what to scrap and how items are extracted.
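
A sketch of the JSON form such a selector might take, decoded with encoding/json; the values are illustrative only and imports are omitted:

raw := `{
	"url": "http://example.com/catalog",
	"base": "div.product",
	"stype": "list",
	"pageParam": "page",
	"pageStart": 1,
	"pageIncr": 1,
	"pageLimit": 3,
	"title": {"exp": "h2 a"},
	"link": {"exp": "h2 a", "attr": "href"}
}`

var sel scraper.ScrapSelector
if err := json.Unmarshal([]byte(raw), &sel); err != nil {
	log.Fatal(err)
}
log.Printf("decoded selector: %+v", sel)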

type ScrapperItems

type ScrapperItems interface {
	Scrap(selector ScrapSelector) (string, chan ItemResult, error)
}

ScrapperItems scraps a website looking for items based on the CSS selector, returning the jobId, a channel with the scraped items, or an error.

func NewRecursiveScrapper

func NewRecursiveScrapper() ScrapperItems

func NewScrapper

func NewScrapper() ScrapperItems

func ScrapperFromReader

func ScrapperFromReader(r io.Reader) ScrapperItems

type Selector

type Selector struct {
	Exp  string `json:"exp"`
	Attr string `json:"attr,omitempty"`
}

type StorageItems

type StorageItems interface {
	StoreItem(it ItemResult)
}

func NewElasticStorage

func NewElasticStorage(index string) StorageItems

func NewFileStorage

func NewFileStorage() StorageItems

func NewRedisStorage

func NewRedisStorage() StorageItems
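
Any type with a StoreItem method satisfies StorageItems. A minimal hypothetical implementation that only logs items (imports omitted):

type logStorage struct{}

func (logStorage) StoreItem(it scraper.ItemResult) {
	if it.Err != nil {
		log.Printf("job %s: %v", it.JobId, it.Err)
		return
	}
	log.Printf("job %s: stored %+v", it.JobId, it.Item)
}

Such a storage can be passed to NewScrapAndStore alongside the built-in ones.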
