Documentation ¶
Index ¶
- Constants
- Variables
- func DocumentScrap(jobId string, selector ScrapSelector, doc *goquery.Document, ...)
- func ExtractIdFromURL(u string, pathIndex int, split string, splitIndex int) (string, error)
- func GenerateStringKey(selector ScrapSelector) string
- func SanitizeURL(scrapUrl, url string, linkLimit int) string
- func SnippetBase(selector ScrapSelector) (string, error)
- func UseHttpClient(client *http.Client)
- func UseHttpClientWithTimeout(timeout time.Duration)
- func UseMaxConnections(max int)
- func UseUserAgent(ua string)
- func WriteJsonToDisk(baseDir string, it model.Item)
- type DefaultScrapAndStore
- type DefaultScrapper
- type ElasticStorage
- type ExtractId
- type FileStorage
- type FromReaderScrapper
- type ItemResult
- type RecursiveScrapper
- type RedisScrapdata
- func (r *RedisScrapdata) FinishJob(jobId string) error
- func (r *RedisScrapdata) SaveSelector(s ScrapSelector) error
- func (r *RedisScrapdata) ScrapJob(jobId string) (map[string]interface{}, error)
- func (r *RedisScrapdata) ScrapLog() []string
- func (r *RedisScrapdata) ScrapLogTrim()
- func (r *RedisScrapdata) ScrapLogWrite(line string)
- func (r *RedisScrapdata) Selector(scrapUrl, stype string) (ScrapSelector, error)
- func (r *RedisScrapdata) StartJob(jobId string, s ScrapSelector) error
- type RedisStorage
- type ScrapAndStoreItems
- type ScrapSelector
- type ScrapperItems
- type Selector
- type StorageItems
Constants ¶
const ( SelectorTypeList = "list" SelectorTypeDetail = "detail" SelectorIdFromUrl = "IdFromUrl" SelectorIdFromCSS = "IdFromCSS" SelectorIdFromLink = "IdFromLink" )
Variables ¶
var ( ErrSelectorNotFound = errors.New("Selector not found") ErrJobNotFound = errors.New("Scrap job not found") )
var ( ErrNoBaseSelector = fmt.Errorf("No Base selector for the scraping") ErrInvalidSelector = fmt.Errorf("InvalidSelector it can not be Recursive for a Detail type") )
Functions ¶
func DocumentScrap ¶
func DocumentScrap(jobId string, selector ScrapSelector, doc *goquery.Document, items chan ItemResult)
Scraping logic from the document
func ExtractIdFromURL ¶
func GenerateStringKey ¶
func GenerateStringKey(selector ScrapSelector) string
func SanitizeURL ¶
func SnippetBase ¶
func SnippetBase(selector ScrapSelector) (string, error)
func UseHttpClient ¶
You can use a custom http.Client by calling this function before doing any scraping
func UseHttpClientWithTimeout ¶
You can set the timeout for the standard http.Client before doing any scraping
func UseMaxConnections ¶
func UseMaxConnections(max int)
Limit the maximum number of HTTP connections used
func WriteJsonToDisk ¶
Types ¶
type DefaultScrapAndStore ¶
type DefaultScrapAndStore struct {
// contains filtered or unexported fields
}
scrap and store
func (DefaultScrapAndStore) ScrapAndStore ¶
func (ss DefaultScrapAndStore) ScrapAndStore(selector ScrapSelector) (string, error)
func (DefaultScrapAndStore) Store ¶
func (ss DefaultScrapAndStore) Store(items chan ItemResult)
type DefaultScrapper ¶
type DefaultScrapper struct { }
func (DefaultScrapper) Scrap ¶
func (d DefaultScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)
DefaultScrapper scrapes a web page looking for items; if the selector has multiple pages, it scrapes all the pages concurrently
type ElasticStorage ¶
type ElasticStorage struct {
// contains filtered or unexported fields
}
Elastic Search storage
func (ElasticStorage) StoreItem ¶
func (sto ElasticStorage) StoreItem(it ItemResult)
type FileStorage ¶
type FileStorage struct {
// contains filtered or unexported fields
}
Local Files Storage
func (FileStorage) StoreItem ¶
func (sto FileStorage) StoreItem(it ItemResult)
type FromReaderScrapper ¶
type FromReaderScrapper struct {
// contains filtered or unexported fields
}
Scrapper from a reader, useful for testing
func (FromReaderScrapper) Scrap ¶
func (s FromReaderScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)
type RecursiveScrapper ¶
type RecursiveScrapper struct {
// contains filtered or unexported fields
}
func (RecursiveScrapper) Scrap ¶
func (rs RecursiveScrapper) Scrap(selector ScrapSelector) (string, chan ItemResult, error)
RecursiveScrapper can dig into detail pages and do a recursive scrape. The normal flow is: 1) Scrape a list page -> multiple items in that page. 2) For each item, follow the link. 3) Get the detail selector from Redis related to the scraped item. 4) Scrape the detail page.
func (RecursiveScrapper) ScrapAllRecursiveItems ¶
func (rs RecursiveScrapper) ScrapAllRecursiveItems(jobId string, selector ScrapSelector, inItems chan ItemResult, outItems chan ItemResult, wg *sync.WaitGroup)
type RedisScrapdata ¶
type RedisScrapdata struct {
// contains filtered or unexported fields
}
func NewRedisScrapdata ¶
func NewRedisScrapdata() *RedisScrapdata
func (*RedisScrapdata) FinishJob ¶
func (r *RedisScrapdata) FinishJob(jobId string) error
func (*RedisScrapdata) SaveSelector ¶
func (r *RedisScrapdata) SaveSelector(s ScrapSelector) error
func (*RedisScrapdata) ScrapJob ¶
func (r *RedisScrapdata) ScrapJob(jobId string) (map[string]interface{}, error)
func (*RedisScrapdata) ScrapLog ¶
func (r *RedisScrapdata) ScrapLog() []string
func (*RedisScrapdata) ScrapLogTrim ¶
func (r *RedisScrapdata) ScrapLogTrim()
func (*RedisScrapdata) ScrapLogWrite ¶
func (r *RedisScrapdata) ScrapLogWrite(line string)
func (*RedisScrapdata) Selector ¶
func (r *RedisScrapdata) Selector(scrapUrl, stype string) (ScrapSelector, error)
func (*RedisScrapdata) StartJob ¶
func (r *RedisScrapdata) StartJob(jobId string, s ScrapSelector) error
type RedisStorage ¶
type RedisStorage struct {
// contains filtered or unexported fields
}
Redis Storage
func (RedisStorage) StoreItem ¶
func (sto RedisStorage) StoreItem(it ItemResult)
type ScrapAndStoreItems ¶
type ScrapAndStoreItems interface {
ScrapAndStore(selector ScrapSelector) (string, error)
}
func NewElasticScrapAndStore ¶
func NewElasticScrapAndStore(index string) ScrapAndStoreItems
func NewScrapAndStore ¶
func NewScrapAndStore(sc ScrapperItems, storages []StorageItems) ScrapAndStoreItems
type ScrapSelector ¶
type ScrapSelector struct { Url string `json:"url"` Base string `json:"base"` Stype string `json:"stype,omitempty"` Recursive bool `json:"recursive,omitempty"` PageParam string `json:"pageParam"` PageStart int `json:"pageStart"` PageIncr int `json:"pageIncr"` PageLimit int `json:"pageLimit"` IdFrom string IdPrefix string IdExtractor ExtractId `json:"IdExtractor"` Id Selector `json:"id"` Link Selector `json:"link",omitempty` LinkPathLimit int `json:"linkPathLimit",omitempty` Image Selector `json:"image,omitempty"` Title Selector `json:"title,omitempty"` Description Selector `json:"description,omitempty"` Price Selector `json:"price,omitempty"` Categories Selector `json:"categories,omitempty"` Stars Selector `json:"starts,omitempty"` // comma separated fixed tags ScrapTags string `json:"scrapTags,omitempty"` }
GoQuery Selector
type ScrapperItems ¶
type ScrapperItems interface {
Scrap(selector ScrapSelector) (string, chan ItemResult, error)
}
Scrape a website looking for items based on the CSS selector and return the jobId, a channel with the Items scraped, or an error
func NewRecursiveScrapper ¶
func NewRecursiveScrapper() ScrapperItems
func NewScrapper ¶
func NewScrapper() ScrapperItems
func ScrapperFromReader ¶
func ScrapperFromReader(r io.Reader) ScrapperItems
type StorageItems ¶
type StorageItems interface {
StoreItem(it ItemResult)
}
func NewElasticStorage ¶
func NewElasticStorage(index string) StorageItems
func NewFileStorage ¶
func NewFileStorage() StorageItems
func NewRedisStorage ¶
func NewRedisStorage() StorageItems