Documentation ¶
Index ¶
- func DBimageMarkAsDownloaded(p_image *GFcrawlerPageImage, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
- func DBmongoCreateLink(pLink *GFcrawlerPageOutgoingLink, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
- func DBmongoImageCreate(p_img *GFcrawlerPageImage, pRuntime *GFcrawlerRuntime, ...) (bool, *gf_core.GFerror)
- func DBmongoImageCreateRef(p_img_ref *GFcrawlerPageImageRef, pRuntime *GFcrawlerRuntime, ...) *gf_core.GFerror
- func DBmongoImageSetImageID(pGFimageIDstr gf_images_core.GFimageID, pImage *GFcrawlerPageImage, ...) *gf_core.GFerror
- func DBmongoImageUpdateAfterProcess(pPageImg *GFcrawlerPageImage, pGFimageIDstr gf_images_core.GFimageID, ...) *gf_core.GFerror
- func DBmongoLinkExists(pLinkHashStr string, pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)
- func DBmongoLinkIndexInit(pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
- func DBmongoLinkMarkAsResolved(pLink *GFcrawlerPageOutgoingLink, pFetchIDstr string, ...) *gf_core.GFerror
- func DBmongoLinkMarkImportInProgress(pStatusBool bool, p_unix_time_f float64, pLink *GFcrawlerPageOutgoingLink, ...) *gf_core.GFerror
- func FetchParseResult(pURLfetch *GFcrawlerURLfetch, pCycleRunIDstr string, pCrawlerNameStr string, ...) *gf_core.GFerror
- func FlowsAddExternImage(pCrawlerPageImageIDstr GFcrawlerPageImageID, pFlowsNamesLst []string, ...) *gf_core.GFerror
- func GetAllCrawlers(pCrawlConfigFilePathStr string, pRuntimeSys *gf_core.RuntimeSys) (map[string]GFcrawlerDef, *gf_core.GFerror)
- func IndexQuery(p_term_str string, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
- func LinkAllocInit(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
- func LinkAllocRun(pAlloc *Gf_crawl_link_alloc, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
- func LinksGetOutgoingInPage(pURLfetch *GFcrawlerURLfetch, pCycleRunIDstr string, pCrawlerNameStr string, ...)
- type GFcrawlConfig
- type GFcrawlerDef
- type GFcrawlerError
- type GFcrawlerPageImage
- type GFcrawlerPageImageID
- type GFcrawlerPageImageRef
- type GFcrawlerPageOutgoingLink
- type GFcrawlerRecentImages
- type GFcrawlerRuntime
- type GFcrawlerURLfetch
- type Gf_crawl_link_alloc
- type Gf_crawl_link_alloc_block
- type Gf_index__query_run
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func DBimageMarkAsDownloaded ¶
func DBimageMarkAsDownloaded(p_image *GFcrawlerPageImage, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoCreateLink ¶
func DBmongoCreateLink(pLink *GFcrawlerPageOutgoingLink, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoImageCreate ¶
func DBmongoImageCreate(p_img *GFcrawlerPageImage, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)
func DBmongoImageCreateRef ¶
func DBmongoImageCreateRef(p_img_ref *GFcrawlerPageImageRef, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoImageSetImageID ¶
func DBmongoImageSetImageID(pGFimageIDstr gf_images_core.GFimageID, pImage *GFcrawlerPageImage, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoImageUpdateAfterProcess ¶
func DBmongoImageUpdateAfterProcess(pPageImg *GFcrawlerPageImage, pGFimageIDstr gf_images_core.GFimageID, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
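func DBmongoLinkExists ¶
func DBmongoLinkExists(pLinkHashStr string, pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)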
func DBmongoLinkIndexInit ¶
func DBmongoLinkIndexInit(pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoLinkMarkAsResolved ¶
func DBmongoLinkMarkAsResolved(pLink *GFcrawlerPageOutgoingLink, pFetchIDstr string, pFetchCreationTimeF float64, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoLinkMarkImportInProgress ¶
func DBmongoLinkMarkImportInProgress(pStatusBool bool, p_unix_time_f float64, pLink *GFcrawlerPageOutgoingLink, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func FetchParseResult ¶
func FetchParseResult(pURLfetch *GFcrawlerURLfetch, pCycleRunIDstr string, pCrawlerNameStr string, pImagesLocalDirPathStr string, pMediaDomainStr string, pS3bucketNameStr string, pUserID gf_core.GF_ID, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func FlowsAddExternImage ¶
func FlowsAddExternImage(pCrawlerPageImageIDstr GFcrawlerPageImageID, pFlowsNamesLst []string, pMediaDomainStr string, pCrawledImagesS3bucketNameStr string, pImagesS3bucketNameStr string, pUserID gf_core.GF_ID, pCtx context.Context, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
Adds an image already crawled from an external source URL to some named list of flows in the gf_images app/system. To do this, it adds the flow_name to the gf_image DB record, and then copies the discovered image file from the gf_crawl file_storage (S3/IPFS) to the gf_images service file_storage (S3/IPFS). At the moment this is called directly in the gf_crawl HTTP handler.
func GetAllCrawlers ¶
func GetAllCrawlers(pCrawlConfigFilePathStr string, pRuntimeSys *gf_core.RuntimeSys) (map[string]GFcrawlerDef, *gf_core.GFerror)
func IndexQuery ¶
func IndexQuery(p_term_str string, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func LinkAllocInit ¶
func LinkAllocInit(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func LinkAllocRun ¶
func LinkAllocRun(pAlloc *Gf_crawl_link_alloc, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func LinksGetOutgoingInPage ¶
func LinksGetOutgoingInPage(pURLfetch *GFcrawlerURLfetch, pCycleRunIDstr string, pCrawlerNameStr string, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys)
Types ¶
type GFcrawlConfig ¶
// GFcrawlConfig is the crawl configuration structure. Its single field,
// CrawlersDefsLst, is mapped to the "crawlers-defs" YAML key and holds
// one GFcrawlerDef per configured crawler.
type GFcrawlConfig struct {
CrawlersDefsLst []GFcrawlerDef `yaml:"crawlers-defs"`
}
type GFcrawlerDef ¶
type GFcrawlerError ¶
// GFcrawlerError is the persisted record of a crawler error (t = "crawler_error"): its type, message, free-form data map, the ID of the related gf_core error, the crawler name, and the URL the error relates to (if any). CreateErrorAndEvent returns a value of this type.
type GFcrawlerError struct { Id primitive.ObjectID `bson:"_id,omitempty" json:"-"` IDstr string `bson:"id_str" json:"id_str"` Tstr string `bson:"t" json:"t"` //"crawler_error" CreationUNIXtimeF float64 `bson:"creation_unix_time_f"` TypeStr string `bson:"type_str" json:"type_str"` MsgStr string `bson:"msg_str" json:"msg_str"` DataMap map[string]interface{} `bson:"data_map" json:"data_map"` //if an error is related to a particular URL, it is noted here. GFerrorIDstr string `bson:"gf_error_id_str" json:"gf_error_id_str"` CrawlerNameStr string `bson:"crawler_name_str" json:"crawler_name_str"` URLstr string `bson:"url_str" json:"url_str"` }
func CreateErrorAndEvent ¶
func CreateErrorAndEvent(pErrorTypeStr string, pErrorMsgStr string, pErrorDataMap map[string]interface{}, pErrorURLstr string, pCrawlerNameStr string, pGFerr *gf_core.GFerror, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerError, *gf_core.GFerror)
type GFcrawlerPageImage ¶
// GFcrawlerPageImage is the record of an image discovered by a crawler on a page (t = "crawler_page_img"). It carries the image URL and its domain, the origin page URL and its domain, a de-duplication hash, download/validity/S3/NSFV status flags, and the ID of the corresponding gf_image.
type GFcrawlerPageImage struct { Id primitive.ObjectID `bson:"_id,omitempty"` IDstr GFcrawlerPageImageID `bson:"id_str"` T_str string `bson:"t"` // "crawler_page_img" Creation_unix_time_f float64 `bson:"creation_unix_time_f"` Crawler_name_str string `bson:"crawler_name_str"` // name of the crawler that discovered this image Cycle_run_id_str string `bson:"cycle_run_id_str"` Img_ext_str string `bson:"img_ext_str"` // jpg|gif|png Url_str string `bson:"url_str"` Domain_str string `bson:"domain_str"` // domain of the url_str Origin_page_url_str string `bson:"origin_page_url_str"` // page url from whose html this element was extracted Origin_page_url_domain_str string `bson:"origin_page_url_domain_str"` // domain of the origin_page_url_str // NEW_FIELD!! a lot of records don't have this field // IMPORTANT!! - this is unique for the image src encountered. this way the same data links are not entered in duplicates, // and using the hash the DB can quickly be checked for existence of record Hash_str string `bson:"hash_str"` // IMPORTANT!! - indicates if the image was fetched from the remote server, // and has been stored on S3 and ready for usage by other services. Downloaded_bool bool `bson:"downloaded_bool"` // IMPORTANT!! - the usage was determined to be useful for internal applications, // they're not page elements, or other small unimportant parts. // if it is valid for usage then a gf_image for this image should be // found in the db Valid_for_usage_bool bool `bson:"valid_for_usage_bool"` S3_stored_bool bool `bson:"s3_stored_bool"` // if persisting to s3 succeeded Nsfv_bool bool `bson:"nsfv_bool"` // NSFV (not safe for viewing/nudity) flag for the image GFimageIDstr gf_images_core.GFimageID `bson:"image_id_str"` // id of the gf_image for this corresponding crawler_page_img //FIX!! - should be "gf_image_id_str" }
type GFcrawlerPageImageID ¶
// GFcrawlerPageImageID is the string-based ID type of a crawled page image; it is the type of GFcrawlerPageImage.IDstr and of the first parameter of FlowsAddExternImage.
type GFcrawlerPageImageID string
type GFcrawlerPageImageRef ¶
// GFcrawlerPageImageRef is the record of a single reference to an image from a particular page (t = "crawler_page_img_ref"). It carries the image URL and its domain, the origin page URL and its domain, and a de-duplication hash.
type GFcrawlerPageImageRef struct { Id primitive.ObjectID `bson:"_id,omitempty"` Id_str string `bson:"id_str"` T_str string `bson:"t"` //"crawler_page_img_ref" Creation_unix_time_f float64 `bson:"creation_unix_time_f"` Crawler_name_str string `bson:"crawler_name_str"` //name of the crawler that discovered this image Cycle_run_id_str string `bson:"cycle_run_id_str" json:"cycle_run_id_str"` Url_str string `bson:"url_str"` Domain_str string `bson:"domain_str"` Origin_page_url_str string `bson:"origin_page_url_str"` //page url from whose html this element was extracted Origin_page_url_domain_str string `bson:"origin_page_url_domain_str"` //NEW_FIELD!! a lot of records don't have this field // IMPORTANT!! - this is unique for the image src encountered. this way the same data links are not entered in duplicates, // and using the hash the DB can quickly be checked for existence of record Hash_str string `bson:"hash_str"` }
IMPORTANT!! - reference to an image on a particular page.
The same image, with the same Url_str, can appear on multiple pages; this struct tracks that with one record per reference.
type GFcrawlerPageOutgoingLink ¶
// GFcrawlerPageOutgoingLink is the record of an outgoing link discovered by a crawler in a page (t = "crawler_page_outgoing_link"). It carries the link href and its domain, the origin page URL and its domain, a hash, crawl/fetch status, import-in-progress tracking, and the last error recorded for the link.
type GFcrawlerPageOutgoingLink struct { Id primitive.ObjectID `bson:"_id,omitempty"` IDstr string `bson:"id_str"` T_str string `bson:"t"` // "crawler_page_outgoing_link" CreationUNIXtimeF float64 `bson:"creation_unix_time_f"` Crawler_name_str string `bson:"crawler_name_str"` // name of the crawler that discovered this link Cycle_run_id_str string `bson:"cycle_run_id_str"` A_href_str string `bson:"a_href_str"` Domain_str string `bson:"domain_str"` Origin_url_str string `bson:"origin_url_str"` // page url from whose html this element was extracted Origin_url_domain_str string `bson:"origin_url_domain_str"` // IMPORTANT!! - this is a hash of the . it Hash_str string `bson:"hash_str"` Valid_for_crawl_bool bool `bson:"valid_for_crawl_bool"` // if the link should be crawled, or if it should be ignored Images_processed_bool bool `bson:"images_processed_bool"` // if all the images in the page have been downloaded/transformed/stored-in-s3 Fetched_bool bool `bson:"fetched_bool"` // indicator if the link has been fetched (its html downloaded and parsed) Fetch_last_id_str string `bson:"fetch_last_id_str"` Fetch_last_time_f float64 `bson:"fetch_last_time_f"` //------------------- // IMPORTANT!! - indicates if this link is currently being processed by some // crawler master/worker in the cluster Import__in_progress_bool bool `bson:"import__in_progress_bool"` Import__start_time_f float64 `bson:"import__start_time_f"` // when has the "in_progress" flag been set. for detecting interrupted/incomplete imports //------------------- // IMPORTANT!! - last error that occurred/interrupted processing of this link Error_type_str string `bson:"error_type_str,omitempty"` Error_id_str string `bson:"error_id_str,omitempty"` }
func DBmongoGetLink ¶
func DBmongoGetLink(pLink_id_str string, pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)
func DBmongoLinkGetUnresolved ¶
func DBmongoLinkGetUnresolved(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)
type GFcrawlerRecentImages ¶
// GFcrawlerRecentImages is the element type returned by DBmongoImagesGetRecent. Domain_str maps to the BSON "_id" field; the remaining fields hold an image count and lists of page-image IDs, creation times, URLs, NSFV flags and origin page URLs. NOTE(review): presumably one record per domain with index-aligned lists — confirm against the query.
type GFcrawlerRecentImages struct { Domain_str string `bson:"_id" json:"domain_str"` Imgs_count_int int `bson:"imgs_count_int" json:"imgs_count_int"` Crawler_page_img_ids_lst []string `bson:"crawler_page_img_ids_lst" json:"crawler_page_img_ids_lst"` Creation_times_lst []float64 `bson:"creation_times_lst" json:"creation_times_lst"` Urls_lst []string `bson:"urls_lst" json:"urls_lst"` Nsfv_lst []bool `bson:"nsfv_lst" json:"nsfv_lst"` Origin_page_urls_lst []string `bson:"origin_page_urls_lst" json:"origin_page_urls_lst"` }
func DBmongoImagesGetRecent ¶
func DBmongoImagesGetRecent(pRuntimeSys *gf_core.RuntimeSys) ([]GFcrawlerRecentImages, *gf_core.GFerror)
type GFcrawlerRuntime ¶
// GFcrawlerRuntime bundles the crawler-specific runtime dependencies passed to most functions in this package: an events context, an Elasticsearch client, S3 info, a flag selecting the new images storage engine, and the path of the Python plugins directory.
type GFcrawlerRuntime struct { EventsCtx *gf_events.EventsCtx EsearchClient *elastic.Client S3info *gf_aws.GFs3Info ImagesUseNewStorageEngineBool bool PluginsPyDirPathStr string }
func T__init ¶
func T__init() (*gf_core.RuntimeSys, *GFcrawlerRuntime)
type GFcrawlerURLfetch ¶
// GFcrawlerURLfetch is the record of a single URL fetch (t = "crawler_url_fetch"): the URL and its domain, start/end times (excluded from JSON), the full page text, and the last error recorded for the fetch. FetchURL returns a value of this type.
type GFcrawlerURLfetch struct { Id primitive.ObjectID `bson:"_id,omitempty"` Id_str string `bson:"id_str" json:"id_str"` T_str string `bson:"t" json:"t"` // "crawler_url_fetch" Creation_unix_time_f float64 `bson:"creation_unix_time_f" json:"creation_unix_time_f"` Cycle_run_id_str string `bson:"cycle_run_id_str" json:"cycle_run_id_str"` Domain_str string `bson:"domain_str" json:"domain_str"` Url_str string `bson:"url_str" json:"url_str"` Start_time_f float64 `bson:"start_time_f" json:"-"` End_time_f float64 `bson:"end_time_f" json:"-"` Page_text_str string `bson:"page_text_str" json:"page_text_str"` // full text of the page html - indexed in ES //------------------- // IMPORTANT!! - last error that occurred/interrupted processing of this link Error_type_str string `bson:"error_type_str,omitempty"` Error_id_str string `bson:"error_id_str,omitempty"` // contains filtered or unexported fields }
func FetchURL ¶
func FetchURL(pURLstr string, pLink *GFcrawlerPageOutgoingLink, pCycleRunIDstr string, pCrawlerNameStr string, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerURLfetch, string, *gf_core.GFerror)
type Gf_crawl_link_alloc ¶
// Gf_crawl_link_alloc is the record of a link allocator for a named crawler (t = "crawler_link_alloc"): its block size, sleep time in seconds, last run time, and the ID of its current link block. DBmongoLinkAllocCreate returns a value of this type.
// NOTE(review): Last_run_unix_time_f is the only field without a bson tag — confirm whether that is intentional.
type Gf_crawl_link_alloc struct { Id primitive.ObjectID `bson:"_id,omitempty"` Id_str string `bson:"id_str"` T_str string `bson:"t"` // "crawler_link_alloc" Creation_unix_time_f float64 `bson:"creation_unix_time_f"` Crawler_name_str string `bson:"crawler_name_str"` Block_size_int int `bson:"block_size_int"` Sleep_time_sec_int int `bson:"sleep_time_sec_int"` Last_run_unix_time_f float64 Current_link_block_id_str string `bson:"current_link_block_id_str"` }
func DBmongoLinkAllocCreate ¶
func DBmongoLinkAllocCreate(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) (*Gf_crawl_link_alloc, *gf_core.GFerror)
type Gf_crawl_link_alloc_block ¶
// Gf_crawl_link_alloc_block is the record of one block of links (t = "crawler_link_alloc_block"): the ID of the allocator it belongs to and the list of unresolved link IDs it contains. DBmongoLinkAllocCreateLinksBlock returns a value of this type.
type Gf_crawl_link_alloc_block struct { Id primitive.ObjectID `bson:"_id,omitempty"` Id_str string `bson:"id_str"` Creation_unix_time_f float64 `bson:"creation_unix_time_f"` T_str string `bson:"t"` // "crawler_link_alloc_block" Allocator_id_str string `bson:"allocator_id_str"` Unresolved_links_ids_lst []string `bson:"unresolved_links_ids_lst"` }
func DBmongoLinkAllocCreateLinksBlock ¶
func DBmongoLinkAllocCreateLinksBlock(p_alloc_id_str string, pCrawlerNameStr string, pBlockSizeInt int, pRuntimeSys *gf_core.RuntimeSys) (*Gf_crawl_link_alloc_block, *gf_core.GFerror)
type Gf_index__query_run ¶
// Gf_index__query_run is the record of one index query run (t = "index__query_run"): run time in milliseconds, total hit count, the hits' scores and URLs, and the maximum hit score.
type Gf_index__query_run struct { Id primitive.ObjectID `bson:"_id,omitempty"` Id_str string `bson:"id_str"` T_str string `bson:"t"` //"index__query_run" Run_time_milisec_int int64 `bson:"run_time_milisec_int"` Hits_total_int int64 `bson:"hits_total_int"` Hits_scores_lst []float64 `bson:"hits_scores_lst"` Hits_score_max_f float64 `bson:"hits_score_max_f"` Hits_urls_lst []string `bson:"hits_urls_lst"` }
Source Files ¶
- gf_crawl_config.go
- gf_crawl_core.go
- gf_crawl_domains.go
- gf_crawl_error.go
- gf_crawl_fetch.go
- gf_crawl_images_adt.go
- gf_crawl_images_db_mongo.go
- gf_crawl_images_download.go
- gf_crawl_images_flows.go
- gf_crawl_images_pipeline.go
- gf_crawl_images_process.go
- gf_crawl_images_s3.go
- gf_crawl_images_utils.go
- gf_crawl_index.go
- gf_crawl_links.go
- gf_crawl_links_allocator_db_mongo.go
- gf_crawl_links_db_mongo.go
- t__utils.go
Click to show internal directories.
Click to hide internal directories.