gf_crawl_core

package
v0.0.0-...-a1b0e2b Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 26, 2024 License: GPL-2.0 Imports: 30 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DBimageMarkAsDownloaded

func DBimageMarkAsDownloaded(p_image *GFcrawlerPageImage, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoCreateLink(pLink *GFcrawlerPageOutgoingLink,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoImageCreate

func DBmongoImageCreate(p_img *GFcrawlerPageImage,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)

func DBmongoImageCreateRef

func DBmongoImageCreateRef(p_img_ref *GFcrawlerPageImageRef,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoImageSetImageID

func DBmongoImageSetImageID(pGFimageIDstr gf_images_core.GFimageID,
	pImage *GFcrawlerPageImage,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoImageUpdateAfterProcess

func DBmongoImageUpdateAfterProcess(pPageImg *GFcrawlerPageImage,
	pGFimageIDstr gf_images_core.GFimageID,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoLinkExists

func DBmongoLinkExists(pLinkHashStr string,
	pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)

func DBmongoLinkIndexInit

func DBmongoLinkIndexInit(pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoLinkMarkAsResolved

func DBmongoLinkMarkAsResolved(pLink *GFcrawlerPageOutgoingLink,
	pFetchIDstr string,
	pFetchCreationTimeF float64,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoLinkMarkImportInProgress

func DBmongoLinkMarkImportInProgress(pStatusBool bool,
	p_unix_time_f float64,
	pLink *GFcrawlerPageOutgoingLink,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func FetchParseResult

func FetchParseResult(pURLfetch *GFcrawlerURLfetch,
	pCycleRunIDstr string,
	pCrawlerNameStr string,
	pImagesLocalDirPathStr string,

	pMediaDomainStr string,
	pS3bucketNameStr string,
	pUserID gf_core.GF_ID,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func FlowsAddExternImage

func FlowsAddExternImage(pCrawlerPageImageIDstr GFcrawlerPageImageID,
	pFlowsNamesLst []string,
	pMediaDomainStr string,
	pCrawledImagesS3bucketNameStr string,
	pImagesS3bucketNameStr string,
	pUserID gf_core.GF_ID,
	pCtx context.Context,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

Adds an image already crawled from an external source URL to some named list of flows in the gf_images app/system. To do this, it adds the flow_name to the gf_image DB record and then copies the discovered image file from the gf_crawlers file_storage (S3/IPFS) to the gf_images service file_storage (S3/IPFS). At the moment this is called directly in the gf_crawl HTTP handler.

func GetAllCrawlers

func GetAllCrawlers(pCrawlConfigFilePathStr string,
	pRuntimeSys *gf_core.RuntimeSys) (map[string]GFcrawlerDef, *gf_core.GFerror)

func IndexQuery

func IndexQuery(p_term_str string,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func LinkAllocInit

func LinkAllocInit(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func LinkAllocRun

func LinkAllocRun(pAlloc *Gf_crawl_link_alloc,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func LinksGetOutgoingInPage

func LinksGetOutgoingInPage(pURLfetch *GFcrawlerURLfetch,
	pCycleRunIDstr string,
	pCrawlerNameStr string,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys)

Types

type GFcrawlConfig

// GFcrawlConfig is the top-level crawl configuration as decoded from YAML.
type GFcrawlConfig struct {
	// list of crawler definitions, read from the "crawlers-defs" YAML key
	CrawlersDefsLst []GFcrawlerDef `yaml:"crawlers-defs"`
}

type GFcrawlerDef

// GFcrawlerDef is a single crawler definition as decoded from YAML
// (an element of GFcrawlConfig.CrawlersDefsLst).
type GFcrawlerDef struct {
	// name of the crawler ("name" YAML key)
	NameStr     string `yaml:"name"`
	// value of the "start-url" YAML key; presumably the URL the crawler starts from - TODO confirm
	StartURLstr string `yaml:"start-url"`
}

type GFcrawlerError

// GFcrawlerError is the DB (bson) and JSON representation of an error
// recorded for a crawler. Id is excluded from JSON output (json:"-").
// NOTE(review): CreationUNIXtimeF has no json tag, unlike the other fields,
// so its JSON key follows the encoder default - confirm this is intended.
type GFcrawlerError struct {
	Id                primitive.ObjectID     `bson:"_id,omitempty"    json:"-"`
	IDstr             string                 `bson:"id_str"           json:"id_str"`
	Tstr              string                 `bson:"t"                json:"t"` // "crawler_error"
	CreationUNIXtimeF float64                `bson:"creation_unix_time_f"`
	TypeStr           string                 `bson:"type_str"         json:"type_str"`
	MsgStr            string                 `bson:"msg_str"          json:"msg_str"`
	DataMap           map[string]interface{} `bson:"data_map"         json:"data_map"` // if an error is related to a particular URL, it is noted here.
	GFerrorIDstr      string                 `bson:"gf_error_id_str"  json:"gf_error_id_str"`
	CrawlerNameStr    string                 `bson:"crawler_name_str" json:"crawler_name_str"`
	URLstr            string                 `bson:"url_str"          json:"url_str"`
}

func CreateErrorAndEvent

func CreateErrorAndEvent(pErrorTypeStr string,
	pErrorMsgStr string,
	pErrorDataMap map[string]interface{},
	pErrorURLstr string,
	pCrawlerNameStr string,
	pGFerr *gf_core.GFerror,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerError, *gf_core.GFerror)

type GFcrawlerPageImage

// GFcrawlerPageImage is the DB (bson) record of an image discovered by a crawler
// in a page, together with flags tracking its download/storage/usage state.
type GFcrawlerPageImage struct {
	Id                         primitive.ObjectID   `bson:"_id,omitempty"`
	IDstr                      GFcrawlerPageImageID `bson:"id_str"`
	T_str                      string               `bson:"t"` // "crawler_page_img"
	Creation_unix_time_f       float64              `bson:"creation_unix_time_f"`
	Crawler_name_str           string               `bson:"crawler_name_str"` // name of the crawler that discovered this image
	Cycle_run_id_str           string               `bson:"cycle_run_id_str"`
	Img_ext_str                string               `bson:"img_ext_str"` // jpg|gif|png
	Url_str                    string               `bson:"url_str"`
	Domain_str                 string               `bson:"domain_str"`                 // domain of the url_str
	Origin_page_url_str        string               `bson:"origin_page_url_str"`        // page url from whose html this element was extracted
	Origin_page_url_domain_str string               `bson:"origin_page_url_domain_str"` // domain of the origin_page_url_str // NEW_FIELD!! a lot of records don't have this field

	// IMPORTANT!! - this is unique for the image src encountered. this way the same data links are not entered in duplicates,
	//               and using the hash the DB can quickly be checked for existence of record
	Hash_str string `bson:"hash_str"`

	// IMPORTANT!! - indicates if the image was fetched from the remote server,
	//               and has been stored on S3 and ready for usage by other services.
	Downloaded_bool bool `bson:"downloaded_bool"`

	// IMPORTANT!! - the usage was determined to be useful for internal applications,
	//               they're not page elements, or other small unimportant parts.
	//               if it is valid for usage then a gf_image for this image should be
	//               found in the db
	Valid_for_usage_bool bool                     `bson:"valid_for_usage_bool"`
	S3_stored_bool       bool                     `bson:"s3_stored_bool"` // if persisting to s3 succeeded
	Nsfv_bool            bool                     `bson:"nsfv_bool"`      // NSFV (not safe for viewing/nudity) flag for the image
	GFimageIDstr         gf_images_core.GFimageID `bson:"image_id_str"`   // id of the gf_image for this corresponding crawler_page_img //FIX!! - should be "gf_image_id_str"
}

type GFcrawlerPageImageID

// GFcrawlerPageImageID is the string-based ID type of a GFcrawlerPageImage
// (the type of its IDstr field).
type GFcrawlerPageImageID string

type GFcrawlerPageImageRef

// GFcrawlerPageImageRef is the DB (bson) record of a reference to an image
// found on a particular page (Origin_page_url_str).
type GFcrawlerPageImageRef struct {
	Id                         primitive.ObjectID `bson:"_id,omitempty"`
	Id_str                     string             `bson:"id_str"`
	T_str                      string             `bson:"t"` // "crawler_page_img_ref"
	Creation_unix_time_f       float64            `bson:"creation_unix_time_f"`
	Crawler_name_str           string             `bson:"crawler_name_str"` // name of the crawler that discovered this image
	Cycle_run_id_str           string             `bson:"cycle_run_id_str" json:"cycle_run_id_str"`
	Url_str                    string             `bson:"url_str"`
	Domain_str                 string             `bson:"domain_str"`
	Origin_page_url_str        string             `bson:"origin_page_url_str"`        // page url from whose html this element was extracted
	Origin_page_url_domain_str string             `bson:"origin_page_url_domain_str"` // NEW_FIELD!! a lot of records don't have this field

	// IMPORTANT!! - this is unique for the image src encountered. this way the same data links are not entered in duplicates,
	//               and using the hash the DB can quickly be checked for existence of record
	Hash_str string `bson:"hash_str"`
}

IMPORTANT!! - a reference to an image on a particular page.

The same image, with the same Url_str, can appear on multiple pages; this
struct tracks that with one record per reference.
// GFcrawlerPageOutgoingLink is the DB (bson) record of an outgoing link
// (A_href_str) discovered by a crawler in a page (Origin_url_str), together
// with its crawl/fetch/import state and the last error recorded for it.
type GFcrawlerPageOutgoingLink struct {
	Id                    primitive.ObjectID `bson:"_id,omitempty"`
	IDstr                 string             `bson:"id_str"`
	T_str                 string             `bson:"t"` // "crawler_page_outgoing_link"
	CreationUNIXtimeF     float64            `bson:"creation_unix_time_f"`
	Crawler_name_str      string             `bson:"crawler_name_str"` // name of the crawler that discovered this link
	Cycle_run_id_str      string             `bson:"cycle_run_id_str"`
	A_href_str            string             `bson:"a_href_str"`
	Domain_str            string             `bson:"domain_str"`
	Origin_url_str        string             `bson:"origin_url_str"` // page url from whose html this element was extracted
	Origin_url_domain_str string             `bson:"origin_url_domain_str"`

	// IMPORTANT!! - hash identifying this link. NOTE(review): the original comment was
	//               truncated ("this is a hash of the . it"); what exactly is hashed
	//               is not visible here - TODO confirm and document.
	Hash_str string `bson:"hash_str"`

	Valid_for_crawl_bool  bool    `bson:"valid_for_crawl_bool"`  // if the link should be crawled, or if it should be ignored
	Images_processed_bool bool    `bson:"images_processed_bool"` // if all the images in the page have been downloaded/transformed/stored-in-s3
	Fetched_bool          bool    `bson:"fetched_bool"`          // indicator if the link has been fetched (its html downloaded and parsed)
	Fetch_last_id_str     string  `bson:"fetch_last_id_str"`
	Fetch_last_time_f     float64 `bson:"fetch_last_time_f"`

	//-------------------
	// IMPORTANT!! - indicates if this link is currently being processed by some
	//               crawler master/worker in the cluster
	Import__in_progress_bool bool    `bson:"import__in_progress_bool"`
	Import__start_time_f     float64 `bson:"import__start_time_f"` // when has the "in_progress" flag been set. for detecting interrupted/incomplete imports

	//-------------------
	// IMPORTANT!! - last error that occurred/interrupted processing of this link
	Error_type_str string `bson:"error_type_str,omitempty"`
	Error_id_str   string `bson:"error_id_str,omitempty"`
}
func DBmongoGetLink(pLink_id_str string,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)

func DBmongoLinkGetUnresolved

func DBmongoLinkGetUnresolved(pCrawlerNameStr string,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)

type GFcrawlerRecentImages

// GFcrawlerRecentImages holds recent crawled-image info for one domain, as
// returned by DBmongoImagesGetRecent. The domain is decoded from the bson "_id"
// key and encoded to JSON as "domain_str".
// NOTE(review): the "_id" mapping suggests these are results of a DB
// group-by-domain query, and that the *_lst fields are parallel lists with one
// entry per image - both are assumptions, confirm against the query.
type GFcrawlerRecentImages struct {
	Domain_str               string    `bson:"_id"                      json:"domain_str"`
	Imgs_count_int           int       `bson:"imgs_count_int"           json:"imgs_count_int"`
	Crawler_page_img_ids_lst []string  `bson:"crawler_page_img_ids_lst" json:"crawler_page_img_ids_lst"`
	Creation_times_lst       []float64 `bson:"creation_times_lst"       json:"creation_times_lst"`
	Urls_lst                 []string  `bson:"urls_lst"                 json:"urls_lst"`
	Nsfv_lst                 []bool    `bson:"nsfv_lst"                 json:"nsfv_lst"`
	Origin_page_urls_lst     []string  `bson:"origin_page_urls_lst"     json:"origin_page_urls_lst"`
}

func DBmongoImagesGetRecent

func DBmongoImagesGetRecent(pRuntimeSys *gf_core.RuntimeSys) ([]GFcrawlerRecentImages, *gf_core.GFerror)

type GFcrawlerRuntime

// GFcrawlerRuntime bundles the crawler-specific runtime dependencies that are
// passed (as pRuntime) to most functions in this package.
type GFcrawlerRuntime struct {
	EventsCtx                     *gf_events.EventsCtx // events context (gf_events)
	EsearchClient                 *elastic.Client      // Elasticsearch client
	S3info                        *gf_aws.GFs3Info     // S3 info (gf_aws)
	ImagesUseNewStorageEngineBool bool                 // presumably selects the newer images storage engine - TODO confirm

	// path of a directory of Python plugins, judging by the name - TODO confirm usage
	PluginsPyDirPathStr string
}

func T__init

func T__init() (*gf_core.RuntimeSys, *GFcrawlerRuntime)

type GFcrawlerURLfetch

// GFcrawlerURLfetch is the DB (bson) and JSON record of a single fetch of a URL
// (returned by FetchURL). Start_time_f and End_time_f are stored in the DB but
// excluded from JSON output (json:"-").
type GFcrawlerURLfetch struct {
	Id                   primitive.ObjectID `bson:"_id,omitempty"`
	Id_str               string             `bson:"id_str"               json:"id_str"`
	T_str                string             `bson:"t"                    json:"t"` // "crawler_url_fetch"
	Creation_unix_time_f float64            `bson:"creation_unix_time_f" json:"creation_unix_time_f"`
	Cycle_run_id_str     string             `bson:"cycle_run_id_str"     json:"cycle_run_id_str"`
	Domain_str           string             `bson:"domain_str"           json:"domain_str"`
	Url_str              string             `bson:"url_str"              json:"url_str"`
	Start_time_f         float64            `bson:"start_time_f"         json:"-"`
	End_time_f           float64            `bson:"end_time_f"           json:"-"`
	Page_text_str        string             `bson:"page_text_str"        json:"page_text_str"` // full text of the page html - indexed in ES

	//-------------------
	// IMPORTANT!! - last error that occurred/interrupted processing of this link
	Error_type_str string `bson:"error_type_str,omitempty"`
	Error_id_str   string `bson:"error_id_str,omitempty"`
	// contains filtered or unexported fields
}

func FetchURL

func FetchURL(pURLstr string,
	pLink *GFcrawlerPageOutgoingLink,
	pCycleRunIDstr string,
	pCrawlerNameStr string,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerURLfetch, string, *gf_core.GFerror)
// Gf_crawl_link_alloc is the DB (bson) record of a link allocator for a crawler
// (created by DBmongoLinkAllocCreate, run by LinkAllocRun).
type Gf_crawl_link_alloc struct {
	Id                   primitive.ObjectID `bson:"_id,omitempty"`
	Id_str               string             `bson:"id_str"`
	T_str                string             `bson:"t"` // "crawler_link_alloc"
	Creation_unix_time_f float64            `bson:"creation_unix_time_f"`
	Crawler_name_str     string             `bson:"crawler_name_str"`
	Block_size_int       int                `bson:"block_size_int"`
	Sleep_time_sec_int   int                `bson:"sleep_time_sec_int"`

	// NOTE(review): Last_run_unix_time_f has no bson tag, unlike every other field
	//               in this struct, so its DB key follows the driver default -
	//               confirm this is intended.
	Last_run_unix_time_f      float64
	Current_link_block_id_str string `bson:"current_link_block_id_str"`
}

func DBmongoLinkAllocCreate

func DBmongoLinkAllocCreate(pCrawlerNameStr string,
	pRuntimeSys *gf_core.RuntimeSys) (*Gf_crawl_link_alloc, *gf_core.GFerror)
// Gf_crawl_link_alloc_block is the DB (bson) record of a block of unresolved
// link IDs belonging to a link allocator (created by DBmongoLinkAllocCreateLinksBlock).
type Gf_crawl_link_alloc_block struct {
	Id                       primitive.ObjectID `bson:"_id,omitempty"`
	Id_str                   string             `bson:"id_str"`
	Creation_unix_time_f     float64            `bson:"creation_unix_time_f"`
	T_str                    string             `bson:"t"` // "crawler_link_alloc_block"
	Allocator_id_str         string             `bson:"allocator_id_str"`         // presumably the Id_str of the owning Gf_crawl_link_alloc - TODO confirm
	Unresolved_links_ids_lst []string           `bson:"unresolved_links_ids_lst"` // IDs of the unresolved links in this block
}

func DBmongoLinkAllocCreateLinksBlock

func DBmongoLinkAllocCreateLinksBlock(p_alloc_id_str string,
	pCrawlerNameStr string,
	pBlockSizeInt int,
	pRuntimeSys *gf_core.RuntimeSys) (*Gf_crawl_link_alloc_block, *gf_core.GFerror)

type Gf_index__query_run

// Gf_index__query_run is the DB (bson) record of a single index query run:
// its run time, total hit count, and the scores and URLs of its hits.
// NOTE(review): Hits_scores_lst and Hits_urls_lst look like parallel lists
// (one entry per hit) - confirm against the code that populates them.
type Gf_index__query_run struct {
	Id                   primitive.ObjectID `bson:"_id,omitempty"`
	Id_str               string             `bson:"id_str"`
	T_str                string             `bson:"t"` // "index__query_run"
	Run_time_milisec_int int64              `bson:"run_time_milisec_int"`
	Hits_total_int       int64              `bson:"hits_total_int"`
	Hits_scores_lst      []float64          `bson:"hits_scores_lst"`
	Hits_score_max_f     float64            `bson:"hits_score_max_f"`
	Hits_urls_lst        []string           `bson:"hits_urls_lst"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL