gf_crawl_core

package

v0.0.0-...-a1b0e2b Latest Latest Go to latest Published: Apr 26, 2024 License: GPL-2.0 Imports: 30 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/gloflow/gloflow

Links

Open Source Insights

Documentation ¶

Index ¶

func DBimageMarkAsDownloaded(p_image *GFcrawlerPageImage, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoCreateLink(pLink *GFcrawlerPageOutgoingLink, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoImageCreate(p_img *GFcrawlerPageImage, pRuntime *GFcrawlerRuntime, ...) (bool, *gf_core.GFerror)
func DBmongoImageCreateRef(p_img_ref *GFcrawlerPageImageRef, pRuntime *GFcrawlerRuntime, ...) *gf_core.GFerror
func DBmongoImageSetImageID(pGFimageIDstr gf_images_core.GFimageID, pImage *GFcrawlerPageImage, ...) *gf_core.GFerror
func DBmongoImageUpdateAfterProcess(pPageImg *GFcrawlerPageImage, pGFimageIDstr gf_images_core.GFimageID, ...) *gf_core.GFerror
func DBmongoLinkExists(pLinkHashStr string, pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)
func DBmongoLinkIndexInit(pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func DBmongoLinkMarkAsResolved(pLink *GFcrawlerPageOutgoingLink, pFetchIDstr string, ...) *gf_core.GFerror
func DBmongoLinkMarkImportInProgress(pStatusBool bool, p_unix_time_f float64, pLink *GFcrawlerPageOutgoingLink, ...) *gf_core.GFerror
func FetchParseResult(pURLfetch *GFcrawlerURLfetch, pCycleRunIDstr string, pCrawlerNameStr string, ...) *gf_core.GFerror
func FlowsAddExternImage(pCrawlerPageImageIDstr GFcrawlerPageImageID, pFlowsNamesLst []string, ...) *gf_core.GFerror
func GetAllCrawlers(pCrawlConfigFilePathStr string, pRuntimeSys *gf_core.RuntimeSys) (map[string]GFcrawlerDef, *gf_core.GFerror)
func IndexQuery(p_term_str string, pRuntime *GFcrawlerRuntime, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func LinkAllocInit(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func LinkAllocRun(pAlloc *Gf_crawl_link_alloc, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror
func LinksGetOutgoingInPage(pURLfetch *GFcrawlerURLfetch, pCycleRunIDstr string, pCrawlerNameStr string, ...)
type GFcrawlConfig
type GFcrawlerDef
type GFcrawlerError
- func CreateErrorAndEvent(pErrorTypeStr string, pErrorMsgStr string, ...) (*GFcrawlerError, *gf_core.GFerror)
type GFcrawlerPageImage
type GFcrawlerPageImageID
type GFcrawlerPageImageRef
type GFcrawlerPageOutgoingLink
- func DBmongoGetLink(pLink_id_str string, pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)
- func DBmongoLinkGetUnresolved(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)
type GFcrawlerRecentImages
- func DBmongoImagesGetRecent(pRuntimeSys *gf_core.RuntimeSys) ([]GFcrawlerRecentImages, *gf_core.GFerror)
type GFcrawlerRuntime
- func T__init() (*gf_core.RuntimeSys, *GFcrawlerRuntime)
type GFcrawlerURLfetch
- func FetchURL(pURLstr string, pLink *GFcrawlerPageOutgoingLink, pCycleRunIDstr string, ...) (*GFcrawlerURLfetch, string, *gf_core.GFerror)
type Gf_crawl_link_alloc
- func DBmongoLinkAllocCreate(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) (*Gf_crawl_link_alloc, *gf_core.GFerror)
type Gf_crawl_link_alloc_block
- func DBmongoLinkAllocCreateLinksBlock(p_alloc_id_str string, pCrawlerNameStr string, pBlockSizeInt int, ...) (*Gf_crawl_link_alloc_block, *gf_core.GFerror)
type Gf_index__query_run

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func DBimageMarkAsDownloaded ¶

func DBimageMarkAsDownloaded(p_image *GFcrawlerPageImage, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoCreateLink ¶

func DBmongoCreateLink(pLink *GFcrawlerPageOutgoingLink,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoImageCreate ¶

func DBmongoImageCreate(p_img *GFcrawlerPageImage,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)

func DBmongoImageCreateRef ¶

func DBmongoImageCreateRef(p_img_ref *GFcrawlerPageImageRef,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoImageSetImageID ¶

func DBmongoImageSetImageID(pGFimageIDstr gf_images_core.GFimageID,
	pImage *GFcrawlerPageImage,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoImageUpdateAfterProcess ¶

func DBmongoImageUpdateAfterProcess(pPageImg *GFcrawlerPageImage,
	pGFimageIDstr gf_images_core.GFimageID,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoLinkExists ¶

func DBmongoLinkExists(pLinkHashStr string,
	pRuntimeSys *gf_core.RuntimeSys) (bool, *gf_core.GFerror)

func DBmongoLinkIndexInit ¶

func DBmongoLinkIndexInit(pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoLinkMarkAsResolved ¶

func DBmongoLinkMarkAsResolved(pLink *GFcrawlerPageOutgoingLink,
	pFetchIDstr string,
	pFetchCreationTimeF float64,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func DBmongoLinkMarkImportInProgress ¶

func DBmongoLinkMarkImportInProgress(pStatusBool bool,
	p_unix_time_f float64,
	pLink *GFcrawlerPageOutgoingLink,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func FetchParseResult ¶

func FetchParseResult(pURLfetch *GFcrawlerURLfetch,
	pCycleRunIDstr string,
	pCrawlerNameStr string,
	pImagesLocalDirPathStr string,

	pMediaDomainStr string,
	pS3bucketNameStr string,
	pUserID gf_core.GF_ID,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func FlowsAddExternImage ¶

func FlowsAddExternImage(pCrawlerPageImageIDstr GFcrawlerPageImageID,
	pFlowsNamesLst []string,
	pMediaDomainStr string,
	pCrawledImagesS3bucketNameStr string,
	pImagesS3bucketNameStr string,
	pUserID gf_core.GF_ID,
	pCtx context.Context,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

FlowsAddExternImage adds an image that was already crawled from an external source URL to some named list of flows in the gf_images app/system. To do this it adds the flow_name to the gf_image DB record, and then copies the discovered image file from the gf_crawlers file_storage (S3/IPFS) to the gf_images service file_storage (S3/IPFS). At the moment it is called directly in the gf_crawl HTTP handler.

func GetAllCrawlers ¶

func GetAllCrawlers(pCrawlConfigFilePathStr string,
	pRuntimeSys *gf_core.RuntimeSys) (map[string]GFcrawlerDef, *gf_core.GFerror)

func IndexQuery ¶

func IndexQuery(p_term_str string,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func LinkAllocInit ¶

func LinkAllocInit(pCrawlerNameStr string, pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func LinkAllocRun ¶

func LinkAllocRun(pAlloc *Gf_crawl_link_alloc,
	pRuntimeSys *gf_core.RuntimeSys) *gf_core.GFerror

func LinksGetOutgoingInPage ¶

func LinksGetOutgoingInPage(pURLfetch *GFcrawlerURLfetch,
	pCycleRunIDstr string,
	pCrawlerNameStr string,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys)

Types ¶

type GFcrawlConfig ¶

// GFcrawlConfig is the crawl configuration as mapped from YAML.
type GFcrawlConfig struct {
	// CrawlersDefsLst is the list of crawler definitions, read from the
	// "crawlers-defs" YAML key.
	CrawlersDefsLst []GFcrawlerDef `yaml:"crawlers-defs"`
}

type GFcrawlerDef ¶

// GFcrawlerDef is the definition of a single crawler, as mapped from the
// YAML crawl configuration.
type GFcrawlerDef struct {
	NameStr     string `yaml:"name"`      // crawler name ("name" YAML key)
	StartURLstr string `yaml:"start-url"` // start URL of the crawler ("start-url" YAML key)
}

type GFcrawlerError ¶

// GFcrawlerError is the record of a crawler error, with BSON and JSON
// field mappings.
type GFcrawlerError struct {
	Id                primitive.ObjectID     `bson:"_id,omitempty"    json:"-"`
	IDstr             string                 `bson:"id_str"           json:"id_str"`
	Tstr              string                 `bson:"t"                json:"t"` // "crawler_error"
	CreationUNIXtimeF float64                `bson:"creation_unix_time_f"` // NOTE(review): no json tag, unlike the sibling fields - confirm this is intended
	TypeStr           string                 `bson:"type_str"         json:"type_str"`
	MsgStr            string                 `bson:"msg_str"          json:"msg_str"`
	DataMap           map[string]interface{} `bson:"data_map"         json:"data_map"` // if an error is related to a particular URL, it is noted here.
	GFerrorIDstr      string                 `bson:"gf_error_id_str"  json:"gf_error_id_str"`
	CrawlerNameStr    string                 `bson:"crawler_name_str" json:"crawler_name_str"`
	URLstr            string                 `bson:"url_str"          json:"url_str"`
}

func CreateErrorAndEvent ¶

func CreateErrorAndEvent(pErrorTypeStr string,
	pErrorMsgStr string,
	pErrorDataMap map[string]interface{},
	pErrorURLstr string,
	pCrawlerNameStr string,
	pGFerr *gf_core.GFerror,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerError, *gf_core.GFerror)

type GFcrawlerPageImage ¶

// GFcrawlerPageImage is the record of an image that a crawler discovered
// in the HTML of a page.
type GFcrawlerPageImage struct {
	Id                         primitive.ObjectID   `bson:"_id,omitempty"`
	IDstr                      GFcrawlerPageImageID `bson:"id_str"`
	T_str                      string               `bson:"t"` // "crawler_page_img"
	Creation_unix_time_f       float64              `bson:"creation_unix_time_f"`
	Crawler_name_str           string               `bson:"crawler_name_str"` // name of the crawler that discovered this image
	Cycle_run_id_str           string               `bson:"cycle_run_id_str"`
	Img_ext_str                string               `bson:"img_ext_str"` // jpg|gif|png
	Url_str                    string               `bson:"url_str"`
	Domain_str                 string               `bson:"domain_str"`                 // domain of the url_str
	Origin_page_url_str        string               `bson:"origin_page_url_str"`        // URL of the page from whose HTML this element was extracted
	Origin_page_url_domain_str string               `bson:"origin_page_url_domain_str"` // domain of the origin_page_url_str // NEW_FIELD!! a lot of records don't have this field

	// IMPORTANT!! - this is unique for the image src encountered. this way the same data links are not entered in duplicates,
	//               and using the hash the DB can quickly be checked for existence of a record
	Hash_str string `bson:"hash_str"`

	// IMPORTANT!! - indicates if the image was fetched from the remote server,
	//               and has been stored on S3 and is ready for usage by other services.
	Downloaded_bool bool `bson:"downloaded_bool"`

	// IMPORTANT!! - the image was determined to be useful for internal applications,
	//               i.e. it is not a page element or some other small unimportant part.
	//               if it is valid for usage then a gf_image for this image should be
	//               found in the db
	Valid_for_usage_bool bool                     `bson:"valid_for_usage_bool"`
	S3_stored_bool       bool                     `bson:"s3_stored_bool"` // if persisting to s3 succeeded
	Nsfv_bool            bool                     `bson:"nsfv_bool"`      // NSFV (not safe for viewing/nudity) flag for the image
	GFimageIDstr         gf_images_core.GFimageID `bson:"image_id_str"`   // id of the gf_image for this corresponding crawler_page_img // FIX!! - should be "gf_image_id_str"
}

type GFcrawlerPageImageID ¶

// GFcrawlerPageImageID is the string ID type of a GFcrawlerPageImage record
// (the type of its IDstr field).
type GFcrawlerPageImageID string

type GFcrawlerPageImageRef ¶

type GFcrawlerPageImageRef struct {
	Id                         primitive.ObjectID `bson:"_id,omitempty"`
	Id_str                     string             `bson:"id_str"`
	T_str                      string             `bson:"t"` // "crawler_page_img_ref"
	Creation_unix_time_f       float64            `bson:"creation_unix_time_f"`
	Crawler_name_str           string             `bson:"crawler_name_str"` // name of the crawler that discovered this image
	Cycle_run_id_str           string             `bson:"cycle_run_id_str" json:"cycle_run_id_str"`
	Url_str                    string             `bson:"url_str"`
	Domain_str                 string             `bson:"domain_str"`
	Origin_page_url_str        string             `bson:"origin_page_url_str"`        // URL of the page from whose HTML this element was extracted
	Origin_page_url_domain_str string             `bson:"origin_page_url_domain_str"` // NEW_FIELD!! a lot of records don't have this field

	// IMPORTANT!! - this is unique for the image src encountered. this way the same data links are not entered in duplicates,
	//               and using the hash the DB can quickly be checked for existence of a record
	Hash_str string `bson:"hash_str"`
}

IMPORTANT!! - a reference to an image on a particular page.

The same image, with the same Url_str, can appear on multiple pages; this
struct tracks that, with one record per reference.

type GFcrawlerPageOutgoingLink ¶

// GFcrawlerPageOutgoingLink is the record of an outgoing link that a crawler
// discovered in the HTML of a page, together with its fetch/import status.
type GFcrawlerPageOutgoingLink struct {
	Id                    primitive.ObjectID `bson:"_id,omitempty"`
	IDstr                 string             `bson:"id_str"`
	T_str                 string             `bson:"t"` // "crawler_page_outgoing_link"
	CreationUNIXtimeF     float64            `bson:"creation_unix_time_f"`
	Crawler_name_str      string             `bson:"crawler_name_str"` // name of the crawler that discovered this link
	Cycle_run_id_str      string             `bson:"cycle_run_id_str"`
	A_href_str            string             `bson:"a_href_str"`
	Domain_str            string             `bson:"domain_str"`
	Origin_url_str        string             `bson:"origin_url_str"` // URL of the page from whose HTML this element was extracted
	Origin_url_domain_str string             `bson:"origin_url_domain_str"`

	// IMPORTANT!! - hash of this link. The original comment was truncated and does not
	//               state what exactly is hashed - TODO confirm the hash input.
	Hash_str string `bson:"hash_str"`

	Valid_for_crawl_bool  bool    `bson:"valid_for_crawl_bool"`  // if the link should be crawled, or if it should be ignored
	Images_processed_bool bool    `bson:"images_processed_bool"` // if all the images in the page have been downloaded/transformed/stored-in-s3
	Fetched_bool          bool    `bson:"fetched_bool"`          // indicator if the link has been fetched (its html downloaded and parsed)
	Fetch_last_id_str     string  `bson:"fetch_last_id_str"`
	Fetch_last_time_f     float64 `bson:"fetch_last_time_f"`

	//-------------------
	// IMPORTANT!! - indicates if this link is currently being processed by some
	//               crawler master/worker in the cluster
	Import__in_progress_bool bool    `bson:"import__in_progress_bool"`
	Import__start_time_f     float64 `bson:"import__start_time_f"` // when the "in_progress" flag was set. for detecting interrupted/incomplete imports

	//-------------------
	// IMPORTANT!! - last error that occurred/interrupted processing of this link
	Error_type_str string `bson:"error_type_str,omitempty"`
	Error_id_str   string `bson:"error_id_str,omitempty"`
}

func DBmongoGetLink ¶

func DBmongoGetLink(pLink_id_str string,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)

func DBmongoLinkGetUnresolved ¶

func DBmongoLinkGetUnresolved(pCrawlerNameStr string,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerPageOutgoingLink, *gf_core.GFerror)

type GFcrawlerRecentImages ¶

// GFcrawlerRecentImages holds, for one domain, an image count and parallel
// lists of image attributes (IDs, creation times, URLs, NSFV flags, origin
// page URLs).
// NOTE(review): Domain_str is read from the BSON "_id" key, which looks like
// the output of a group-by-domain aggregation - confirm against
// DBmongoImagesGetRecent.
type GFcrawlerRecentImages struct {
	Domain_str               string    `bson:"_id"                      json:"domain_str"`
	Imgs_count_int           int       `bson:"imgs_count_int"           json:"imgs_count_int"`
	Crawler_page_img_ids_lst []string  `bson:"crawler_page_img_ids_lst" json:"crawler_page_img_ids_lst"`
	Creation_times_lst       []float64 `bson:"creation_times_lst"       json:"creation_times_lst"`
	Urls_lst                 []string  `bson:"urls_lst"                 json:"urls_lst"`
	Nsfv_lst                 []bool    `bson:"nsfv_lst"                 json:"nsfv_lst"`
	Origin_page_urls_lst     []string  `bson:"origin_page_urls_lst"     json:"origin_page_urls_lst"`
}

func DBmongoImagesGetRecent ¶

func DBmongoImagesGetRecent(pRuntimeSys *gf_core.RuntimeSys) ([]GFcrawlerRecentImages, *gf_core.GFerror)

type GFcrawlerRuntime ¶

// GFcrawlerRuntime bundles the crawler-specific runtime dependencies and
// settings that are passed (as pRuntime) to the functions of this package.
type GFcrawlerRuntime struct {
	EventsCtx                     *gf_events.EventsCtx // events context
	EsearchClient                 *elastic.Client      // Elasticsearch client
	S3info                        *gf_aws.GFs3Info     // S3 connection info
	ImagesUseNewStorageEngineBool bool

	PluginsPyDirPathStr string // NOTE(review): presumably the directory path of the Python plugins - confirm
}

func T__init ¶

func T__init() (*gf_core.RuntimeSys, *GFcrawlerRuntime)

type GFcrawlerURLfetch ¶

// GFcrawlerURLfetch is the record of a single fetch of a URL by a crawler.
type GFcrawlerURLfetch struct {
	Id                   primitive.ObjectID `bson:"_id,omitempty"`
	Id_str               string             `bson:"id_str"               json:"id_str"`
	T_str                string             `bson:"t"                    json:"t"` // "crawler_url_fetch"
	Creation_unix_time_f float64            `bson:"creation_unix_time_f" json:"creation_unix_time_f"`
	Cycle_run_id_str     string             `bson:"cycle_run_id_str"     json:"cycle_run_id_str"`
	Domain_str           string             `bson:"domain_str"           json:"domain_str"`
	Url_str              string             `bson:"url_str"              json:"url_str"`
	Start_time_f         float64            `bson:"start_time_f"         json:"-"` // excluded from JSON output
	End_time_f           float64            `bson:"end_time_f"           json:"-"` // excluded from JSON output
	Page_text_str        string             `bson:"page_text_str"        json:"page_text_str"` // full text of the page html - indexed in ES

	//-------------------
	// IMPORTANT!! - last error that occurred/interrupted processing of this link
	Error_type_str string `bson:"error_type_str,omitempty"`
	Error_id_str   string `bson:"error_id_str,omitempty"`
	// contains filtered or unexported fields
}

func FetchURL ¶

func FetchURL(pURLstr string,
	pLink *GFcrawlerPageOutgoingLink,
	pCycleRunIDstr string,
	pCrawlerNameStr string,
	pRuntime *GFcrawlerRuntime,
	pRuntimeSys *gf_core.RuntimeSys) (*GFcrawlerURLfetch, string, *gf_core.GFerror)

type Gf_crawl_link_alloc ¶

// Gf_crawl_link_alloc is the record of a link allocator for a named crawler.
type Gf_crawl_link_alloc struct {
	Id                   primitive.ObjectID `bson:"_id,omitempty"`
	Id_str               string             `bson:"id_str"`
	T_str                string             `bson:"t"` // "crawler_link_alloc"
	Creation_unix_time_f float64            `bson:"creation_unix_time_f"`
	Crawler_name_str     string             `bson:"crawler_name_str"`
	Block_size_int       int                `bson:"block_size_int"`
	Sleep_time_sec_int   int                `bson:"sleep_time_sec_int"`

	// NOTE(review): Last_run_unix_time_f has no bson tag, unlike every other field
	// in this struct - confirm that the driver's default key naming is intended.
	Last_run_unix_time_f      float64
	Current_link_block_id_str string `bson:"current_link_block_id_str"`
}

func DBmongoLinkAllocCreate ¶

func DBmongoLinkAllocCreate(pCrawlerNameStr string,
	pRuntimeSys *gf_core.RuntimeSys) (*Gf_crawl_link_alloc, *gf_core.GFerror)

type Gf_crawl_link_alloc_block ¶

// Gf_crawl_link_alloc_block is the record of a block of unresolved link IDs
// that belongs to a link allocator (referenced via Allocator_id_str).
type Gf_crawl_link_alloc_block struct {
	Id                       primitive.ObjectID `bson:"_id,omitempty"`
	Id_str                   string             `bson:"id_str"`
	Creation_unix_time_f     float64            `bson:"creation_unix_time_f"`
	T_str                    string             `bson:"t"` // "crawler_link_alloc_block"
	Allocator_id_str         string             `bson:"allocator_id_str"`
	Unresolved_links_ids_lst []string           `bson:"unresolved_links_ids_lst"`
}

func DBmongoLinkAllocCreateLinksBlock ¶

func DBmongoLinkAllocCreateLinksBlock(p_alloc_id_str string,
	pCrawlerNameStr string,
	pBlockSizeInt int,
	pRuntimeSys *gf_core.RuntimeSys) (*Gf_crawl_link_alloc_block, *gf_core.GFerror)

type Gf_index__query_run ¶

// Gf_index__query_run is the record of one index query run: its run time,
// the total number of hits, and the scores and URLs of the hits.
type Gf_index__query_run struct {
	Id                   primitive.ObjectID `bson:"_id,omitempty"`
	Id_str               string             `bson:"id_str"`
	T_str                string             `bson:"t"` // "index__query_run"
	Run_time_milisec_int int64              `bson:"run_time_milisec_int"` // run time in milliseconds (per the field name)
	Hits_total_int       int64              `bson:"hits_total_int"`
	Hits_scores_lst      []float64          `bson:"hits_scores_lst"`
	Hits_score_max_f     float64            `bson:"hits_score_max_f"`
	Hits_urls_lst        []string           `bson:"hits_urls_lst"`
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL