model

package
v0.11.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 25, 2018 License: Apache-2.0 Imports: 8 Imported by: 28

Documentation

Index

Constants

View Source
// Context keys of type pipeline.ParaKey. Unlike the GOPA_TASK_* keys declared
// elsewhere in this package, these two string values carry no "GOPA_" prefix.
// NOTE(review): the names suggest they address a snapshot and a set of page
// links held in a pipeline context — confirm against the pipeline package.
const (
	CONTEXT_SNAPSHOT   pipeline.ParaKey = "SNAPSHOT"
	CONTEXT_PAGE_LINKS pipeline.ParaKey = "PAGE_LINKS"
)

Common pipeline context keys

View Source
// Context keys of type pipeline.ParaKey for task-related values.
//
// Every CONTEXT_TASK_X key has the string value "GOPA_TASK_X". Except for URL
// (the Task field is spelled Url) and Cookies (a field of HostConfig, not of
// Task), each suffix X is also the name of a field of the Task struct.
// NOTE(review): presumably each key carries the matching Task field through a
// pipeline context — confirm against the code that reads these keys.
const (
	CONTEXT_TASK_ID               pipeline.ParaKey = "GOPA_TASK_ID"
	CONTEXT_TASK_URL              pipeline.ParaKey = "GOPA_TASK_URL"
	CONTEXT_TASK_Reference        pipeline.ParaKey = "GOPA_TASK_Reference"
	CONTEXT_TASK_Depth            pipeline.ParaKey = "GOPA_TASK_Depth"
	CONTEXT_TASK_Breadth          pipeline.ParaKey = "GOPA_TASK_Breadth"
	CONTEXT_TASK_Host             pipeline.ParaKey = "GOPA_TASK_Host"
	CONTEXT_TASK_Schema           pipeline.ParaKey = "GOPA_TASK_Schema"
	CONTEXT_TASK_OriginalUrl      pipeline.ParaKey = "GOPA_TASK_OriginalUrl"
	CONTEXT_TASK_Status           pipeline.ParaKey = "GOPA_TASK_Status"
	CONTEXT_TASK_Message          pipeline.ParaKey = "GOPA_TASK_Message"
	CONTEXT_TASK_Created          pipeline.ParaKey = "GOPA_TASK_Created"
	CONTEXT_TASK_Updated          pipeline.ParaKey = "GOPA_TASK_Updated"
	CONTEXT_TASK_LastFetch        pipeline.ParaKey = "GOPA_TASK_LastFetch"
	CONTEXT_TASK_LastCheck        pipeline.ParaKey = "GOPA_TASK_LastCheck"
	CONTEXT_TASK_NextCheck        pipeline.ParaKey = "GOPA_TASK_NextCheck"
	CONTEXT_TASK_SnapshotID       pipeline.ParaKey = "GOPA_TASK_SnapshotID"
	CONTEXT_TASK_SnapshotSimHash  pipeline.ParaKey = "GOPA_TASK_SnapshotSimHash"
	CONTEXT_TASK_SnapshotHash     pipeline.ParaKey = "GOPA_TASK_SnapshotHash"
	CONTEXT_TASK_SnapshotCreated  pipeline.ParaKey = "GOPA_TASK_SnapshotCreated"
	CONTEXT_TASK_SnapshotVersion  pipeline.ParaKey = "GOPA_TASK_SnapshotVersion"
	CONTEXT_TASK_LastScreenshotID pipeline.ParaKey = "GOPA_TASK_LastScreenshotID"
	CONTEXT_TASK_PipelineConfigID pipeline.ParaKey = "GOPA_TASK_PipelineConfigID"
	CONTEXT_TASK_Cookies          pipeline.ParaKey = "GOPA_TASK_Cookies"

	// The only snapshot-scoped key in this block; its suffix matches the
	// ContentType field of the Snapshot struct.
	CONTEXT_SNAPSHOT_ContentType pipeline.ParaKey = "GOPA_SNAPSHOT_ContentType"
)
View Source
// The PreFetch* constants are untyped integer constants covering the values
// 3 through 6: PreFetchPendingCheck=3, PreFetchCheck=4, PreFetchChecking=5,
// PreFetchCheckError=6. This listing is ordered by name, not by value.
// NOTE(review): they continue the numbering of the Stage* constants (0-2);
// whether they share a field with them is not visible here — confirm.
const PreFetchCheck = 4
View Source
const PreFetchCheckError = 6
View Source
const PreFetchChecking = 5
View Source
const PreFetchPendingCheck = 3
View Source
// The Stage* constants are untyped integer constants: StagePreFetch=0,
// StageFetch=1, StageAfterFetch=2. This listing is ordered by name, not by
// value.
// NOTE(review): presumably the values stored in FetchTask.CurrentStage and
// used as keys of FetchTask.StageStatus — confirm against UpdateStatus.
const StageAfterFetch = 2
View Source
const StageFetch = 1
View Source
const StagePreFetch = 0
View Source
// The Task* constants are typed int status codes. Sorted by value they are:
// TaskCreated=0, TaskFailed=2, TaskSuccess=3, Task404=4, TaskRedirected=5,
// TaskTimeout=6, TaskDuplicated=7, TaskInterrupted=8, TaskPendingFetch=9.
// No exported constant in this listing has the value 1.
// NOTE(review): presumably the values of Task.Status and the inputs accepted
// by GetTaskStatusText — confirm against their implementations.
const Task404 int = 4
View Source
const TaskCreated int = 0
View Source
const TaskDuplicated int = 7
View Source
const TaskFailed int = 2
View Source
const TaskInterrupted int = 8
View Source
const TaskPendingFetch int = 9
View Source
const TaskRedirected int = 5
View Source
const TaskSuccess int = 3
View Source
const TaskTimeout int = 6

Variables

This section is empty.

Functions

func CreateHostConfig

func CreateHostConfig(config *HostConfig) error

func CreateProject

func CreateProject(project *Project) error

func CreateSnapshot

func CreateSnapshot(snapshot *Snapshot) error

func CreateTask

func CreateTask(task *Task) error

func DeleteHostConfig

func DeleteHostConfig(id string) error

func DeleteProject

func DeleteProject(id string) error

func DeleteSnapshot

func DeleteSnapshot(snapshot *Snapshot) error

func DeleteTask

func DeleteTask(id string) error

func GetHostStatus

func GetHostStatus(status int) (error, map[string]interface{})

func GetTaskStatus

func GetTaskStatus(host string) (error, map[string]interface{})

func GetTaskStatusText

func GetTaskStatusText(status int) string

func UpdateHostConfig

func UpdateHostConfig(config *HostConfig) error

func UpdateProject

func UpdateProject(project *Project) error

func UpdateTask

func UpdateTask(task *Task) error

Types

type Domain

// Domain groups a host and port with a favicon and an enabled flag.
//
// Only ID carries struct tags; with the standard encoding/json package the
// remaining fields are encoded under their Go field names (Host, Port,
// Favicon, Enabled).
type Domain struct {
	// ID is encoded as "id" and omitted when empty.
	// NOTE(review): the elastic_meta:"_id" tag presumably marks it as the
	// Elasticsearch document id — confirm against the persistence layer.
	ID      string `json:"id,omitempty" elastic_meta:"_id"`
	Host    string
	// Port is kept as a string, not a number.
	Port    string
	Favicon string
	Enabled bool
}

type FetchTask

// FetchTask pairs a Url with status and stage bookkeeping. Its only method in
// this listing is UpdateStatus(status int).
type FetchTask struct {
	Url Url

	// NOTE(review): presumably CurrentStage holds one of the Stage* constants
	// and StageStatus maps a stage to its status — confirm against
	// UpdateStatus.
	CurrentStatus int
	CurrentStage  int
	// A zero-value FetchTask has a nil StageStatus map, and writing to a nil
	// map panics; whether UpdateStatus initializes it is not visible here.
	StageStatus   map[int]int
}

func (*FetchTask) UpdateStatus

func (task *FetchTask) UpdateStatus(status int)

type Host

// Host is the record for one host: its name, favicon, enabled flag, optional
// host configs and created/updated timestamps.
type Host struct {
	// Host is the host name.
	// NOTE(review): the elastic_meta:"_id" tag presumably makes it the
	// Elasticsearch document id, and elastic_mapping a keyword field capped
	// at 256 characters — confirm against the persistence layer.
	Host        string        `json:"host,omitempty" elastic_meta:"_id" elastic_mapping:"host: { type: keyword, ignore_above: 256 }"`
	Favicon     string        `json:"favicon,omitempty"`
	// Enabled has no omitempty, so false is always written to JSON.
	Enabled     bool          `json:"enabled"`
	// HostConfigs is a pointer to a slice; with encoding/json a nil pointer is
	// omitted, while a non-nil pointer to an empty slice is still written.
	HostConfigs *[]HostConfig `json:"host_configs,omitempty"`
	// With encoding/json, omitempty has no effect on struct values such as
	// time.Time, so zero timestamps are still written.
	Created     time.Time     `json:"created,omitempty"`
	Updated     time.Time     `json:"updated,omitempty"`
}

Host represents a single host.

func CreateHost

func CreateHost(host string) Host

CreateHost creates a domain host.

func GetHost

func GetHost(host string) (Host, error)

GetHost returns a single host.

func GetHostList

func GetHostList(from, size int, host string) (int, []Host, error)

GetHostList returns a list of hosts.

type HostConfig

// HostConfig is a per-host configuration entry holding a URL pattern, a
// runner name, a sort order, a pipeline id and optional cookies.
// NOTE(review): GetHostConfigByHostAndUrl(runner, host, url) suggests entries
// are selected by runner and host and then matched against UrlPattern —
// confirm against its implementation.
type HostConfig struct {
	// NOTE(review): elastic_meta:"_id" presumably marks the Elasticsearch
	// document id — confirm against the persistence layer.
	ID         string `json:"id,omitempty" elastic_meta:"_id"`
	// Host, UrlPattern, Runner, SortOrder and PipelineID have no omitempty,
	// so they are always written to JSON, even when empty or zero.
	Host       string `json:"host"`
	UrlPattern string `json:"url_pattern"`
	Runner     string `json:"runner"`
	SortOrder  int    `json:"sort_order"`

	PipelineID string `json:"pipeline_id"`
	// Cookies is a single string; its format is not visible here.
	Cookies    string `json:"cookies,omitempty"`

	// With encoding/json, omitempty has no effect on struct values such as
	// time.Time, so zero timestamps are still written.
	Created time.Time `json:"created,omitempty"`
	Updated time.Time `json:"updated,omitempty"`
}

func GetHostConfig

func GetHostConfig(runner, host string) []HostConfig

func GetHostConfigByHostAndUrl

func GetHostConfigByHostAndUrl(runner, host, url string) (*HostConfig, error)

func GetHostConfigByID

func GetHostConfigByID(id string) (HostConfig, error)

func GetHostConfigList

func GetHostConfigList(from, size int, host string) (int, []HostConfig, error)

type Index

// Index bundles a host name with an optional Task and an optional Snapshot.
// NOTE(review): the elastic_mapping tags declare both nested values as
// Elasticsearch "object" fields, so this is presumably the indexed document
// shape — confirm against the indexing code.
type Index struct {
	Host     string    `json:"host,omitempty"`
	// Task and Snapshot are pointers; with encoding/json a nil pointer is
	// omitted because of omitempty.
	Task     *Task     `json:"task,omitempty" elastic_mapping:"task:{type:object}"`
	Snapshot *Snapshot `json:"snapshot,omitempty" elastic_mapping:"snapshot:{type:object}"`
}

type KV

// KV is a key paired with a list of string values. In this listing it is used
// as the element type of Snapshot.Parameters.
type KV struct {
	Key   string   `json:"key,omitempty"`
	// Value holds zero or more values for Key; an empty slice is omitted from
	// JSON.
	Value []string `json:"value,omitempty"`
}

type LinkGroup

// LinkGroup splits a set of page links into an internal and an external
// list. It is the type of Snapshot.Links, and the anonymous Snapshot.Images
// struct has the same two fields and tags.
// NOTE(review): "internal" presumably means links on the same host as the
// page — confirm against the code that fills this in.
type LinkGroup struct {
	Internal []PageLink `json:"internal,omitempty" elastic_mapping:"internal:{type:object}"`
	External []PageLink `json:"external,omitempty" elastic_mapping:"external:{type:object}"`
}
// PageLink is a single link: a URL and its label. The elastic_mapping tags
// declare the URL as a "keyword" field and the label as a "text" field.
type PageLink struct {
	Url   string `json:"url,omitempty" elastic_mapping:"url: { type: keyword }"`
	Label string `json:"label,omitempty" elastic_mapping:"label: { type: text }"`
}

type Project

// Project is a named, describable entry with an enabled flag, timestamps,
// banner and favicon, plus two rule sets (DomainRules and UrlRules).
type Project struct {
	// NOTE(review): elastic_meta:"_id" presumably marks the Elasticsearch
	// document id — confirm against the persistence layer.
	ID          string    `json:"id,omitempty" elastic_meta:"_id"`
	Name        string    `json:"name,omitempty"`
	Description string    `json:"description,omitempty"`
	// Enabled has no omitempty, so false is always written to JSON.
	Enabled     bool      `json:"enabled"`
	// With encoding/json, omitempty has no effect on struct values such as
	// time.Time, so zero timestamps are still written.
	Created     time.Time `json:"created,omitempty"`
	Updated     time.Time `json:"updated,omitempty"`
	Banner      string    `json:"banner,omitempty"`
	Favicon     string    `json:"favicon,omitempty"`

	// NOTE(review): config.Rules is declared outside this package; if it is a
	// struct type, omitempty has no effect on these two fields — confirm.
	DomainRules config.Rules `json:"domain_rules,omitempty"`
	UrlRules    config.Rules `json:"url_rules,omitempty"`
}

Project is a definition that includes a collection of hosts.

func GetProject

func GetProject(id string) (Project, error)

func GetProjectList

func GetProjectList(from, size int) (int, []Project, error)

type Snapshot

// Snapshot is one stored version of a fetched URL: where it is saved, the raw
// response data, and the text features extracted from it.
//
// Fields tagged json:"-" (StatusCode, Payload, Headers, Metadata, Parameters)
// are never written to or read from JSON by encoding/json.
type Snapshot struct {
	// NOTE(review): elastic_meta:"_id" presumably marks the Elasticsearch
	// document id — confirm against the persistence layer.
	ID      string `json:"id,omitempty" elastic_meta:"_id"`
	Version int    `json:"version,omitempty"`
	Url     string `json:"url,omitempty"`
	// TaskID is the id of the related task; GetSnapshotList takes a taskId
	// argument.
	TaskID  string `json:"task_id,omitempty"`
	Path    string `json:"path,omitempty"` //path of this file
	File    string `json:"file,omitempty"` //filename of this page
	Ext     string `json:"ext,omitempty"`  //extension of filename

	// StatusCode and Payload are excluded from JSON; Size is the only one of
	// the three that is serialized.
	// NOTE(review): Size is presumably the length of Payload in bytes —
	// confirm.
	StatusCode int    `json:"-"`
	Payload    []byte `json:"-"`
	Size       uint64 `json:"size,omitempty"`

	ScreenshotID string `json:"screenshot_id,omitempty"`

	// Headers, Metadata and Parameters are excluded from JSON. Metadata is a
	// pointer to a map and may be nil.
	Headers    map[string][]string     `json:"-"`
	Metadata   *map[string]interface{} `json:"-"`
	Parameters []KV                    `json:"-"`

	// Language is serialized under the shorter key "lang".
	Language string `json:"lang,omitempty"`

	// Title is mapped as text with an additional "keyword" sub-field; Text is
	// mapped as text. Summary and ContentType carry no explicit mapping.
	Title       string `json:"title,omitempty" elastic_mapping:"title: { type: text, fields: { keyword: { type: keyword } } }"`
	Summary     string `json:"summary,omitempty"`
	Text        string `json:"text,omitempty" elastic_mapping:"text: { type: text }"`
	ContentType string `json:"content_type,omitempty"`

	Tags []string `json:"tags,omitempty"`

	// With encoding/json, omitempty has no effect on struct values, so Links
	// and Images are always written, even when both inner lists are empty.
	Links LinkGroup `json:"links,omitempty" elastic_mapping:"links:{type:object}"`

	// Images is an anonymous struct with the same fields and tags as
	// LinkGroup.
	Images struct {
		Internal []PageLink `json:"internal,omitempty" elastic_mapping:"internal:{type:object}"`
		External []PageLink `json:"external,omitempty" elastic_mapping:"external:{type:object}"`
	} `json:"images,omitempty" elastic_mapping:"images:{type:object}"`

	// H1-H5 and Bold are mapped as text; Italic has no explicit mapping.
	// NOTE(review): presumably the text of the matching HTML elements —
	// confirm against the parser that fills these in.
	H1     []string `json:"h1,omitempty" elastic_mapping:"h1: { type: text }"`
	H2     []string `json:"h2,omitempty" elastic_mapping:"h2: { type: text }"`
	H3     []string `json:"h3,omitempty" elastic_mapping:"h3: { type: text }"`
	H4     []string `json:"h4,omitempty" elastic_mapping:"h4: { type: text }"`
	H5     []string `json:"h5,omitempty" elastic_mapping:"h5: { type: text }"`
	Bold   []string `json:"bold,omitempty" elastic_mapping:"bold: { type: text }"`
	Italic []string `json:"italic,omitempty"`

	// EnrichedFeatures is a pointer to a map; a nil pointer is omitted from
	// JSON.
	Classifications  []string                `json:"classifications,omitempty"`
	EnrichedFeatures *map[string]interface{} `json:"enriched_features,omitempty"`

	// Hash and SimHash are stored as strings; the hashing scheme is not
	// visible here.
	Hash    string `json:"hash,omitempty"`
	SimHash string `json:"sim_hash,omitempty"`

	// With encoding/json, omitempty has no effect on time.Time, so a zero
	// Created is still written.
	Created time.Time `json:"created,omitempty"`
}

func GetSnapshot

func GetSnapshot(id string) (Snapshot, error)

func GetSnapshotByField

func GetSnapshotByField(k, v string) ([]Snapshot, error)

func GetSnapshotList

func GetSnapshotList(from, size int, taskId string) (int, []Snapshot, error)

type Task

// Task carries a URL together with its reference URL, depth and breadth
// counters, status, timestamps and snapshot bookkeeping fields.
//
// ID, Depth, Breadth, Host and Status have no omitempty and are always
// written to JSON. Snapshots and SnapshotCount are tagged json:"-" and are
// never serialized by encoding/json.
type Task struct {
	// NOTE(review): elastic_meta:"_id" presumably marks the Elasticsearch
	// document id — confirm against the persistence layer.
	ID string `json:"id" elastic_meta:"_id"`
	// the url may not cleaned, may miss the host part, need reference to provide the complete url information
	Url         string    `json:"url,omitempty"`
	// Reference is serialized as "reference_url".
	Reference   string    `json:"reference_url,omitempty"`
	Depth       int       `json:"depth"`
	Breadth     int       `json:"breadth"`
	Host        string    `json:"host"`
	Schema      string    `json:"schema,omitempty"`
	OriginalUrl string    `json:"original_url,omitempty"`
	// NOTE(review): Status presumably holds one of the Task* int constants
	// (TaskCreated, TaskFailed, ...) — confirm against the code that sets it.
	Status      int       `json:"status"`
	Message     string    `json:"message,omitempty"`
	// With encoding/json, omitempty has no effect on time.Time, so zero
	// timestamps are still written. All five are mapped as "date" fields.
	Created     time.Time `json:"created,omitempty" elastic_mapping:"created: { type: date }"`
	Updated     time.Time `json:"updated,omitempty" elastic_mapping:"updated: { type: date }"`
	LastFetch   time.Time `json:"last_fetch,omitempty" elastic_mapping:"last_fetch: { type: date }"`
	LastCheck   time.Time `json:"last_check,omitempty" elastic_mapping:"last_check: { type: date }"`
	NextCheck   time.Time `json:"next_check,omitempty" elastic_mapping:"next_check: { type: date }"`

	// Snapshot bookkeeping. Note the JSON key "snapshot_simhash" here versus
	// "sim_hash" on Snapshot.SimHash.
	SnapshotVersion  int       `json:"snapshot_version,omitempty"`
	SnapshotID       string    `json:"snapshot_id,omitempty"`
	SnapshotHash     string    `json:"snapshot_hash,omitempty"`
	SnapshotSimHash  string    `json:"snapshot_simhash,omitempty"`
	SnapshotCreated  time.Time `json:"snapshot_created,omitempty" elastic_mapping:"snapshot_created: { type: date }"`
	LastScreenshotID string    `json:"last_screenshot_id,omitempty"`

	// NOTE(review): the JSON key is spelled "pipline_config_id" (missing an
	// "e"). Renaming it would change the serialized format of existing data,
	// so any fix needs a migration plan.
	PipelineConfigID string      `json:"pipline_config_id,omitempty"`
	// HostConfig is a pointer; a nil value is omitted from JSON.
	HostConfig       *HostConfig `json:"host_config,omitempty"`

	// transient properties
	Snapshots     []Snapshot `json:"-"`
	SnapshotCount int        `json:"-"`
}

func GetFailedTasks

func GetFailedTasks(offset int64) (int, []Task, error)

func GetPendingNewFetchTasks

func GetPendingNewFetchTasks(offset int64, size int) (int, []Task, error)

func GetPendingUpdateFetchTasks

func GetPendingUpdateFetchTasks(offset int64) (int, []Task, error)

func GetTask

func GetTask(id string) (Task, error)

func GetTaskByField

func GetTaskByField(k, v string) ([]Task, error)

func GetTaskList

func GetTaskList(from, size int, host string, status int) (int, []Task, error)

func NewTask

func NewTask(url, ref string, depth int, breadth int) *Task

type Url

// Url is a decomposed URL: a Domain, up to three path fields and a file
// extension. It carries no struct tags, so encoding/json uses the Go field
// names. In this listing it is used as the type of FetchTask.Url.
type Url struct {
	Domain Domain

	// NOTE(review): presumably the first three path segments of the URL —
	// confirm against the code that fills these in.
	FirstPath  string
	SecondPath string
	ThirdPath  string

	FileExt string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL