pipermail

package
v0.0.0-...-b175f30 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 23, 2022 License: Apache-2.0 Imports: 30 Imported by: 0

Documentation

Index

Constants

View Source
const (
	// ModMboxThreadStr is the "/thread" path fragment.
	// NOTE(review): presumably the URL suffix of mod_mbox thread pages - confirm against usage.
	ModMboxThreadStr = "/thread"
	// Pipermail is the name of the pipermail datasource.
	Pipermail = "pipermail"
	// PiperBackendVersion is the version string of the pipermail backend.
	PiperBackendVersion = "0.0.1"
	// MessageDateField is the name of the field holding the message date ("date").
	MessageDateField = "date"
	// Message is the constant string "message".
	// NOTE(review): presumably the item category reported for fetched items - confirm.
	Message = "message"
	// MessageIDField is the name of the message identifier field ("Message-ID").
	MessageIDField = "Message-ID"
	// Unknown is the placeholder string "Unknown".
	Unknown = "Unknown"
	// MaxConcurrentRequests is a numeric limit of 100000.
	// NOTE(review): presumably caps the number of in-flight requests - confirm where it is enforced.
	MaxConcurrentRequests = 100000
)
View Source
const (
	// MessageReceivedField is the name of the "received" message field.
	MessageReceivedField = "received"
	// MaxMessageBodyLength is the message body length limit (1000).
	// NOTE(review): unit (bytes vs. characters) is not visible here - confirm.
	MaxMessageBodyLength = 1000
	// DropXFields - drop fields starting with X- - to avoid ES 1000 fields limit
	DropXFields = true
	// MaxMessageProperties maximum properties that can be set on the message object
	MaxMessageProperties = 255
	// ContentType - the "Content-Type" key in its canonical (capitalized) spelling
	ContentType = "Content-Type"
	// LowerContentType - the same key in all lower case ("content-type")
	LowerContentType = "content-type"
)

Variables

View Source
var (
	// CompressedTypes lists the file extensions of compressed/archive formats.
	CompressedTypes = []string{".gz", ".bz2", ".zip", ".tar", ".tar.gz", ".tar.bz2", ".tgz", ".tbz"}
	// AcceptedTypes lists the plain (uncompressed) file extensions: ".mbox" and ".txt".
	AcceptedTypes = []string{".mbox", ".txt"}
	// CombinedTypes is declared without an initial value (nil until assigned).
	// NOTE(review): presumably filled elsewhere with CompressedTypes + AcceptedTypes - confirm.
	CombinedTypes []string
	// MONTHS maps full English month names ("January".."December") to their
	// 1-based month number.
	MONTHS = map[string]int{"January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6, "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12}
	// DefaultDateTime is 1970-01-01 00:00:00 UTC (the Unix epoch).
	DefaultDateTime = time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC)
	// MessageSeparator is the byte sequence "\nFrom" (a newline followed by "From").
	// NOTE(review): presumably used to split an mbox into individual messages - confirm.
	MessageSeparator = []byte("\nFrom")
	// PiperRawMapping is an index mapping (JSON) with dynamic mapping enabled,
	// metadata__updated_on typed as date, and data.body declared with
	// dynamic=false and no properties.
	PiperRawMapping = []byte(`{"mappings":{"dynamic":true,"properties":{"metadata__updated_on":{"type":"date"},"data":{"properties":{"body":{"dynamic":false,"properties":{}}}}}}}`)
	// PiperRichMapping is an index mapping (JSON) made of three dynamic templates:
	// strings are mapped to keyword, longs to float, and dates to date with the
	// format strict_date_optional_time||epoch_millis.
	PiperRichMapping = []byte(`{"mappings":{"dynamic_templates":[{"notanalyzed":{"match":"*","match_mapping_type":"string","mapping":{"type":"keyword"}}},{"int_to_float":{"match":"*","match_mapping_type":"long","mapping":{"type":"float"}}},{"formatdate":{"match":"*","match_mapping_type":"date","mapping":{"format":"strict_date_optional_time||epoch_millis","type":"date"}}}]}}`)
	// EmailObfuscationPatterns lists the substrings " at ", "_at_" and " en ".
	// NOTE(review): presumably the stand-ins for "@" found in obfuscated addresses - confirm.
	EmailObfuscationPatterns = []string{" at ", "_at_", " en "}
	// ArchiveDownloadsPath is $HOME/.perceval/mailinglists/, built from the HOME
	// environment variable when the package is initialized. If HOME is unset the
	// value is "/.perceval/mailinglists/".
	ArchiveDownloadsPath = strings.TrimSpace(os.Getenv("HOME") + "/.perceval/mailinglists/")
	// DumpsPath is $HOME/.perceval/dumps/, built from the HOME environment
	// variable when the package is initialized. If HOME is unset the value is
	// "/.perceval/dumps/".
	DumpsPath = strings.TrimSpace(os.Getenv("HOME") + "/.perceval/dumps/")
)
View Source
var (
	// LowerDayNames - set of lower case 3 letter US day names ("mon".."sun")
	LowerDayNames = map[string]struct{}{
		"mon": {},
		"tue": {},
		"wed": {},
		"thu": {},
		"fri": {},
		"sat": {},
		"sun": {},
	}
	// LowerMonthNames - map lower case 3 letter month names to their
	// capitalized 3 letter form ("jan" -> "Jan")
	LowerMonthNames = map[string]string{
		"jan": "Jan",
		"feb": "Feb",
		"mar": "Mar",
		"apr": "Apr",
		"may": "May",
		"jun": "Jun",
		"jul": "Jul",
		"aug": "Aug",
		"sep": "Sep",
		"oct": "Oct",
		"nov": "Nov",
		"dec": "Dec",
	}
	// LowerFullMonthNames - map lower case full month names to the
	// capitalized 3 letter form ("january" -> "Jan")
	LowerFullMonthNames = map[string]string{
		"january":   "Jan",
		"february":  "Feb",
		"march":     "Mar",
		"april":     "Apr",
		"may":       "May",
		"june":      "Jun",
		"july":      "Jul",
		"august":    "Aug",
		"september": "Sep",
		"october":   "Oct",
		"november":  "Nov",
		"december":  "Dec",
	}
	// SpacesRE - match 1 or more whitespace characters (\s, not only spaces)
	SpacesRE = regexp.MustCompile(`\s+`)
	// TZOffsetRE - time zone offset that comes after +0... +1... -0... -1...
	// Must start with exactly 3 digits, followed either by the end of the
	// string or by whitespace and then anything
	TZOffsetRE = regexp.MustCompile(`^(\d{3})(\s+.*$|$)`)
	// MessageLineSeparator - message line separator (a single "\n")
	MessageLineSeparator = []byte("\n")
)
View Source
var (
	// PipermailRawMapping - Pipermail raw index mapping: dynamic mapping enabled,
	// metadata__updated_on typed as date, and data.description /
	// data.full_description typed as indexed text
	PipermailRawMapping = []byte(`{"mappings": {"dynamic":true,"properties":{"metadata__updated_on":{"type":"date"},"data":{"properties":{"description":{"type":"text","index":true},"full_description":{"type":"text","index":true}}}}}}`)
)

Functions

func DateTimeToUTC

func DateTimeToUTC(date string) time.Time

DateTimeToUTC ...

func DownloadFile

func DownloadFile(url, filepath string) error

DownloadFile will download a url to a local file. It's efficient because it will write as it downloads and not load the whole file into memory.

func ParseDateFromFilePath

func ParseDateFromFilePath(path string) time.Time

ParseDateFromFilePath ...

func ParseMBoxMsg

func ParseMBoxMsg(Debug int, groupName string, msg []byte) (item map[string]interface{}, valid, warn bool)

ParseMBoxMsg - parse a raw MBox message into object to be inserted into raw ES

func TrimDots

func TrimDots(s string) string

TrimDots ...

func TrimFirstDash

func TrimFirstDash(s string) (year, month string)

TrimFirstDash ...

func TrimFirstDot

func TrimFirstDot(s string) string

TrimFirstDot ...

Types

type AffiliationClient

// AffiliationClient manages user identity.
type AffiliationClient interface {
	// GetIdentityByUser looks up an identity by a key/value pair.
	GetIdentityByUser(key string, value string) (*affiliation.AffIdentity, error)
	// AddIdentity adds the given identity.
	// NOTE(review): the bool result presumably reports success - confirm.
	AddIdentity(identity *affiliation.Identity) bool
	// GetOrganizations returns the enrollments for a uuid within a project slug.
	GetOrganizations(uuid string, projectSlug string) *[]affiliation.Enrollment
}

AffiliationClient manages user identity

type ESClientProvider

// ESClientProvider is used in connecting to the ES client server.
// Every method takes or returns raw JSON bytes except Get, which fills the
// caller-supplied result, and GetStat, which returns a time.Time.
type ESClientProvider interface {
	Add(index string, documentID string, body []byte) ([]byte, error)
	CreateIndex(index string, body []byte) ([]byte, error)
	Bulk(body []byte) ([]byte, error)
	Get(index string, query map[string]interface{}, result interface{}) (err error)
	GetStat(index string, field string, aggType string, mustConditions []map[string]interface{}, mustNotConditions []map[string]interface{}) (result time.Time, err error)
	BulkInsert(data []elastic.BulkData) ([]byte, error)
}

ESClientProvider used in connecting to ES Client server

type EnrichedMessage

// EnrichedMessage represents a pipermail enriched message.
// The json tags give the key each field is serialized under; note that
// MessageID keeps the header-style key "Message-ID" and IsPipermailMessage is
// an int rather than a bool.
type EnrichedMessage struct {
	ID                  string    `json:"id"`
	TZ                  float64   `json:"tz"`
	MessageID           string    `json:"Message-ID"`
	UUID                string    `json:"uuid"`
	AuthorName          string    `json:"author_name"`
	Root                bool      `json:"root"`
	AuthorOrgName       string    `json:"author_org_name"`
	AuthorBot           bool      `json:"author_bot"`
	BodyExtract         string    `json:"body_extract"`
	AuthorID            string    `json:"author_id"`
	SubjectAnalyzed     string    `json:"subject_analyzed"`
	Project             string    `json:"project"`
	MboxAuthorDomain    string    `json:"mbox_author_domain"`
	Date                time.Time `json:"date"`
	IsPipermailMessage  int       `json:"is_pipermail_message"`
	List                string    `json:"list"`
	AuthorUUID          string    `json:"author_uuid"`
	AuthorMultiOrgNames []string  `json:"author_multi_org_names"`
	Origin              string    `json:"origin"`
	Size                int64     `json:"size"`
	Tag                 string    `json:"tag"`
	Subject             string    `json:"subject"`
	FromID              string    `json:"from_id"`
	EmailDate           time.Time `json:"email_date"`
	MetadataTimestamp   time.Time `json:"metadata__timestamp"`
	MetadataBackendName string    `json:"metadata__backend_name"`
	MetadataUpdatedOn   time.Time `json:"metadata__updated_on"`
	MetadataEnrichedOn  time.Time `json:"metadata__enriched_on"`
	ProjectSlug         string    `json:"project_slug"`
	ChangedAt           time.Time `json:"changed_at"`
	GroupName           string    `json:"group_name"`
	Slug                string    `json:"slug"`
	References          string    `json:"references"`
}

EnrichedMessage represents piper mail enriched message

type Enricher

// Enricher contains the pipermail datasource enrich logic.
// It talks to Elasticsearch through the ESClientProvider interface.
type Enricher struct {
	DSName                string // Datasource will be used as key for ES
	ElasticSearchProvider ESClientProvider
	BackendVersion        string
	// contains filtered or unexported fields
}

Enricher contains pipermail datasource enrich logic

func NewEnricher

func NewEnricher(backendVersion string, esClientProvider ESClientProvider, affiliationsClientProvider *affiliation.Affiliation) *Enricher

NewEnricher initiates a new Enricher

func (*Enricher) EnrichMessage

func (e *Enricher) EnrichMessage(rawMessage *RawMessage, now time.Time) (*EnrichedMessage, error)

EnrichMessage enriches raw message

func (*Enricher) FormatTimestampString

func (e *Enricher) FormatTimestampString(str string) (*time.Time, error)

FormatTimestampString returns a formatted RFC 3339 Datetime string

func (*Enricher) GetEmailDomain

func (e *Enricher) GetEmailDomain(email string) string

GetEmailDomain ...

func (*Enricher) GetEmailUsername

func (e *Enricher) GetEmailUsername(email string) string

GetEmailUsername ...

func (*Enricher) GetUserName

func (e *Enricher) GetUserName(rawMailString string) (username string)

GetUserName ...

func (*Enricher) HandleMapping

func (e *Enricher) HandleMapping(index string) error

HandleMapping creates rich mapping

func (*Enricher) HandleObfuscatedEmail

func (e *Enricher) HandleObfuscatedEmail(rawMailString string) (email string)

HandleObfuscatedEmail ...

func (*Enricher) IsValidEmail

func (e *Enricher) IsValidEmail(rawMailString string) bool

IsValidEmail validates email string

func (*Enricher) RemoveSpecialCharactersFromString

func (e *Enricher) RemoveSpecialCharactersFromString(s string) (val *string)

RemoveSpecialCharactersFromString ...

type Fetcher

// Fetcher contains the pipermail datasource fetch logic.
// Unlike Enricher, it holds concrete *http.ClientProvider and
// *elastic.ClientProvider values rather than interfaces.
// NOTE(review): "http" here is presumably a project package, not net/http
// (which has no ClientProvider type) - confirm against the imports.
type Fetcher struct {
	DSName                string
	IncludeArchived       bool
	HTTPClientProvider    *http.ClientProvider
	ElasticSearchProvider *elastic.ClientProvider
	BackendVersion        string
	Debug                 int
	DateFrom              time.Time
}

Fetcher contains piper mail datasource fetch logic

func NewFetcher

func NewFetcher(params *Params, httpClientProvider *http.ClientProvider, esClientProvider *elastic.ClientProvider) *Fetcher

NewFetcher initiates a new pipermail fetcher

func (*Fetcher) AddMetadata

func (f *Fetcher) AddMetadata(msg interface{}, endpoint, slug, groupName string) *RawMessage

AddMetadata - add metadata to the raw message

func (*Fetcher) ElasticRawMapping

func (f *Fetcher) ElasticRawMapping() []byte

ElasticRawMapping - Raw index mapping definition

func (*Fetcher) ElasticRichMapping

func (f *Fetcher) ElasticRichMapping() []byte

ElasticRichMapping - Rich index mapping definition

func (*Fetcher) Fetch

func (f *Fetcher) Fetch(url string, fromDate *time.Time) (map[string]string, error)

Fetch the mbox files from the remote archiver.

Stores the archives in the path given during the initialization of this object. Archives that do not have a valid extension are ignored.

Pipermail archive file names contain the date of the archive, following the schema year-month. When fromDate is set, only the mboxes whose year and month are equal to or after that date are returned.

fromDate: fetch archives that store messages equal to or after the given date; only year and month values are compared

returns a map of links and their paths of the fetched archives

func (*Fetcher) FetchItem

func (f *Fetcher) FetchItem(slug, groupName, endpoint string, fromDate time.Time, limit int, now time.Time) ([]*RawMessage, error)

FetchItem extracts data from archives

func (*Fetcher) Find

func (f *Fetcher) Find(slice []string, val string) (bool, string)

Find takes a slice and looks for an element in it. It returns true if the element is found and false otherwise.

func (*Fetcher) GetLastDate

func (f *Fetcher) GetLastDate(ESIndex string, now time.Time) (time.Time, error)

GetLastDate gets fetching lastDate

func (*Fetcher) HandleMapping

func (f *Fetcher) HandleMapping(index string) error

HandleMapping updates piper mail raw mapping

func (*Fetcher) ItemCategory

func (f *Fetcher) ItemCategory(item interface{}) string

ItemCategory - return the category of an item

func (*Fetcher) ItemID

func (f *Fetcher) ItemID(item interface{}) string

ItemID - return unique identifier for an item

func (*Fetcher) ItemUpdatedOn

func (f *Fetcher) ItemUpdatedOn(item interface{}) time.Time

ItemUpdatedOn - return updated on date for an item

func (f *Fetcher) ParseArchiveLinks(archivesURL string, fromDate *time.Time) ([]string, error)

ParseArchiveLinks scrapes the contents of a given url to extract compressed files download links

func (*Fetcher) Query

func (f *Fetcher) Query(index string, query map[string]interface{}) (*RawHits, error)

Query queries saved raw data from ES

type HTTPClientProvider

// HTTPClientProvider is used in connecting to a remote http server.
type HTTPClientProvider interface {
	// Request takes a url, method, headers, body and params, and returns the
	// response status code, the response body and an error.
	Request(url string, method string, header map[string]string, body []byte, params map[string]string) (statusCode int, resBody []byte, err error)
}

HTTPClientProvider used in connecting to remote http server

type HitSource

// HitSource is the document _source data: the "id" and "changed_at" keys.
type HitSource struct {
	ID        string    `json:"id"`
	ChangedAt time.Time `json:"changed_at"`
}

HitSource is the document _source data

type Hits

// Hits is the list of NestedHits found under the "hits" key of a result.
type Hits struct {
	Hits []NestedHits `json:"hits"`
}

Hits result

type Manager

// Manager describes the pipermail manager.
// Most exported fields mirror a NewManager parameter. GroupName, HTTPTimeout,
// MaxWorkers and NumberOfRawMessages have no matching NewManager parameter.
// NOTE(review): how those four fields get their values is not visible here - confirm.
type Manager struct {
	Endpoint               string
	Slug                   string
	GroupName              string
	SHConnString           string
	FetcherBackendVersion  string
	EnricherBackendVersion string
	Fetch                  bool
	Enrich                 bool
	ESUrl                  string
	ESUsername             string
	ESPassword             string
	ESIndex                string
	FromDate               *time.Time
	HTTPTimeout            time.Duration
	Project                string
	FetchSize              int
	EnrichSize             int
	AffBaseURL             string
	ESCacheURL             string
	ESCacheUsername        string
	ESCachePassword        string
	AuthGrantType          string
	AuthClientID           string
	AuthClientSecret       string
	AuthAudience           string
	Auth0URL               string
	Environment            string
	WebHookURL             string
	MaxWorkers             int
	NumberOfRawMessages    int
	// contains filtered or unexported fields
}

Manager describes piper mail manager

func NewManager

func NewManager(endPoint, slug, shConnStr, fetcherBackendVersion, enricherBackendVersion string, fetch bool, enrich bool, eSUrl string, esUser string, esPassword string, esIndex string, fromDate *time.Time, project string, fetchSize int, enrichSize int, affBaseURL, esCacheURL, esCacheUsername, esCachePassword, authGrantType, authClientID, authClientSecret, authAudience, auth0URL, env, webHookURL string) (*Manager, error)

NewManager initiates piper mail manager instance

func (*Manager) AddTask

func (m *Manager) AddTask(task func())

AddTask adds task to worker pool

func (*Manager) Sync

func (m *Manager) Sync() error

Sync runs piper mail fetch and enrich according to passed parameters

type MessageSearchFields

// MessageSearchFields holds the "name" and "item_id" keys; RawMessage embeds it
// under "search_fields".
type MessageSearchFields struct {
	Name   string `json:"name"`
	ItemID string `json:"item_id"`
}

MessageSearchFields ...

type NHits

// NHits is the list of NestedRawHits found under the "hits" key of a result.
type NHits struct {
	Hits []NestedRawHits `json:"hits"`
}

NHits result

type NestedHits

// NestedHits is the actual hit data: the document "_id" and its "_source"
// decoded as a HitSource.
type NestedHits struct {
	ID     string    `json:"_id"`
	Source HitSource `json:"_source"`
}

NestedHits is the actual hit data

type NestedRawHits

// NestedRawHits is the actual hit data: the document "_id" and its "_source"
// decoded as a RawMessage.
type NestedRawHits struct {
	ID     string     `json:"_id"`
	Source RawMessage `json:"_source"`
}

NestedRawHits is the actual hit data

type Params

// Params holds the required parameters for the pipermail fetcher; a *Params is
// the first argument of NewFetcher.
type Params struct {
	FromDate       time.Time
	BackendVersion string
	Project        string
	Debug          int
	ProjectSlug    string
	GroupName      string
}

Params required parameters for piper mail fetcher

type RawHits

// RawHits is a query result whose top-level "hits" key holds an NHits value;
// it is the type returned by Fetcher.Query.
type RawHits struct {
	Hits NHits `json:"hits"`
}

RawHits result

type RawMessage

// RawMessage represents a pipermail raw message: the parsed message payload
// (Data) plus metadata about it.
// NOTE(review): UpdatedOn and Timestamp are float64 while the Metadata*
// counterparts are time.Time - presumably the floats are Unix epoch values; confirm.
type RawMessage struct {
	BackendVersion    string               `json:"backend_version"`
	Data              *RawMessageData      `json:"data"`
	Tag               string               `json:"tag"`
	UUID              string               `json:"uuid"`
	SearchFields      *MessageSearchFields `json:"search_fields"`
	Origin            string               `json:"origin"`
	UpdatedOn         float64              `json:"updated_on"`
	MetadataUpdatedOn time.Time            `json:"metadata__updated_on"`
	BackendName       string               `json:"backend_name"`
	MetadataTimestamp time.Time            `json:"metadata__timestamp"`
	Timestamp         float64              `json:"timestamp"`
	Category          string               `json:"category"`
	ProjectSlug       string               `json:"project_slug"`
	GroupName         string               `json:"group_name"`
	Project           string               `json:"project"`
	ChangedAt         time.Time            `json:"changed_at"`
}

RawMessage represents piper mail raw message

type RawMessageData

// RawMessageData is the payload stored under RawMessage.Data.
// Header-like fields keep their header-style JSON keys ("Content-Type",
// "Message-ID", ...), and the MBox-* keys carry per-message statistics and
// validity flags.
type RawMessageData struct {
	ContentType     string `json:"Content-Type"`
	Date            string `json:"Date"`
	From            string `json:"From"`
	InReplyTo       string `json:"In-Reply-To"`
	MboxByteLength  int64  `json:"MBox-Bytes-Length"`
	MboxNBodies     int    `json:"MBox-N-Bodies"`
	MboxNLines      int64  `json:"MBox-N-Lines"`
	MboxProjectName string `json:"MBox-Project-Name"`
	MboxValid       bool   `json:"MBox-Valid"`
	MboxWarn        bool   `json:"MBox-Warn"`
	MessageID       string `json:"Message-ID"`
	References      string `json:"References"`
	Subject         string `json:"Subject"`
	// Data nests the body parts as data.text.plain[], each element holding
	// one "data" string.
	Data            struct {
		Text struct {
			Plain []struct {
				Data string `json:"data"`
			} `json:"plain"`
		} `json:"text"`
	} `json:"data"`
	// Date is kept as a string; DateInTZ and DateTZ are separate fields.
	// NOTE(review): presumably the date in its original zone and the zone
	// offset as a number - confirm units against ParseMBoxMsg.
	DateInTZ string  `json:"date_in_tz"`
	DateTZ   float64 `json:"date_tz"`
}

RawMessageData ...

type TopHits

// TopHits is a query result whose top-level "hits" key holds a Hits value.
type TopHits struct {
	Hits Hits `json:"hits"`
}

TopHits result

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL