adstxt

package module
v0.0.0-...-3740f12 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 26, 2018 License: MIT Imports: 14 Imported by: 0

README

Crawler

This ads.txt crawler is written to the ads.txt specification found here. It handles redirects, correctly deals with subdomains, and handles common publisher implementation mistakes. The populated adstxt file object contains not only information about the ad paths, but also information about erroneous rows, inline comments, variables, etc.

The file object also contains information about the crawl of the ads.txt, including the root domain, the domain for which the ads.txt is valid, the URL of the ads.txt, the lookup time, and a file checksum.

Finding the correct ads.txt

There is some confusion about which paths to put in an ads.txt file and where that file should be located. For example, many blogging domains serve the global ads.txt file on each blog subdomain. The crawler traverses the URL's subdomains to figure out which domain the ads.txt is valid for. Given myblog.blogdomain.com, the crawler will ask for myblog.blogdomain.com/ads.txt. If an ads.txt is found, the crawler will then request blogdomain.com/ads.txt and compare checksums to see if the files are the same. This allows users of the crawler to avoid the possible n^2 duplication of ads.txt paths on blog domains.

Example Usage:

git clone https://github.com/dustinevan/adstxt.git
cd ./adstxt/app/atcrawl
./atcrawl bloomberg.com/ads.txt
./atcrawl batman.wikia.com/ads.txt

The example binary was built on macOS Sierra 10.12.6 with go1.9.2 darwin/amd64. If you have implementation questions or are interested in how to use this at scale, feel free to contact me: dustinevancurrie@gmail.com

Documentation

Index

Constants

This section is empty.

Variables

View Source
// Client is the package-level HTTP client. Requests time out after ten
// seconds, and redirect chains are restricted: once more than one request has
// already been made, the chain (including the pending request) may involve at
// most two distinct hosts. A leading "www." label is ignored when comparing
// hosts, so a.com and www.a.com count as the same host.
var Client = http.Client{
	CheckRedirect: func(req *http.Request, via []*http.Request) error {
		// The first redirect is always allowed.
		if len(via) <= 1 {
			return nil
		}

		// Collect the distinct hosts seen so far plus the redirect target.
		// TrimPrefix removes only a leading "www." label; a "www." substring
		// elsewhere in the host (e.g. "newww.a.com") must not be altered, or
		// unrelated hosts could collapse into one key.
		hosts := make(map[string]struct{}, len(via)+1)
		hosts[strings.TrimPrefix(req.URL.Host, "www.")] = struct{}{}
		for _, r := range via {
			hosts[strings.TrimPrefix(r.URL.Host, "www.")] = struct{}{}
		}

		if len(hosts) > 2 {
			return fmt.Errorf("illegal redirect chain. %s", hosts)
		}
		return nil
	},
	Timeout: time.Second * 10,
}

Functions

func ErrJoin

func ErrJoin(errs []error, delim string) error

func Get

func Get(domain string) (host string, bytes []byte, err error)

func GetCanonicalAdSystemDomain

func GetCanonicalAdSystemDomain(e string) (string, error)

func Parse

func Parse(b []byte) (rec []Record, lc []LineComment, va []Variable, el []ErrorLine, e error)

func Read

func Read(resp *http.Response) ([]byte, error)

func Request

func Request(domain string) (*http.Response, error)

func SetCanonicalMaps

func SetCanonicalMaps(filename string) error

Types

type AdsystemMap

// AdsystemMap holds two string-to-string lookup tables that are serialized
// under the JSON keys "sys_url_to_canonical_name" and "canonical_name_to_url".
// NOTE(review): judging by the field names, these map an ad system URL to its
// canonical name and a canonical name back to a URL — confirm against the
// code that populates them.
type AdsystemMap struct {
	SysURLToCanonicalName map[string]string `json:"sys_url_to_canonical_name"`
	CanonicalNameToUrl    map[string]string `json:"canonical_name_to_url"`
}

type Domain

// Domain is a host name split into its component parts. The trailing comments
// on each field show the value for the example host sub2.sub1.test.co.jp.
// NOTE(review): ICANN presumably reports whether PublicSuffix is an
// ICANN-managed suffix, as returned by a public-suffix lookup — confirm.
type Domain struct {
	Host         string   // sub2.sub1.test.co.jp
	Root         string   // test.co.jp
	PublicSuffix string   // co.jp
	ICANN        bool     // see PublicSuffix comments
	Subs         []string // [ "sub2" "sub1" ] most specific first
}

func DomainFromString

func DomainFromString(domain string) (*Domain, error)

This supports the sub2.sub1.root.com format. Data in URL format should instead be parsed with url.Parse and then passed to DomainFromURL.

func DomainFromURL

func DomainFromURL(u *url.URL) (*Domain, error)

func (*Domain) ListDomains

func (d *Domain) ListDomains() []string

func (*Domain) String

func (d *Domain) String() string

type ErrorLine

// ErrorLine describes one line of an ads.txt file that could not be parsed.
type ErrorLine struct {
	// Reason the parse failed.
	// NOTE(review): encoding/json marshals an error interface value through
	// its concrete type, so errors without exported fields or a custom
	// marshaller will likely serialize as {} — confirm this is intended.
	Error error `json:"error"`

	// Original, unparsed text of the line
	Line string `json:"line"`

	// Line number of the parse failure
	LineNum int `json:"line_num"`
}

type File

// File is a parsed ads.txt file together with metadata about the request
// that fetched it.
type File struct {
	// The URL location of this adstxt
	URL string `json:"url"`

	// The Root Domain
	RootDomain string `json:"root_domain"`

	// The Subdomain of the Root Domain the adstxt is valid for
	AdstxtDomain string `json:"adstxt_domain"`

	// Valid Exchange/PubID combinations/routes for a certain publisher's bid requests.
	// Note that this field is serialized under the JSON key "adpaths".
	Records []Record `json:"adpaths"`

	// Comments that occupy a full line
	LineComments []LineComment `json:"line_comments,omitempty"`

	// Any line containing a pattern of <VARIABLE>=<VALUE> should be interpreted as a variable
	// declaration.
	Variables []Variable `json:"variables,omitempty"`

	// Lines that could not be parsed, each with the reason for the failure.
	// This field has no json tag, so encoding/json emits it under the key "ErrLines".
	ErrLines []ErrorLine

	// SHA256 checksum of the bytes in the response body
	CheckSum string `json:"checksum"`

	// The time of the adstxt get request
	LookupTime time.Time `json:"lookup_time"`
}

func Crawl

func Crawl(domain string) (*File, error)

func NewFile

func NewFile(b []byte, t time.Time, url, root, adstxtdom string) (*File, error)

func (*File) IsValidSubDomain

func (f *File) IsValidSubDomain(sub string) bool

func (*File) String

func (f *File) String() string

type LineComment

// LineComment is a comment that occupies a full line of an ads.txt file.
type LineComment struct {
	// Comment text
	Text string `json:"text"`

	// Line number the comment was found on, counted after removing empty lines. This is
	// useful for attaching line comment information to records.
	LineNum int `json:"line_num"`
}

type PublisherAccountType

// PublisherAccountType enumerates the possible values of a Record's account
// type field. The zero value is NO_ACCOUNT_TYPE_SPECIFIED.
type PublisherAccountType int
const (
	NO_ACCOUNT_TYPE_SPECIFIED PublisherAccountType = iota
	DIRECT
	RESELLER
	BOTH // some ads.txt files contain duplicate rows for the same pubid with both reseller and direct types; these can be reduced by calling DedupOnAccountType()
	INVALID_ACCOUNT_TYPE
)

func GetAccountType

func GetAccountType(s string) PublisherAccountType

func (PublisherAccountType) MarshalJSON

func (p PublisherAccountType) MarshalJSON() ([]byte, error)

func (PublisherAccountType) String

func (p PublisherAccountType) String() string

type Record

// Record is a single data row of an ads.txt file. The comments on the required and
// optional fields follow the field descriptions of the ads.txt specification.
type Record struct {
	// (Required) The canonical domain name of the SSP, Exchange, Header Wrapper, etc system that
	// bidders connect to. This may be the operational domain of the system, if that is different than the
	// parent corporate domain, to facilitate WHOIS and reverse IP lookups to establish clear ownership of
	// the delegate system. Ideally the SSP or Exchange publishes a document detailing what domain name
	// to use.
	AdSystemDomain string `json:"ad_system_domain"`

	// This field is an attempt to reconcile different ad system domains that mean the same thing. Matching
	// adstxt data with bid request data requires a mapping, but because many adstxt files say the same thing
	// differently, this field attempts to canonize a specific ad system spelling; see disambiguation.go
	CanonicalSystemDomain string `json:"canonical_system_domain"`

	// (Required) The identifier associated with the seller or reseller account within the advertising system in
	// field #1. This must contain the same value used in transactions (i.e. OpenRTB bid requests) in the
	// field specified by the SSP/exchange. Typically, in OpenRTB, this is publisher.id. For OpenDirect it is
	// typically the publisher’s organization ID. ExDomain.
	PublisherID string `json:"publisher_id"`

	// (Required) An enumeration of the type of account. A value of ‘DIRECT’ indicates that the Publisher
	// (content owner) directly controls the account indicated in field #2 on the system in field #1. This
	// tends to mean a direct business contract between the Publisher and the advertising system. A value
	// of ‘RESELLER’ indicates that the Publisher has authorized another entity to control the account
	// indicated in field #2 and resell their ad space via the system in field #1. Other types may be added
	// in the future. Note that this field should be treated as case insensitive when interpreting the data.
	AccountType PublisherAccountType `json:"account_type"`

	// (Optional) An ID that uniquely identifies the advertising system within a certification authority
	// (this ID maps to the entity listed in field #1). A current certification authority is the Trustworthy
	// Accountability Group (aka TAG), and the TAGID would be included here.
	CertAuthorityID string `json:"cert_authority_id,omitempty"`

	// Extension fields are allowed by implementers and their consumers as long as they utilize a
	// distinct final separator field ";" before adding extension data to each record
	Ext []string `json:"ext,omitempty"`

	// Anything after # on a line is considered to be a comment
	Comment string `json:"comment,omitempty"`

	// The line number the record was found on, after removing empty lines. This is useful for
	// attaching line comment information to records.
	LineNum int `json:"line_num"`
}

func ParseRecord

func ParseRecord(line string) (*Record, error)

type Variable

// Variable is a <VARIABLE>=<VALUE> declaration found in an ads.txt file.
type Variable struct {
	Key   string `json:"key"`
	Value string `json:"value"`

	// Line number the variable was found on, counted after removing empty lines. This is
	// useful for attaching line comment information to records.
	LineNum int `json:"line_num"`
}

func ParseVariable

func ParseVariable(line string) (*Variable, error)

Directories

Path Synopsis
app

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL