iascrape

package module
v0.0.0-...-f106840 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 15, 2026 License: BSD-3-Clause Imports: 14 Imported by: 0

README

iascrape

A library used to search and scrape the Internet Archive.

References

  1. Extracting A Large Corpus from the Internet Archive, A Case Study. Code4lib Issue 61, 2025-10-21 https://journal.code4lib.org/articles/18510

Documentation

Index

Constants

View Source
// MAX_RESULTS is an untyped integer constant fixed at 5000.
// NOTE(review): presumably the cap on the number of results a Search gathers
// (compare Search.MaxResults) — confirm against its uses.
const MAX_RESULTS = 5000

Variables

View Source
// DBBucketName is a package-level bucket name; its initial value is "ia".
// NOTE(review): presumably the default for Cache.DBBucketName — confirm in NewCache.
var DBBucketName = "ia"
View Source
// IASCRAPE_DEBUG is a package-level boolean flag; it is false by default.
// NOTE(review): presumably enables debug output when set to true — confirm.
var IASCRAPE_DEBUG = false
View Source
// IASCRAPE_DEGUG_DEPTH is a package-level integer setting; it is 0 by default.
// NOTE(review): "DEGUG" looks like a misspelling of "DEBUG". The exported name
// is left as-is because renaming it would break existing callers.
var IASCRAPE_DEGUG_DEPTH = 0
View Source
// IA_ScrapeBaseURL is the base URL of the Internet Archive scrape search
// endpoint. It already ends in "?", so presumably the query parameters are
// appended directly to it — confirm against the code that builds requests.
var IA_ScrapeBaseURL = "https://archive.org/services/search/v1/scrape?"

Reference: https://journal.code4lib.org/articles/18510
Internet Archive Search API (scrape): https://archive.org/help/aboutsearch.htm

View Source
// ItemBaseUrl is the base URL of the Internet Archive item metadata endpoint.
// NOTE(review): this uses plain http while IA_ScrapeBaseURL uses https —
// confirm that the difference is intentional.
var ItemBaseUrl = "http://archive.org/metadata/"

Alternative HTTPS form (not the active declaration): var ItemBaseUrl = "https://archive.org/metadata/"

Functions

func HeadUrl

func HeadUrl(client *http.Client, u string, retry int, delay time.Duration) error

func MakeMetadataItemFieldMap

func MakeMetadataItemFieldMap(md *ItemMetadata) map[string]*[]string

func NewClient

func NewClient() *http.Client

Types

type Backoff

// Backoff is a function type that receives a *RequestStats and returns a
// time.Duration.
// NOTE(review): presumably the returned duration is how long to wait before
// the next request attempt — confirm at the call site.
type Backoff func(*RequestStats) time.Duration

type Cache

// Cache is the handle returned by NewCache; its Get, Put and Delete methods
// work on entries addressed by a string key.
// NOTE(review): the storage backend is not visible in this listing — the
// struct also has unexported fields that are omitted here.
type Cache struct {
	// DBBucketName is the bucket name held by this cache instance.
	// NOTE(review): presumably initialised from the package-level DBBucketName — confirm in NewCache.
	DBBucketName string
	// contains filtered or unexported fields
}

func NewCache

func NewCache(dbFileName string) (*Cache, error)

func (*Cache) Delete

func (c *Cache) Delete(key string) error

func (*Cache) Get

func (c *Cache) Get(url string) ([]byte, error)

func (*Cache) Put

func (c *Cache) Put(url string, body []byte) error

type File

// File is one element of the "files" array of ItemTopLevelMetadata.
//
// Every field decoded from JSON except Original_Raw is a string. Original_Raw
// is left untyped (any) and takes whatever JSON value sits under "original".
// Original and TrackOrder carry the tag "-" and are therefore skipped by
// encoding/json in both directions.
// NOTE(review): Original is presumably derived from Original_Raw after
// decoding — confirm in the decoding code.
type File struct {
	Format       string   `json:"format"`
	MD5          string   `json:"md5"`
	Name         string   `json:"name"`
	Size         string   `json:"size"`
	Title        string   `json:"title"`
	Original     []string `json:"-"`
	Original_Raw any      `json:"original"`
	Length       string   `json:"length"`
	TrackOrder   int      `json:"-"` // not read from or written to JSON
}

type ItemMetadata

// ItemMetadata is the value stored under the "metadata" key of
// ItemTopLevelMetadata.
//
// It embeds ItemMetadata_Raw, whose untyped fields receive the JSON values
// directly. The slice fields declared here carry the tag "-" and are skipped
// by encoding/json; CanonicalYear has no tag.
// NOTE(review): the "-" fields are presumably filled from their *_Raw
// counterparts after decoding, and Keywords from Keywords_CommaSeparated —
// confirm in the decoding code.
type ItemMetadata struct {
	ItemMetadata_Raw
	AddedDate   string   `json:"addeddate"`
	Collections []string `json:"-"`

	Condition               string   `json:"condition"`
	Contributor             string   `json:"contributor"`
	Creators                []string `json:"-"`
	Dates                   []string `json:"-"`
	Descriptions            []string `json:"-"`
	Genres                  []string `json:"-"`
	Identifier              string   `json:"identifier"`
	Keywords_CommaSeparated string   `json:"keywords"`
	Keywords                []string `json:"-"`
	Languages               []string `json:"-"`
	MediaType               string   `json:"mediatype"` // key is "mediatype" (no underscore), matching SearchItem.MediaType
	Notes                   []string `json:"-"`
	LicenseUrl              string   `json:"licenseurl"`
	PublicDate              string   `json:"publicdate"`
	Publishers              []string `json:"-"`
	PublisherCatalogNumbers []string `json:"-"`
	Scanners                []string `json:"-"`
	Source                  []string `json:"-"`
	Subjects                []string `json:"-"`
	Titles                  []string `json:"-"`
	Uploaders               []string `json:"-"`
	Years                   []string `json:"-"`
	CanonicalYear           int
	// contains filtered or unexported fields
}

type ItemMetadata_Raw

// ItemMetadata_Raw collects the metadata keys that are decoded without a fixed
// Go type. Each field is declared as any, so encoding/json stores whatever
// JSON value it finds under the tagged key (string, array, number, ...).
// The struct is embedded in ItemMetadata.
type ItemMetadata_Raw struct {
	CollectionCatalogNumber_Raw any `json:"collection-catalog-number"`
	Collection_Raw              any `json:"collection"`
	Creator_Raw                 any `json:"creator"`
	Date_Raw                    any `json:"date"`
	Description_Raw             any `json:"description"`
	Genre_Raw                   any `json:"genre"`
	Language_Raw                any `json:"language"`
	Notes_Raw                   any `json:"notes"`
	PublisherCatalogNumber_Raw  any `json:"publisher-catalog-number"`
	Publisher_Raw               any `json:"publisher"`
	Scanner_Raw                 any `json:"scanner"`
	Source_Raw                  any `json:"source"`
	Subject_Raw                 any `json:"subject"`
	Title_Raw                   any `json:"title"`
	Uploader_Raw                any `json:"uploader"`
	Year_Raw                    any `json:"year"`
}

type ItemTopLevelMetadata

// ItemTopLevelMetadata is the outermost object of an item record and the type
// GetItem returns a pointer to. It carries the file list ("files"), the
// descriptive metadata ("metadata"), the roles ("roles") and several
// server/size/timestamp values.
//
// Segments_Raw is untyped (any) and receives the JSON "segments" value as-is;
// Segments carries the tag "-" and is skipped by encoding/json.
// NOTE(review): Segments is presumably derived from Segments_Raw after
// decoding — confirm in GetItem.
type ItemTopLevelMetadata struct {
	Created          int64        `json:"created"`
	D1               string       `json:"d1"`
	Date             string       `json:"date"`
	Dir              string       `json:"dir"`
	Files            []File       `json:"files"`
	Files_Count      int32        `json:"files_count"`
	ItemLastUpdated  int64        `json:"item_last_updated"`
	ItemSize         int64        `json:"item_size"`
	Metadata         ItemMetadata `json:"metadata"`
	Roles            Role         `json:"roles"`
	Segments         []string     `json:"-"`
	Segments_Raw     any          `json:"segments"`
	Server           string       `json:"server"`
	Workable_Servers []string     `json:"workable_servers"`
	Uniq             int64        `json:"uniq"`
}

func GetItem

func GetItem(id string, client *http.Client, cache *Cache, verbose bool) (*ItemTopLevelMetadata, error)

type RequestStats

// RequestStats exposes no exported fields. A pointer to it is the sole
// argument of a Backoff function.
// NOTE(review): what it records is not visible in this listing.
type RequestStats struct {
	// contains filtered or unexported fields
}

type Role

// Role is the value stored under the "roles" key of ItemTopLevelMetadata.
//
// Performer_Raw is untyped (any) and receives the JSON "performer" value
// as-is. Performers has no struct tag, so encoding/json treats it as a regular
// field named "Performers".
// NOTE(review): Performers is presumably derived from Performer_Raw after
// decoding — confirm in the decoding code.
type Role struct {
	Performer_Raw any `json:"performer"`
	Performers    []string
}
// Search holds the settings for one search. Its Execute method returns the
// results as []SearchItem and its Total method returns an int64.
// NOTE(review): the field meanings below are inferred from names only —
// confirm against Execute: ChunkSize (results per request?), Limit and Offset
// (paging?), MaxResults (overall cap, cf. MAX_RESULTS?), Retries (retry count
// per request?), Verbose (extra output?).
type Search struct {
	ChunkSize  int
	Client     *http.Client
	Limit      int64
	MaxResults int64
	Offset     int64
	Query      string
	Retries    int
	Verbose    bool
	// contains filtered or unexported fields
}

func (*Search) Execute

func (s *Search) Execute() ([]SearchItem, error)

func (*Search) Total

func (s *Search) Total() (int64, error)

type SearchItem

// SearchItem is one element of the slice returned by Search.Execute.
//
// Fields ending in _Raw (and Description) are untyped (any): encoding/json
// stores whatever JSON value appears under the tagged key. Each _Raw field is
// followed by an untagged, typed companion ([]string or []int).
// NOTE(review): the typed companions are presumably filled from the _Raw
// values after decoding — confirm in Execute. Because they carry no tag,
// encoding/json still treats them as regular fields under their Go names.
type SearchItem struct {
	AddedDate              string   `json:"addeddate"`
	AvgRating_Raw          any      `json:"avg_rating"`
	AvgRating              []int
	BTIH                   string   `json:"btih"`
	BackupLocation_Raw     any      `json:"backup_location"`
	BackupLocation         []string
	Collection             []string `json:"collection"`
	CollectionsOrdered     string   `json:"collections_ordered"`
	CurateDate             string   `json:"curatedate"`
	CurateNote_Raw         any      `json:"curatenote"`
	CurateNote             []string
	CurateState            string   `json:"curatestate"`
	Curation_Raw           any      `json:"curation"`
	Curation               []string
	Curator                string   `json:"curator"`
	Date_Raw               any      `json:"date"`
	Date                   []string
	Description            any      `json:"description"`
	Downloads              int      `json:"downloads"`
	ExternalMetadataUpdate string   `json:"external_metadata_update"`
	FilesCount             int      `json:"files_count"`
	Format_Raw             any      `json:"format"`
	Format                 []string
	Identifier             string   `json:"identifier"`
	IndexDate              string   `json:"indexdate"`
	ItemSize               int      `json:"item_size"`
	LicenseURL             string   `json:"licenseurl"`
	ListMemberships_Raw    any      `json:"list_memberships"`
	ListMemberships        []string
	MatchDateAoustid       string   `json:"match_date_acoustid"` // NOTE(review): "Aoustid" is likely a typo for "Acoustid" (see tag); name kept for compatibility
	MediaType              string   `json:"mediatype"`
	Month                  int      `json:"month"`
	NoArchiveTorrent       string   `json:"noarchivetorrent"`
	NumFavorites           int      `json:"num_favorites"`
	OaiUpdateDate_Raw      any      `json:"oai_updatedate"`
	OaiUpdateDate          []string
	PrimaryCollection      string   `json:"primary_collection"`
	PublicDate             string   `json:"publicdate"`
	ReportedServer         string   `json:"reported_server"`
	ReviewBody_Raw         any      `json:"reviewbody"`
	ReviewBody             []string
	ReviewData             []string `json:"review_data"`
	Reviewer_Raw           any      `json:"reviewer"`
	Reviewer               []string
	ReviewerItemName_Raw   any      `json:"reviewer_itemname"`
	ReviewerItemname       []string
	Scanner_Raw            any      `json:"scanner"`
	Scanner                []string
	Subject_Raw            any      `json:"subject"`
	Subject                []string
	SubjectCount           int      `json:"subject_count"`
	Stars_Raw              any      `json:"stars"`
	Stars                  []int
	Title_Raw              any      `json:"title"`
	Title                  []string
	Week                   int      `json:"week"`
	Year_Raw               any      `json:"year"`
	Year                   []int
}

type StringFields

// StringFields exposes no exported fields.
// NOTE(review): its purpose is not visible in this listing — document it once
// the unexported fields and their users have been checked.
type StringFields struct {
	// contains filtered or unexported fields
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL