ccrawl

package

v0.1.0 (Latest) Published: Jun 13, 2026 License: Apache-2.0 Imports: 28 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/tamnd/ccrawl-cli

Links

Open Source Insights

Documentation ¶

Overview ¶

Package ccrawl is a Go library for working with Common Crawl data: the collection list, the CDX URL index, WARC/WAT/WET archive files, the columnar Parquet index, CC-NEWS, and the host/domain ranks. It is the engine behind the ccrawl command line tool but is usable on its own.

Index ¶

Constants
Variables
func CDXNumPages(ctx context.Context, h *HTTPClient, crawlID string, q CDXQuery) (int, error)
func CDXStream(ctx context.Context, h *HTTPClient, crawlID string, q CDXQuery, ...) error
func CanonicalURL(raw string) string
func ColumnarParquetURLs(ctx context.Context, h *HTTPClient, cache *Cache, crawlID, subset string, ...) ([]string, error)
func ColumnarSource(crawlID, subset string, src Source) string
func ConfigDir() string
func DownloadFiles(ctx context.Context, h *HTTPClient, src Source, paths []string, ...) error
func DuckDBAvailable() bool
func ExtractMarkdown(body []byte) (string, error)
func ExtractText(body []byte) string
func ExtractTitle(body []byte) string
func FetchPaths(ctx context.Context, h *HTTPClient, cache *Cache, crawlID, kind string) ([]string, error)
func FileURL(path string, src Source) string
func HTTPBody(block []byte) []byte
func HTTPHeaders(block []byte) []byte
func HTTPSURL(path string) string
func HostOf(raw string) string
func InferMatchType(pattern string) (cleanURL, matchType string)
func IterateWARC(r io.Reader, fn func(WARCRecord) error) error
func IterateWAT(r io.Reader, crawlID string, fn func(WATRecord) error) error
func IterateWET(r io.Reader, crawlID string, fn func(WETRecord) error) error
func LibraryDir() string
func ParquetListLiteral(urls []string) string
func ResolveCrawl(ctx context.Context, h *HTTPClient, cache *Cache, ref string) (string, error)
func RunColumnarDuckDB(ctx context.Context, sql string, emit func(map[string]any) error) error
func RunDuckDBJSON(ctx context.Context, dbPath, sql string, emit func(map[string]any) error) error
func SURT(raw string) string
func StreamPaths(ctx context.Context, h *HTTPClient, crawlID, kind string, ...) error
type CDXQuery
type CDXRecord
- func CDXSearch(ctx context.Context, h *HTTPClient, crawlID string, q CDXQuery) ([]CDXRecord, error)
- func (r CDXRecord) Location() Location
- func (r CDXRecord) Time() time.Time
type Cache
- func NewCache(dir string, enabled bool) *Cache
- func (c *Cache) Clear() (int, error)
- func (c *Cache) Dir() string
- func (c *Cache) Get(key string, ttl time.Duration) ([]byte, bool)
- func (c *Cache) Put(key string, data []byte)
type ColumnarQuery
- func (q ColumnarQuery) SQL(src Source) string
type Config
- func DefaultConfig() Config
- func (c Config) ParquetDir() string
- func (c Config) RawDir() string
type Crawl
- func ListCrawls(ctx context.Context, h *HTTPClient, cache *Cache) ([]Crawl, error)
type DownloadResult
type HTTPClient
- func NewHTTPClient(cfg Config) *HTTPClient
- func (h *HTTPClient) FetchBytes(ctx context.Context, url string) ([]byte, error)
- func (h *HTTPClient) Get(ctx context.Context, url string) (*http.Response, error)
- func (h *HTTPClient) GetDownload(ctx context.Context, url string) (*http.Response, error)
- func (h *HTTPClient) GetRange(ctx context.Context, url string, offset, length int64) (*http.Response, error)
type Library
- func NewLibrary(root, crawl string) Library
- func (l Library) CrawlDir() string
- func (l Library) ProcessedDir(format, kind string) string
- func (l Library) RawDir(kind string) string
- func (l Library) RawPath(kind, ccPath string) string
type Location
type NewsFile
- func ListNewsFiles(ctx context.Context, h *HTTPClient, year, month int) ([]NewsFile, error)
type ParquetWriter
- func NewParquetWriter[T any](path string) (*ParquetWriter[T], error)
- func (p *ParquetWriter[T]) Close() error
- func (p *ParquetWriter[T]) Rows() int64
- func (p *ParquetWriter[T]) Write(row T) error
type Rank
- func RankLookup(ctx context.Context, h *HTTPClient, url, hostOrDomain string) (Rank, error)
- func RankTop(ctx context.Context, h *HTTPClient, url, tld string, n int) ([]Rank, error)
type Source
type WARCHeader
type WARCParquetRow
type WARCRecord
- func FetchWARCRecord(ctx context.Context, h *HTTPClient, filename string, offset, length int64) (WARCRecord, error)
type WATLink
- func ExtractLinks(base string, body []byte) []WATLink
type WATMeta
type WATParquetRow
type WATRecord
type WETParquetRow
type WETRecord

Constants ¶

View Source

const (
	CollInfoURL = "https://index.commoncrawl.org/collinfo.json"
	DataBaseURL = "https://data.commoncrawl.org/"
	CDXBaseURL  = "https://index.commoncrawl.org/"
	S3BaseURL   = "s3://commoncrawl/"

	// ColumnarPrefix is the root of the columnar (Parquet) index.
	ColumnarPrefix = "cc-index/table/cc-main/warc/"

	// UserAgent identifies the client politely to Common Crawl's CDN.
	UserAgent = "ccrawl/1.0 (+https://github.com/tamnd/ccrawl-cli)"
)

Common Crawl endpoints.

View Source

const (
	DefaultTimeout = 120 * time.Second
	DefaultRetries = 5
	DefaultDelay   = 200 * time.Millisecond
)

Defaults for the client and downloader.

View Source

const DuckDBPrelude = "INSTALL httpfs; LOAD httpfs; SET enable_progress_bar=false; SET allow_asterisks_in_http_paths=true;"

DuckDBPrelude is prepended to every statement ccrawl sends to the duckdb binary. httpfs reads remote Parquet over HTTPS or S3; the progress bar is noise on a pipe; and allow_asterisks_in_http_paths is required because the columnar index is addressed with a glob (subset=warc/*.parquet) over HTTP, which duckdb refuses by default.

Variables ¶

View Source

var DefaultColumnarColumns = []string{
	"url", "url_host_registered_domain", "fetch_status",
	"content_mime_detected", "content_languages",
	"warc_filename", "warc_record_offset", "warc_record_length",
}

DefaultColumnarColumns are the columns selected when none are given.

View Source

var LocationColumns = []string{"url", "warc_filename", "warc_record_offset", "warc_record_length"}

LocationColumns lists just the columns needed to range-fetch a record.

View Source

var PathKinds = []string{"warc", "wat", "wet", "robotstxt", "non200responses", "cc-index", "cc-index-table", "segment"}

PathKinds lists the kinds of file manifest published per crawl.

Functions ¶

func CDXNumPages ¶

func CDXNumPages(ctx context.Context, h *HTTPClient, crawlID string, q CDXQuery) (int, error)

CDXNumPages returns the number of result pages for a query.

func CDXStream ¶

func CDXStream(ctx context.Context, h *HTTPClient, crawlID string, q CDXQuery, fn func(CDXRecord) error) error

CDXStream runs a query and calls fn for each matching record, paginating through the server's pages and stopping at q.Limit.

func CanonicalURL ¶

func CanonicalURL(raw string) string

CanonicalURL applies light canonicalization: ensure a scheme, lower-case the host, and drop a fragment. It does not reorder query parameters.

func ColumnarParquetURLs ¶

func ColumnarParquetURLs(ctx context.Context, h *HTTPClient, cache *Cache, crawlID, subset string, src Source) ([]string, error)

ColumnarParquetURLs resolves the columnar index glob into the explicit list of parquet file URLs for one crawl and subset. Common Crawl's bucket does not allow anonymous listing, so a duckdb run cannot expand the `*.parquet` glob over HTTPS (or anonymous S3) on its own. The crawl publishes the full file list in cc-index-table.paths.gz, so we read that manifest (cached) and turn each entry into a fetchable URL for the chosen source.

func ColumnarSource ¶

func ColumnarSource(crawlID, subset string, src Source) string

ColumnarSource returns the parquet glob for one crawl's columnar index subset (subset is warc, crawldiagnostics, or robotstxt).

func ConfigDir ¶

func ConfigDir() string

ConfigDir returns the directory holding the config file.

func DownloadFiles ¶

func DownloadFiles(ctx context.Context, h *HTTPClient, src Source, paths []string, localDir string, workers int, flat bool, progress func(DownloadResult)) error

DownloadFiles fetches a list of Common Crawl relative paths into localDir, concurrently and resumably. progress is called once per file when non-nil.

func DuckDBAvailable ¶

func DuckDBAvailable() bool

DuckDBAvailable reports whether a duckdb binary is on PATH.

func ExtractMarkdown ¶

func ExtractMarkdown(body []byte) (string, error)

ExtractMarkdown converts an HTML document to a compact Markdown approximation. It is intentionally light: headings, paragraphs, list items, links, and emphasis, which covers the bulk of crawled article content.

func ExtractText ¶

func ExtractText(body []byte) string

ExtractText returns readable plain text from an HTML document, dropping the contents of script and style elements and collapsing whitespace.

func ExtractTitle ¶

func ExtractTitle(body []byte) string

ExtractTitle returns the <title> text of an HTML document.

func FetchPaths ¶

func FetchPaths(ctx context.Context, h *HTTPClient, cache *Cache, crawlID, kind string) ([]string, error)

FetchPaths downloads and decompresses a crawl's path manifest.

func FileURL ¶

func FileURL(path string, src Source) string

FileURL turns a Common Crawl relative path into a fetchable URL for the given source. HTTPS uses the CloudFront mirror; S3 uses the bucket URI.

func HTTPBody ¶

func HTTPBody(block []byte) []byte

HTTPBody splits a response block at the header/body boundary and returns the body. It returns the whole block when no boundary is found.

func HTTPHeaders ¶

func HTTPHeaders(block []byte) []byte

HTTPHeaders returns the header section (status line + headers) of a response block, without the body.

func HTTPSURL ¶

func HTTPSURL(path string) string

HTTPSURL always returns the HTTPS mirror URL (used for control-plane fetches like manifests and collinfo regardless of the bulk source).

func HostOf ¶

func HostOf(raw string) string

HostOf returns the lower-case host of a URL, or "" if it has none.

func InferMatchType ¶

func InferMatchType(pattern string) (cleanURL, matchType string)

InferMatchType guesses the CDX matchType from a user-supplied URL pattern. "*.example.com" -> domain, "example.com/*" -> prefix, otherwise exact unless the caller already chose host/domain/prefix.

func IterateWARC ¶

func IterateWARC(r io.Reader, fn func(WARCRecord) error) error

IterateWARC reads a WARC file (a multi-member gzip stream where each member is one record) and calls fn for every record. The parser lives in pkg/warc.

func IterateWAT ¶

func IterateWAT(r io.Reader, crawlID string, fn func(WATRecord) error) error

IterateWAT reads a WAT file and calls fn for each parsed record. The parser lives in pkg/wat.

func IterateWET ¶

func IterateWET(r io.Reader, crawlID string, fn func(WETRecord) error) error

IterateWET reads a WET file (WARC conversion records holding plain text) and calls fn for each record. The parser lives in pkg/wet.

func LibraryDir ¶

func LibraryDir() string

LibraryDir is the root of the structured dataset library that the --library flag downloads into and processes from. It is deliberately separate from the data dir: the data dir (see Config) holds ad-hoc downloads, the cache, and the local DuckDB file, while the library is a curated, browsable corpus you build up over time. CCRAWL_LIBRARY overrides the default of ~/notes/ccrawl.

func ParquetListLiteral ¶

func ParquetListLiteral(urls []string) string

ParquetListLiteral renders parquet URLs as a duckdb list literal, e.g. ['https://a', 'https://b'], suitable as the argument to read_parquet.

func ResolveCrawl ¶

func ResolveCrawl(ctx context.Context, h *HTTPClient, cache *Cache, ref string) (string, error)

ResolveCrawl turns a loose reference into a canonical crawl ID.

"latest"           -> newest crawl
"CC-MAIN-2024-51"  -> itself
"2024-51"          -> "CC-MAIN-2024-51"
"2024"             -> newest crawl whose ID starts with CC-MAIN-2024

func RunColumnarDuckDB ¶

func RunColumnarDuckDB(ctx context.Context, sql string, emit func(map[string]any) error) error

RunColumnarDuckDB executes sql with the local duckdb binary, installing the httpfs extension for S3/HTTPS parquet access, and streams JSON rows to emit.

func RunDuckDBJSON ¶

func RunDuckDBJSON(ctx context.Context, dbPath, sql string, emit func(map[string]any) error) error

RunDuckDBJSON runs sql with the local duckdb binary and streams JSON rows to emit. An empty dbPath runs against an in-memory database; a path opens (and creates) a persistent database file. httpfs is loaded so remote parquet over HTTPS or S3 works either way.

func SURT ¶

func SURT(raw string) string

SURT converts a URL into a Sort-friendly URI Reordering Transform key, the canonical form Common Crawl uses to sort and group its index. For example "https://www.example.com/a/b?q=1" becomes "com,example)/a/b?q=1".

The transform lower-cases the scheme and host, reverses the host labels, drops a leading "www.", strips the default port, and keeps the path and query.

func StreamPaths ¶

func StreamPaths(ctx context.Context, h *HTTPClient, crawlID, kind string, fn func(string) error) error

StreamPaths streams a crawl's path manifest one path at a time.

Types ¶

type CDXQuery ¶

type CDXQuery struct {
	URL    string // URL or pattern
	Match  string // exact|prefix|host|domain (empty -> inferred from URL)
	From   string // 14-digit (or loose) lower time bound
	To     string // 14-digit (or loose) upper time bound
	Status string // HTTP status filter (e.g. "200")
	MIME   string // mime-detected filter
	Lang   string // languages filter (ISO-639-3)
	Filter []string
	Limit  int
}

CDXQuery describes a query against the CDX URL index.

type CDXRecord ¶

type CDXRecord struct {
	CrawlID      string `json:"crawl,omitempty"`
	URLKey       string `json:"urlkey"`
	Timestamp    string `json:"timestamp"` // 14-digit YYYYMMDDHHmmss
	URL          string `json:"url"`
	MIME         string `json:"mime"`
	MIMEDetected string `json:"mime-detected"`
	Status       string `json:"status"`
	Digest       string `json:"digest"`
	Length       string `json:"length"`
	Offset       string `json:"offset"`
	Filename     string `json:"filename"`
	Charset      string `json:"charset,omitempty"`
	Languages    string `json:"languages,omitempty"`
	Truncated    string `json:"truncated,omitempty"`
	Redirect     string `json:"redirect,omitempty"`
}

CDXRecord is one capture from the URL index. Numeric fields stay as strings because that is how the CDX server returns them; helpers convert on demand.

func CDXSearch ¶

func CDXSearch(ctx context.Context, h *HTTPClient, crawlID string, q CDXQuery) ([]CDXRecord, error)

CDXSearch runs a query and collects matching records (bounded by q.Limit).

func (CDXRecord) Location ¶

func (r CDXRecord) Location() Location

Location returns the byte span of this capture within its WARC file.

func (CDXRecord) Time ¶

func (r CDXRecord) Time() time.Time

Time parses the 14-digit timestamp. The zero time is returned on failure.

type Cache ¶

type Cache struct {
	// contains filtered or unexported fields
}

Cache is a tiny on-disk blob cache keyed by an arbitrary string, with a TTL per entry. It is safe for the simple single-process use the CLI makes of it.

func NewCache ¶

func NewCache(dir string, enabled bool) *Cache

NewCache returns a cache rooted under dir. If dir is empty or enabled is false, all operations are no-ops (cache miss on every Get).

func (*Cache) Clear ¶

func (c *Cache) Clear() (int, error)

Clear removes every cached entry. It returns the number of files removed.

func (*Cache) Dir ¶

func (c *Cache) Dir() string

Dir returns the cache directory.

func (*Cache) Get ¶

func (c *Cache) Get(key string, ttl time.Duration) ([]byte, bool)

Get returns cached bytes for key if present and younger than ttl.

func (*Cache) Put ¶

func (c *Cache) Put(key string, data []byte)

Put stores data under key.

type ColumnarQuery ¶

type ColumnarQuery struct {
	Crawl      string
	Subset     string // warc (default) | crawldiagnostics | robotstxt
	Domain     string // url_host_registered_domain
	Host       string // url_host_name
	TLD        string // url_host_tld
	MIME       string // content_mime_detected
	Lang       string // content_languages (substring match)
	PathPrefix string // url_path prefix
	Status     int    // fetch_status (0 = any)
	Select     []string
	Limit      int
}

ColumnarQuery builds SQL against the columnar (Parquet) index. The zero value selects everything; set fields to add WHERE clauses.

func (ColumnarQuery) SQL ¶

func (q ColumnarQuery) SQL(src Source) string

SQL renders the query as a runnable DuckDB statement reading parquet over the given source. The same text runs in Athena or Spark after swapping read_parquet for the engine's table reference.

type Config ¶

type Config struct {
	DataDir   string
	CacheDir  string
	DBPath    string
	Source    Source
	Workers   int
	Timeout   time.Duration
	Delay     time.Duration
	Retries   int
	UserAgent string
	CrawlID   string
}

Config controls library behaviour. The zero value is not usable; call DefaultConfig and adjust.

func DefaultConfig ¶

func DefaultConfig() Config

DefaultConfig returns a Config rooted at the XDG data/cache directories, with the most recent crawl resolved lazily (CrawlID == "latest").

func (Config) ParquetDir ¶

func (c Config) ParquetDir() string

ParquetDir is where converted Parquet files land.

func (Config) RawDir ¶

func (c Config) RawDir() string

RawDir is where downloaded archive files land.

type Crawl ¶

type Crawl struct {
	ID     string `json:"id"`
	Name   string `json:"name"`
	CDXAPI string `json:"cdx-api"`
	From   string `json:"from,omitempty"`
	To     string `json:"to,omitempty"`
}

Crawl is one Common Crawl collection as published in collinfo.json.

func ListCrawls ¶

func ListCrawls(ctx context.Context, h *HTTPClient, cache *Cache) ([]Crawl, error)

ListCrawls fetches and parses collinfo.json. Results are cached when a cache is supplied (pass nil to skip).

type DownloadResult ¶

type DownloadResult struct {
	Path      string
	LocalPath string
	Bytes     int64
	Skipped   bool
	Err       error
}

DownloadResult is the outcome of fetching one file.

type HTTPClient ¶

type HTTPClient struct {
	// contains filtered or unexported fields
}

HTTPClient is a polite, retrying HTTP client for Common Crawl. It rate-limits requests, retries on 429/5xx with linear backoff, and supports byte-range requests for single-record retrieval.

func NewHTTPClient ¶

func NewHTTPClient(cfg Config) *HTTPClient

NewHTTPClient builds an HTTPClient from cfg.

func (*HTTPClient) FetchBytes ¶

func (h *HTTPClient) FetchBytes(ctx context.Context, url string) ([]byte, error)

FetchBytes fetches url and returns the whole body.

func (*HTTPClient) Get ¶

func (h *HTTPClient) Get(ctx context.Context, url string) (*http.Response, error)

Get fetches url with retries.

func (*HTTPClient) GetDownload ¶

func (h *HTTPClient) GetDownload(ctx context.Context, url string) (*http.Response, error)

GetDownload fetches url with no client timeout (relies on ctx cancellation), for large archive bodies.

func (*HTTPClient) GetRange ¶

func (h *HTTPClient) GetRange(ctx context.Context, url string, offset, length int64) (*http.Response, error)

GetRange fetches the [offset, offset+length) byte span of url.

type Library ¶

type Library struct {
	Root  string
	Crawl string
}

Library is a structured corpus of Common Crawl archive files for one crawl, rooted at Root. The layout is predictable so a directory listing tells you exactly what you have:

<root>/<crawl>/<kind>/<file>.gz               raw downloaded archives
<root>/<crawl>/<format>/<kind>/<file>.<ext>   processed output (parquet|jsonl)

Files are stored flat under each kind by their base name. A Common Crawl file name already encodes its segment and timestamp and is unique within a crawl, so the base name alone is a safe, stable key with no risk of collision.

func NewLibrary ¶

func NewLibrary(root, crawl string) Library

NewLibrary returns a Library rooted at root (or LibraryDir() when root is empty) for the given crawl ID.

func (Library) CrawlDir ¶

func (l Library) CrawlDir() string

CrawlDir is the per-crawl root, the parent of every kind and format directory.

func (Library) ProcessedDir ¶

func (l Library) ProcessedDir(format, kind string) string

ProcessedDir is where processed output of a kind lives, grouped by format (parquet or jsonl) so the same archives can be materialised more than one way side by side.

func (Library) RawDir ¶

func (l Library) RawDir(kind string) string

RawDir is where downloaded archives of a kind live.

func (Library) RawPath ¶

func (l Library) RawPath(kind, ccPath string) string

RawPath is the local path a given Common Crawl path maps to under the library.

type Location ¶

type Location struct {
	Filename string `json:"filename"`
	Offset   int64  `json:"offset"`
	Length   int64  `json:"length"`
	URL      string `json:"url,omitempty"`
}

Location is the WARC file plus the byte span needed to range-fetch a capture.

type NewsFile ¶

type NewsFile struct {
	Path string
	Year int
	Mon  int
}

NewsFile describes one CC-NEWS WARC file.

func ListNewsFiles ¶

func ListNewsFiles(ctx context.Context, h *HTTPClient, year, month int) ([]NewsFile, error)

ListNewsFiles returns the CC-NEWS WARC files for a year and month. Pass month 0 to list every month of a year; pass year 0 to list everything found via the index page.

type ParquetWriter ¶

type ParquetWriter[T any] struct {
	// contains filtered or unexported fields
}

ParquetWriter writes rows of type T to a zstd-compressed Parquet file.

func NewParquetWriter ¶

func NewParquetWriter[T any](path string) (*ParquetWriter[T], error)

NewParquetWriter creates a Parquet writer for path.

func (*ParquetWriter[T]) Close ¶

func (p *ParquetWriter[T]) Close() error

Close flushes and closes the file.

func (*ParquetWriter[T]) Rows ¶

func (p *ParquetWriter[T]) Rows() int64

Rows returns the number of rows written.

func (*ParquetWriter[T]) Write ¶

func (p *ParquetWriter[T]) Write(row T) error

Write appends one row.

type Rank ¶

type Rank struct {
	Key         string  `json:"key"` // host or domain (forward form)
	HarmonicPos int64   `json:"harmonic_pos"`
	HarmonicVal float64 `json:"harmonic_val"`
	PageRankPos int64   `json:"pagerank_pos"`
	PageRankVal float64 `json:"pagerank_val"`
}

Rank is a host/domain entry from the web-graph rank tables.

func RankLookup ¶

func RankLookup(ctx context.Context, h *HTTPClient, url, hostOrDomain string) (Rank, error)

RankLookup streams a gzipped rank table from url and returns the entry whose reversed key matches the given host or domain, or a not-found error.

func RankTop ¶

func RankTop(ctx context.Context, h *HTTPClient, url, tld string, n int) ([]Rank, error)

RankTop streams a rank table and returns the first n rows (the table is sorted by harmonic centrality, most central first).

type Source ¶

type Source string

Source selects the transport used for bulk data files.

const (
	SourceHTTPS Source = "https"
	SourceS3    Source = "s3"
)

type WARCHeader ¶

type WARCHeader = warc.Header

WARCHeader holds parsed WARC record headers.

type WARCParquetRow ¶

type WARCParquetRow struct {
	RecordID      string    `parquet:"record_id,dict"`
	CrawlID       string    `parquet:"crawl_id,dict"`
	WARCType      string    `parquet:"warc_type,dict"`
	TargetURI     string    `parquet:"target_uri"`
	Date          time.Time `parquet:"date,timestamp(microsecond)"`
	IPAddress     string    `parquet:"ip_address,dict"`
	PayloadDigest string    `parquet:"payload_digest"`
	ContentType   string    `parquet:"content_type,dict"`
	ContentLength int64     `parquet:"content_length"`
	Truncated     string    `parquet:"truncated,dict"`
	HTTPStatus    int32     `parquet:"http_status"`
	HTTPMIME      string    `parquet:"http_mime,dict"`
	WARCFilename  string    `parquet:"warc_filename,dict"`
	WARCOffset    int64     `parquet:"warc_offset"`
	WARCLength    int64     `parquet:"warc_length"`
	Title         string    `parquet:"title"`
	Language      string    `parquet:"language,dict"`
	Markdown      string    `parquet:"markdown"`
	Text          string    `parquet:"text"`
}

WARCParquetRow is the columnar schema for parsed WARC record metadata. When a response body is converted, the content fields are populated too.

type WARCRecord ¶

type WARCRecord = warc.Record

WARCRecord is a parsed WARC record: its header and the raw block bytes. For a response record the block is the full HTTP message (status line, headers, body).

func FetchWARCRecord ¶

func FetchWARCRecord(ctx context.Context, h *HTTPClient, filename string, offset, length int64) (WARCRecord, error)

FetchWARCRecord retrieves a single WARC record from the given file using a byte-range request. This is how a capture's content is pulled without downloading the whole multi-gigabyte WARC.

type WATLink ¶

type WATLink = wat.Link

WATLink is a hyperlink extracted from page HTML.

func ExtractLinks ¶

func ExtractLinks(base string, body []byte) []WATLink

ExtractLinks returns the outbound hyperlinks of an HTML document, resolved against base when possible.

type WATMeta ¶

type WATMeta = wat.Meta

WATMeta is a <meta> tag extracted from page HTML.

type WATParquetRow ¶

type WATParquetRow struct {
	RecordID    string    `parquet:"record_id,dict"`
	CrawlID     string    `parquet:"crawl_id,dict"`
	URL         string    `parquet:"url"`
	Date        time.Time `parquet:"date,timestamp(microsecond)"`
	HTTPStatus  int32     `parquet:"http_status"`
	ContentType string    `parquet:"content_type,dict"`
	Title       string    `parquet:"title"`
	LinksCount  int32     `parquet:"links_count"`
	Links       string    `parquet:"links"` // JSON
	Metas       string    `parquet:"metas"` // JSON
}

WATParquetRow is the columnar schema for WAT link and metadata records.

type WATRecord ¶

type WATRecord = wat.Record

WATRecord is metadata extracted by Common Crawl from a single page.

type WETParquetRow ¶

type WETParquetRow struct {
	RecordID        string    `parquet:"record_id,dict"`
	CrawlID         string    `parquet:"crawl_id,dict"`
	URL             string    `parquet:"url"`
	Date            time.Time `parquet:"date,timestamp(microsecond)"`
	ContentLanguage string    `parquet:"content_language,dict"`
	TextLength      int32     `parquet:"text_length"`
	Text            string    `parquet:"text"`
}

WETParquetRow is the columnar schema for WET plain-text records.

type WETRecord ¶

type WETRecord = wet.Record

WETRecord is extracted plain text for one page.

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL