web

package

v0.0.34 Latest Latest Go to latest Published: May 12, 2026 License: Apache-2.0 Imports: 17 Imported by: 3

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/deepnoodle-ai/wonton

Links

Open Source Insights

README ¶

web

URL manipulation, text normalization, media type detection, binary file fetching, and web search abstractions for web crawling and content processing. Provides utilities for normalizing URLs, resolving relative links, cleaning text, identifying media files, downloading binary content, and implementing search functionality.

Usage Examples

URL Normalization

package main

import (
	"fmt"
	"log"

	"github.com/deepnoodle-ai/wonton/web"
)

func main() {
	// Normalize URL (adds https://, removes query params)
	url, err := web.NormalizeURL("example.com/path?query=1#fragment")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(url.String())
	// Output: https://example.com/path

	// Normalize with http:// (converts to https://)
	url, _ = web.NormalizeURL("http://example.com")
	fmt.Println(url.String())
	// Output: https://example.com

	// Already normalized URL
	url, _ = web.NormalizeURL("https://example.com/path")
	fmt.Println(url.String())
	// Output: https://example.com/path

	// Trim whitespace
	url, _ = web.NormalizeURL("  example.com  ")
	fmt.Println(url.String())
	// Output: https://example.com
}

Resolving Relative Links

func resolveLinks(baseDomain string, links []string) {
	for _, link := range links {
		resolved, ok := web.ResolveLink(baseDomain, link)
		if ok {
			fmt.Printf("%s -> %s\n", link, resolved)
		} else {
			fmt.Printf("%s -> (invalid)\n", link)
		}
	}
}

func main() {
	// Resolve relative URLs
	baseDomain := "https://example.com/blog"
	links := []string{
		"/about",                    // Absolute path
		"post/123",                  // Relative path
		"../contact",                // Parent path
		"https://other.com/page",    // Absolute URL
		"mailto:test@example.com",   // Non-HTTP (rejected)
		"#section",                  // Fragment (removed)
	}

	resolveLinks(baseDomain, links)
	// Output:
	// /about -> https://example.com/about
	// post/123 -> https://example.com/blog/post/123
	// ../contact -> https://example.com/contact
	// https://other.com/page -> https://other.com/page
	// mailto:test@example.com -> (invalid)
	// #section -> https://example.com/blog
}

Host Comparison

func compareHosts() {
	url1, _ := web.NormalizeURL("https://example.com/path1")
	url2, _ := web.NormalizeURL("https://example.com/path2")
	url3, _ := web.NormalizeURL("https://sub.example.com/path")
	url4, _ := web.NormalizeURL("https://other.com/path")

	// Check if same host
	fmt.Println(web.AreSameHost(url1, url2))
	// Output: true

	fmt.Println(web.AreSameHost(url1, url3))
	// Output: false

	// Check if related hosts (same domain)
	fmt.Println(web.AreRelatedHosts(url1, url3))
	// Output: true (both *.example.com)

	fmt.Println(web.AreRelatedHosts(url1, url4))
	// Output: false (different domains)
}

Sorting URLs

func sortURLs(urls []string) {
	// Parse URLs
	var parsedURLs []*url.URL
	for _, u := range urls {
		parsed, err := web.NormalizeURL(u)
		if err == nil {
			parsedURLs = append(parsedURLs, parsed)
		}
	}

	// Sort alphabetically
	web.SortURLs(parsedURLs)

	// Print sorted URLs
	for _, u := range parsedURLs {
		fmt.Println(u.String())
	}
}

func main() {
	urls := []string{
		"example.com/zebra",
		"example.com/alpha",
		"example.com/beta",
	}
	sortURLs(urls)
	// Output:
	// https://example.com/alpha
	// https://example.com/beta
	// https://example.com/zebra
}

Text Normalization

func cleanText(input string) string {
	return web.NormalizeText(input)
}

func main() {
	// Trim whitespace
	fmt.Println(cleanText("  Hello  "))
	// Output: Hello

	// Unescape HTML entities
	fmt.Println(cleanText("Hello &amp; goodbye"))
	// Output: Hello & goodbye

	fmt.Println(cleanText("&lt;div&gt;"))
	// Output: <div>

	// Remove non-printable characters
	fmt.Println(cleanText("Hello\x00\x01World"))
	// Output: Hello  World

	// Combined transformations
	fmt.Println(cleanText("  &quot;Hello&quot; \x00"))
	// Output: "Hello"
}

Checking Punctuation

func checkPunctuation() {
	fmt.Println(web.EndsWithPunctuation("Hello."))
	// Output: true

	fmt.Println(web.EndsWithPunctuation("Hello?"))
	// Output: true

	fmt.Println(web.EndsWithPunctuation("Hello"))
	// Output: false

	fmt.Println(web.EndsWithPunctuation("Hello!"))
	// Output: true

	fmt.Println(web.EndsWithPunctuation(""))
	// Output: false
}

Media URL Detection

func filterMediaURLs(urls []string) {
	var mediaURLs []string
	var pageURLs []string

	for _, rawURL := range urls {
		parsed, err := web.NormalizeURL(rawURL)
		if err != nil {
			continue
		}

		if web.IsMediaURL(parsed) {
			mediaURLs = append(mediaURLs, parsed.String())
		} else {
			pageURLs = append(pageURLs, parsed.String())
		}
	}

	fmt.Println("Page URLs:")
	for _, u := range pageURLs {
		fmt.Printf("  %s\n", u)
	}

	fmt.Println("\nMedia URLs:")
	for _, u := range mediaURLs {
		fmt.Printf("  %s\n", u)
	}
}

func main() {
	urls := []string{
		"example.com/page.html",
		"example.com/image.jpg",
		"example.com/document.pdf",
		"example.com/video.mp4",
		"example.com/about",
	}

	filterMediaURLs(urls)
	// Output:
	// Page URLs:
	//   https://example.com/page.html
	//   https://example.com/about
	//
	// Media URLs:
	//   https://example.com/image.jpg
	//   https://example.com/document.pdf
	//   https://example.com/video.mp4
}

Web Crawler URL Processing

type Crawler struct {
	baseDomain string
	visited    map[string]bool
	queue      []string
}

func (c *Crawler) AddLink(link string) {
	// Resolve relative link
	resolved, ok := web.ResolveLink(c.baseDomain, link)
	if !ok {
		return
	}

	// Parse normalized URL
	url, err := web.NormalizeURL(resolved)
	if err != nil {
		return
	}

	// Skip media files
	if web.IsMediaURL(url) {
		return
	}

	// Check if same domain
	baseURL, _ := web.NormalizeURL(c.baseDomain)
	if !web.AreSameHost(url, baseURL) {
		return
	}

	// Add to queue if not visited
	urlStr := url.String()
	if !c.visited[urlStr] {
		c.visited[urlStr] = true
		c.queue = append(c.queue, urlStr)
	}
}

Link Extraction with Filtering

func extractPageLinks(baseURL string, htmlLinks []string) []string {
	var validLinks []string

	for _, link := range htmlLinks {
		// Resolve and normalize
		resolved, ok := web.ResolveLink(baseURL, link)
		if !ok {
			continue
		}

		url, err := web.NormalizeURL(resolved)
		if err != nil {
			continue
		}

		// Skip media files
		if web.IsMediaURL(url) {
			continue
		}

		validLinks = append(validLinks, url.String())
	}

	return validLinks
}

HTML Text Cleaning

func extractCleanText(htmlText string) string {
	// Remove HTML tags (simplified - use proper parser in production)
	text := regexp.MustCompile(`<[^>]*>`).ReplaceAllString(htmlText, "")

	// Normalize the text
	text = web.NormalizeText(text)

	return text
}

func main() {
	html := `<p>Hello &amp; welcome!</p><div>This is a test.</div>`
	fmt.Println(extractCleanText(html))
	// Output: Hello & welcome!This is a test.
	// (tags are replaced with an empty string, so no space separates adjacent elements)
}

Deduplicate URLs

func deduplicateURLs(urls []string) []string {
	seen := make(map[string]bool)
	var unique []string

	for _, rawURL := range urls {
		// Normalize to ensure consistent comparison
		normalized, err := web.NormalizeURL(rawURL)
		if err != nil {
			continue
		}

		urlStr := normalized.String()
		if !seen[urlStr] {
			seen[urlStr] = true
			unique = append(unique, urlStr)
		}
	}

	return unique
}

func main() {
	urls := []string{
		"example.com",
		"http://example.com",
		"https://example.com",
		"example.com?foo=bar",
		"example.com#section",
	}

	deduplicated := deduplicateURLs(urls)
	for _, u := range deduplicated {
		fmt.Println(u)
	}
	// Output:
	// https://example.com
}

Binary File Fetching

func downloadFile() {
	fetcher := web.NewDefaultBinaryFetcher()

	// Download to memory
	result, err := fetcher.FetchBinary(context.Background(), &web.BinaryFetchInput{
		URL: "https://example.com/document.pdf",
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Downloaded %d bytes: %s\n", result.Size, result.Filename)
	// result.Data contains the file contents

	// Download to file
	result, err = fetcher.FetchBinary(context.Background(), &web.BinaryFetchInput{
		URL:        "https://example.com/image.png",
		OutputPath: "/tmp/downloads/image.png",
		CreateDirs: true,
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("Saved to: %s\n", result.DownloadPath)

	// Download with size limit and MIME verification
	result, err = fetcher.FetchBinary(context.Background(), &web.BinaryFetchInput{
		URL:            "https://example.com/file.pdf",
		OutputPath:     "/tmp/downloads/",  // Directory - filename from response
		MaxSizeBytes:   10 * 1024 * 1024,   // 10MB limit
		ExpectedType:   "application/pdf",
		VerifyMimeType: true,
	})
	if err != nil {
		log.Fatal(err)
	}
}

Error Handling

func fetchWithRetry(url string) error {
	fetcher := web.NewDefaultBinaryFetcher()

	_, err := fetcher.FetchBinary(context.Background(), &web.BinaryFetchInput{
		URL: url,
	})
	if err != nil {
		// Check if it's a fetch error with status code
		var fetchErr *web.FetchError
		if errors.As(err, &fetchErr) {
			fmt.Printf("HTTP %d: %s\n", fetchErr.StatusCode, fetchErr.Error())

			// Check if we should retry
			if fetchErr.IsRecoverable() {
				fmt.Println("Error is recoverable, will retry...")
				// Retry logic here
			}
		}
		return err
	}
	return nil
}

Implementing a Search Provider

// Implement the Searcher interface for your search backend
type MySearcher struct {
	apiKey string
}

func (s *MySearcher) Search(ctx context.Context, input *web.SearchInput) (*web.SearchOutput, error) {
	// Call your search API
	// ...

	return &web.SearchOutput{
		Items: []*web.SearchItem{
			{
				URL:         "https://example.com/result1",
				Title:       "First Result",
				Description: "Description of the first result",
			},
			{
				URL:         "https://example.com/result2",
				Title:       "Second Result",
				Description: "Description of the second result",
			},
		},
	}, nil
}

func searchExample() {
	searcher := &MySearcher{apiKey: "..."}

	results, err := searcher.Search(context.Background(), &web.SearchInput{
		Query: "golang web scraping",
		Limit: 10,
	})
	if err != nil {
		log.Fatal(err)
	}

	for _, item := range results.Items {
		fmt.Printf("%s: %s\n", item.Title, item.URL)
	}
}

API Reference

URL Functions

Function	Description	Inputs	Outputs
`NormalizeURL`	Normalizes URL with transformations	`value string`	`*url.URL, error`
`ResolveLink`	Resolves relative URL against base	`domain, value string`	`string, bool`
`AreSameHost`	Checks if URLs have same host	`url1, url2 *url.URL`	`bool`
`AreRelatedHosts`	Checks if URLs share domain	`url1, url2 *url.URL`	`bool`
`SortURLs`	Sorts URLs alphabetically	`urls []*url.URL`	none (in-place)
`IsMediaURL`	Checks if URL points to media file	`u *url.URL`	`bool`
`IsMediaExtension`	Checks if extension is a media type	`ext string`	`bool`

Text Functions

Function	Description	Inputs	Outputs
`NormalizeText`	Cleans and normalizes text	`text string`	`string`
`EndsWithPunctuation`	Checks if string ends with punctuation	`s string`	`bool`

Binary Fetcher Types

Type	Description
`BinaryFetcher`	Interface for fetching binary files from URLs
`DefaultBinaryFetcher`	Standard implementation with timeouts and security features
`BinaryFetchInput`	Configuration: URL, headers, output path, size limits, MIME verification
`BinaryFetchResult`	Result containing filename, size, content type, and data or path

Error Types

Type	Description
`FetchError`	HTTP fetch error with status code and recoverability check
`NewFetchError(code, err)`	Create a new FetchError
`(*FetchError).IsRecoverable()`	Returns true for 429, 500, 502, 503, 504 status codes

Search Types

Type	Description
`Searcher`	Interface for web search implementations
`SearchInput`	Search query and limit
`SearchOutput`	Container for search results
`SearchItem`	Individual result: URL, title, description, icon, image

URL Normalization Behavior

NormalizeURL applies these transformations:

Trims whitespace
Adds https:// prefix if missing
Converts http:// to https://
Removes query parameters
Removes URL fragments
Removes trailing / if path is just /

Examples:

"example.com" → "https://example.com"
"http://example.com" → "https://example.com"
"example.com/path?q=1#frag" → "https://example.com/path"
" example.com/ " → "https://example.com"

Text Normalization Behavior

NormalizeText applies these transformations:

Trims whitespace
Unescapes HTML entities (&amp; → &)
Replaces non-printable characters with spaces

Examples:

" text " → "text"
"Hello &amp; goodbye" → "Hello & goodbye"
"&lt;div&gt;" → "<div>"
"text\x00\x01here" → "text  here" (each non-printable character becomes one space)

Supported Media Extensions

IsMediaURL and IsMediaExtension recognize these common file types:

Images: .jpg, .jpeg, .png, .gif, .svg, .webp, .bmp, .ico, .tiff
Documents: .pdf, .doc, .docx, .xls, .xlsx, .ppt, .pptx
Video: .mp4, .avi, .mov, .wmv, .flv, .mkv, .m4v
Audio: .mp3, .wav, .aac, .ogg, .flac, .m4a
Archives: .zip, .tar, .gz, .rar, .7z, .iso
Fonts: .ttf, .otf, .woff, .woff2, .eot
Executables: .exe, .dmg, .apk, .deb, .rpm, .msi, .bin, .pkg
Other: .css, .torrent

Related Packages

crawler - Web crawler that uses these utilities
fetch - HTTP page fetching
htmlparse - HTML parsing and link extraction
htmltomd - HTML to Markdown conversion

Documentation ¶

Overview ¶

Package web provides URL manipulation, text normalization, media detection, binary file fetching, and web search abstractions for web crawling and content processing.

URL Operations ¶

NormalizeURL: Parse and standardize URLs (add https://, remove query params)
ResolveLink: Resolve relative URLs against a base domain
AreSameHost: Compare if two URLs have identical hosts
AreRelatedHosts: Check if URLs share a common parent domain
SortURLs: Sort URLs alphabetically by their string representation

Text Processing ¶

NormalizeText: Clean web text (trim, unescape HTML, remove non-printable chars)
EndsWithPunctuation: Check if text ends with common punctuation marks

Media Detection ¶

IsMediaURL: Identify URLs pointing to media files
IsMediaExtension: Check if a file extension is a media type

Binary File Fetching ¶

BinaryFetcher: Interface for downloading binary files from URLs
DefaultBinaryFetcher: Standard implementation with size limits and MIME verification
BinaryFetchInput: Configuration for binary fetch requests
BinaryFetchResult: Result containing downloaded data or file path

Web Search ¶

Searcher: Interface for web search implementations
SearchInput: Search query parameters
SearchOutput: Search results container
SearchItem: Individual search result with URL, title, and metadata

Error Handling ¶

FetchError: Structured error type for HTTP fetch failures with status codes

This package is particularly useful when building web crawlers, content extractors, or any application that needs to process URLs and text from web pages.

Index ¶

func AreRelatedHosts(url1, url2 *url.URL) bool
func AreSameHost(url1, url2 *url.URL) bool
func EndsWithPunctuation(s string) bool
func IsMediaExtension(ext string) bool
func IsMediaURL(u *url.URL) bool
func NormalizeText(text string) string
func NormalizeURL(value string) (*url.URL, error)
func ResolveLink(domain, value string) (string, bool)
func SortURLs(urls []*url.URL)
type BinaryFetchInput
type BinaryFetchResult
type BinaryFetcher
type DefaultBinaryFetcher
- func NewDefaultBinaryFetcher() *DefaultBinaryFetcher
- func (f *DefaultBinaryFetcher) FetchBinary(ctx context.Context, input *BinaryFetchInput) (*BinaryFetchResult, error)
type FetchError
- func NewFetchError(statusCode int, err error) *FetchError
- func (e *FetchError) Error() string
- func (e *FetchError) IsRecoverable() bool
- func (e *FetchError) Unwrap() error
type SearchInput
type SearchItem
type SearchOutput
type Searcher

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func AreRelatedHosts ¶

func AreRelatedHosts(url1, url2 *url.URL) bool

AreRelatedHosts checks if two URLs share the same registrable domain (effective TLD + 1). Returns false if either URL is nil or cannot have its registrable domain determined.

This function uses the Public Suffix List to correctly handle multi-part TLDs like "co.uk", "com.au", etc. For example, "example.co.uk" and "other.co.uk" are NOT related because they have different registrable domains.

This function is useful for determining if URLs belong to the same website family, even if they use different subdomains.

Example:

url1, _ := url.Parse("https://www.example.com")
url2, _ := url.Parse("https://api.example.com")
web.AreRelatedHosts(url1, url2) // true (both share "example.com")

url3, _ := url.Parse("https://example.org")
web.AreRelatedHosts(url1, url3) // false (different base domains)

url4, _ := url.Parse("https://foo.example.co.uk")
url5, _ := url.Parse("https://bar.example.co.uk")
web.AreRelatedHosts(url4, url5) // true (both share "example.co.uk")

url6, _ := url.Parse("https://foo.other.co.uk")
web.AreRelatedHosts(url4, url6) // false (different registrable domains)

Example ¶

Example demonstrates checking if URLs share a common domain.

url1, _ := url.Parse("https://www.example.com")
url2, _ := url.Parse("https://api.example.com")
url3, _ := url.Parse("https://other.com")

fmt.Println(AreRelatedHosts(url1, url2))
fmt.Println(AreRelatedHosts(url1, url3))

Output:
true
false

func AreSameHost ¶

func AreSameHost(url1, url2 *url.URL) bool

AreSameHost checks if two URLs have the same host value. Returns false if either URL is nil.

This function performs an exact host comparison, meaning subdomains are considered different hosts. For example, "www.example.com" and "api.example.com" are not the same host. Use AreRelatedHosts if you need to check for shared parent domains.

Example:

url1, _ := url.Parse("https://example.com/page1")
url2, _ := url.Parse("https://example.com/page2")
web.AreSameHost(url1, url2) // true

url3, _ := url.Parse("https://sub.example.com/page")
web.AreSameHost(url1, url3) // false

Example ¶

Example demonstrates comparing URL hosts.

url1, _ := url.Parse("https://example.com/page1")
url2, _ := url.Parse("https://example.com/page2")
url3, _ := url.Parse("https://sub.example.com/page")

fmt.Println(AreSameHost(url1, url2))
fmt.Println(AreSameHost(url1, url3))

Output:
true
false

func EndsWithPunctuation ¶

func EndsWithPunctuation(s string) bool

EndsWithPunctuation checks if a string ends with a common punctuation mark.

The following punctuation characters are recognized: . , : ; ? ! " '

This function correctly handles Unicode strings by checking the last rune rather than the last byte. Returns false for empty strings.

Example:

web.EndsWithPunctuation("Hello.")  // true
web.EndsWithPunctuation("Hello?")  // true
web.EndsWithPunctuation("Hello")   // false
web.EndsWithPunctuation("")        // false

Example ¶

Example demonstrates checking for punctuation at the end of strings.

fmt.Println(EndsWithPunctuation("Hello."))
fmt.Println(EndsWithPunctuation("Hello?"))
fmt.Println(EndsWithPunctuation("Hello"))
fmt.Println(EndsWithPunctuation(""))

Output:
true
true
false
false

func IsMediaExtension ¶

func IsMediaExtension(ext string) bool

IsMediaExtension checks if a file extension is considered a media file extension. The extension should include the leading dot (e.g., ".jpg", ".mp4"). The check is case-insensitive.

Example:

web.IsMediaExtension(".jpg")  // true
web.IsMediaExtension(".JPG")  // true
web.IsMediaExtension(".html") // false
web.IsMediaExtension("jpg")   // false (missing dot)

func IsMediaURL ¶

func IsMediaURL(u *url.URL) bool

IsMediaURL checks if a URL appears to point to a media file based on its file extension.

The function extracts the file extension from the URL's path and performs a case-insensitive lookup against the known media extensions. Returns true if the extension is recognized as a media file type.

This is useful for filtering out media files when crawling web pages or extracting links that point to HTML content.

Example:

url, _ := url.Parse("https://example.com/image.jpg")
web.IsMediaURL(url) // true

url, _ = url.Parse("https://example.com/page.html")
web.IsMediaURL(url) // false

url, _ = url.Parse("https://example.com/VIDEO.MP4")
web.IsMediaURL(url) // true (case-insensitive)

Example ¶

Example demonstrates detecting media files from URLs.

imageURL, _ := url.Parse("https://example.com/photo.jpg")
fmt.Println(IsMediaURL(imageURL))

videoURL, _ := url.Parse("https://example.com/video.mp4")
fmt.Println(IsMediaURL(videoURL))

pageURL, _ := url.Parse("https://example.com/page.html")
fmt.Println(IsMediaURL(pageURL))

Output:
true
true
false

func NormalizeText ¶

func NormalizeText(text string) string

NormalizeText applies transformations to clean up text extracted from web pages.

The following transformations are applied in order:

Trim leading and trailing whitespace
Unescape HTML entities (e.g., "&amp;" becomes "&", "&lt;" becomes "<")
Replace non-printable characters with spaces

Non-printable characters are any Unicode characters that are not printable according to unicode.IsPrint() and are not whitespace. These are replaced with spaces rather than removed to preserve word boundaries.

Returns the original text unchanged if it's empty after trimming.

Example:

text := web.NormalizeText("  Hello &amp; goodbye  ")
fmt.Println(text) // "Hello & goodbye"

text = web.NormalizeText("&lt;div&gt;content&lt;/div&gt;")
fmt.Println(text) // "<div>content</div>"

Example ¶

Example demonstrates text normalization for web content.

// Trim whitespace
fmt.Println(NormalizeText("  Hello  "))

// Unescape HTML entities
fmt.Println(NormalizeText("Hello &amp; goodbye"))

// Convert HTML tags (entities)
fmt.Println(NormalizeText("&lt;div&gt;"))

// Remove non-printable characters
fmt.Println(NormalizeText("Hello\x00World"))

Output:
Hello
Hello & goodbye
<div>
Hello World

func NormalizeURL ¶

func NormalizeURL(value string) (*url.URL, error)

NormalizeURL parses a URL string and returns a normalized URL.

The following transformations are applied:

Trim whitespace from the input
Add https:// prefix if the URL has no scheme
Convert http:// to https://
Remove query parameters and URL fragments
Remove trailing "/" if the path is only "/"

This function returns an error if the input is empty, has an invalid scheme (anything other than http/https), or cannot be parsed as a valid URL.

Example:

url, _ := web.NormalizeURL("example.com/path?q=1#frag")
fmt.Println(url.String()) // "https://example.com/path"

url, _ = web.NormalizeURL("http://example.com")
fmt.Println(url.String()) // "https://example.com"

Example ¶

Example demonstrates basic URL normalization.

// Normalize a URL with query parameters and fragment
url, _ := NormalizeURL("example.com/path?query=1#fragment")
fmt.Println(url.String())

// Convert http to https
url, _ = NormalizeURL("http://example.com")
fmt.Println(url.String())

// Add https prefix when missing
url, _ = NormalizeURL("example.com")
fmt.Println(url.String())

Output:
https://example.com/path
https://example.com
https://example.com

func ResolveLink ¶

func ResolveLink(domain, value string) (string, bool)

ResolveLink resolves a relative or absolute URL against a base domain and returns the normalized result.

For absolute URLs, this function validates the scheme (only http/https are accepted) and normalizes the URL. For relative URLs, it resolves them against the provided domain. URL fragments are always removed.

Returns the resolved URL string and true if successful, or an empty string and false if the URL is invalid (e.g., unsupported scheme, parse error).

The domain parameter can be specified with or without a scheme. If no scheme is provided, https:// is assumed.

Example:

// Resolve relative URL
resolved, ok := web.ResolveLink("example.com", "/about")
// resolved: "https://example.com/about", ok: true

// Validate absolute URL
resolved, ok = web.ResolveLink("example.com", "https://other.com/page")
// resolved: "https://other.com/page", ok: true

// Reject non-http schemes
resolved, ok = web.ResolveLink("example.com", "ftp://files.com")
// resolved: "", ok: false

Example ¶

Example demonstrates resolving relative URLs against a base domain.

baseDomain := "example.com"

// Resolve absolute path
resolved, ok := ResolveLink(baseDomain, "/about")
fmt.Printf("%s: %v\n", resolved, ok)

// Resolve relative path
resolved, ok = ResolveLink(baseDomain, "contact")
fmt.Printf("%s: %v\n", resolved, ok)

// Reject non-HTTP schemes
resolved, ok = ResolveLink(baseDomain, "mailto:test@example.com")
fmt.Printf("valid: %v\n", ok)

Output:
https://example.com/about: true
https://example.com/contact: true
valid: false

func SortURLs ¶

func SortURLs(urls []*url.URL)

SortURLs sorts a slice of URLs alphabetically by their string representation. The slice is sorted in place. Nil entries are sorted to the end of the slice.

Example:

urls := []*url.URL{
    mustParse("https://z.com"),
    mustParse("https://a.com"),
    mustParse("https://m.com"),
}
web.SortURLs(urls)
// urls is now ordered: a.com, m.com, z.com

Example ¶

Example demonstrates sorting URLs alphabetically.

urls := []*url.URL{
	mustParse("https://z.com/page"),
	mustParse("https://a.com/page"),
	mustParse("https://m.com/page"),
}

SortURLs(urls)

for _, u := range urls {
	fmt.Println(u.String())
}

Output:
https://a.com/page
https://m.com/page
https://z.com/page

Types ¶

type BinaryFetchInput ¶ added in v0.0.5

type BinaryFetchInput struct {
	// URL is the address to fetch the binary file from.
	URL string `json:"url"`

	// Headers contains additional HTTP headers to include in the request.
	Headers map[string]string `json:"headers,omitempty"`

	// OutputPath is the destination file path or directory. If it's a directory,
	// the filename is derived from the URL or Content-Disposition header.
	// If empty, the file content is returned in BinaryFetchResult.Data.
	OutputPath string `json:"output_path,omitempty"`

	// CreateDirs creates parent directories if they don't exist.
	CreateDirs bool `json:"create_dirs,omitempty"`

	// MaxSizeBytes limits the maximum file size to download. A value of 0
	// means no limit.
	MaxSizeBytes int64 `json:"max_size_bytes,omitempty"`

	// ExpectedType is the expected MIME type (e.g., "application/pdf", "image/jpeg").
	// Only the media type is compared; parameters like charset are ignored.
	ExpectedType string `json:"expected_type,omitempty"`

	// VerifyMimeType enables MIME type verification against ExpectedType.
	VerifyMimeType bool `json:"verify_mime_type,omitempty"`
}

BinaryFetchInput contains parameters for fetching binary files.

type BinaryFetchResult ¶ added in v0.0.5

type BinaryFetchResult struct {
	// Filename is the name of the downloaded file.
	Filename string

	// Size is the number of bytes downloaded.
	Size int64

	// ContentType is the MIME type reported by the server.
	ContentType string

	// DownloadPath is the file path where content was saved (only set if
	// OutputPath was specified in the input).
	DownloadPath string

	// Data contains the file content (only populated if OutputPath was not
	// specified in the input).
	Data []byte
}

BinaryFetchResult contains the result of a binary file fetch operation.

type BinaryFetcher ¶ added in v0.0.5

type BinaryFetcher interface {
	// FetchBinary downloads a binary file from the specified URL.
	// Returns the result containing either the file data or the path where
	// it was saved.
	FetchBinary(ctx context.Context, input *BinaryFetchInput) (*BinaryFetchResult, error)
}

BinaryFetcher defines the interface for fetching binary files from URLs.

Implementations should handle HTTP redirects, respect size limits, and sanitize filenames to prevent path traversal attacks.

type DefaultBinaryFetcher ¶ added in v0.0.5

type DefaultBinaryFetcher struct {
	// Client is the HTTP client used for requests. If nil, a default client
	// with a 30-second timeout is used.
	Client *http.Client
}

DefaultBinaryFetcher provides a standard implementation of BinaryFetcher with sensible defaults for production use.

func NewDefaultBinaryFetcher ¶ added in v0.0.5

func NewDefaultBinaryFetcher() *DefaultBinaryFetcher

NewDefaultBinaryFetcher creates a new binary fetcher with a default HTTP client configured with a 30-second timeout.

func (*DefaultBinaryFetcher) FetchBinary ¶ added in v0.0.5

func (f *DefaultBinaryFetcher) FetchBinary(ctx context.Context, input *BinaryFetchInput) (*BinaryFetchResult, error)

FetchBinary downloads a binary file from the specified URL.

The function performs filename sanitization to prevent path traversal attacks when saving to disk. Filenames from Content-Disposition headers or URLs are cleaned to remove path separators and parent directory references.

type FetchError ¶ added in v0.0.5

type FetchError struct {
	// StatusCode is the HTTP status code returned by the server.
	StatusCode int

	// Err is the underlying error describing the failure.
	Err error
}

FetchError represents an HTTP fetch failure with status code information.

This error type wraps the underlying error while preserving the HTTP status code, enabling callers to make decisions based on the type of failure. It implements the standard error interface and supports error unwrapping via errors.Unwrap.

Example:

resp, err := http.Get(url)
if err != nil {
    return nil, err
}
if resp.StatusCode >= 400 {
    return nil, web.NewFetchError(resp.StatusCode, fmt.Errorf("request failed"))
}

func NewFetchError ¶ added in v0.0.5

func NewFetchError(statusCode int, err error) *FetchError

NewFetchError creates a new FetchError with the given HTTP status code and underlying error.

func (*FetchError) Error ¶ added in v0.0.5

func (e *FetchError) Error() string

Error returns a string representation of the fetch error, including the status code and underlying error message.

func (*FetchError) IsRecoverable ¶ added in v0.0.5

func (e *FetchError) IsRecoverable() bool

IsRecoverable returns true if the error represents a temporary failure that might succeed on retry.

The following status codes are considered recoverable:

429 Too Many Requests: Rate limiting, retry after backoff
500 Internal Server Error: Transient server issue
502 Bad Gateway: Upstream server issue
503 Service Unavailable: Temporary overload or maintenance
504 Gateway Timeout: Upstream timeout

Client errors (4xx except 429) and permanent server errors are not considered recoverable.

func (*FetchError) Unwrap ¶ added in v0.0.5

func (e *FetchError) Unwrap() error

Unwrap returns the underlying error, enabling use with errors.Is and errors.As.

type SearchInput ¶ added in v0.0.5

type SearchInput struct {
	// Query is the search query string.
	Query string `json:"query"`

	// Limit is the maximum number of results to return.
	// A value of 0 uses the implementation's default limit.
	Limit int `json:"limit,omitempty"`
}

SearchInput contains parameters for a web search query.

type SearchItem ¶ added in v0.0.5

type SearchItem struct {
	// URL is the web address of the search result.
	URL string `json:"url"`

	// Title is the page title or headline.
	Title string `json:"title"`

	// Description is a summary or snippet from the page content.
	Description string `json:"description,omitempty"`

	// Icon is the URL of the site's favicon or icon.
	Icon string `json:"icon,omitempty"`

	// Image is the URL of a preview image, if available.
	Image string `json:"image,omitempty"`
}

SearchItem represents a single search result.

type SearchOutput ¶ added in v0.0.5

type SearchOutput struct {
	// Items contains the search results, ordered by relevance.
	Items []*SearchItem `json:"items"`
}

SearchOutput contains the results of a web search.

type Searcher ¶ added in v0.0.5

type Searcher interface {
	// Search performs a web search and returns matching results.
	// Returns an error if the search fails or the context is canceled.
	Search(ctx context.Context, input *SearchInput) (*SearchOutput, error)
}

Searcher defines the interface for web search implementations.

Implementations might include search engine APIs (Google, Bing, DuckDuckGo), site-specific search, or custom search indexes.

Example implementation:

type GoogleSearcher struct {
    APIKey string
    CX     string
}

func (s *GoogleSearcher) Search(ctx context.Context, input *web.SearchInput) (*web.SearchOutput, error) {
    // Call Google Custom Search API
    // ...
    return &web.SearchOutput{Items: items}, nil
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

README ¶

web

Usage Examples

URL Normalization

Resolving Relative Links

Host Comparison

Sorting URLs

Text Normalization

Checking Punctuation

Media URL Detection

Web Crawler URL Processing

Link Extraction with Filtering

HTML Text Cleaning

Deduplicate URLs

Binary File Fetching

Error Handling

Implementing a Search Provider

API Reference

URL Functions

Text Functions

Binary Fetcher Types

Error Types

Search Types

URL Normalization Behavior

Text Normalization Behavior

Supported Media Extensions

Related Packages

Documentation ¶

Overview ¶

URL Operations ¶

Text Processing ¶

Media Detection ¶

Binary File Fetching ¶

Web Search ¶

Error Handling ¶

Index ¶

Examples ¶

Constants ¶

Variables ¶

Functions ¶

func AreRelatedHosts ¶

func AreSameHost ¶

func EndsWithPunctuation ¶

func IsMediaExtension ¶

func IsMediaURL ¶

func NormalizeText ¶

func NormalizeURL ¶

func ResolveLink ¶

func SortURLs ¶

Types ¶

type BinaryFetchInput ¶ added in v0.0.5

type BinaryFetchResult ¶ added in v0.0.5

type BinaryFetcher ¶ added in v0.0.5

type DefaultBinaryFetcher ¶ added in v0.0.5

func NewDefaultBinaryFetcher ¶ added in v0.0.5

func (*DefaultBinaryFetcher) FetchBinary ¶ added in v0.0.5

type FetchError ¶ added in v0.0.5

func NewFetchError ¶ added in v0.0.5

func (*FetchError) Error ¶ added in v0.0.5

func (*FetchError) IsRecoverable ¶ added in v0.0.5

func (*FetchError) Unwrap ¶ added in v0.0.5

type SearchInput ¶ added in v0.0.5

type SearchItem ¶ added in v0.0.5

type SearchOutput ¶ added in v0.0.5

type Searcher ¶ added in v0.0.5

Source Files ¶