Published: Nov 21, 2023 License: MIT Imports: 15 Imported by: 6

README

Site2RSS golang library

Go library for scraping websites and creating RSS feeds from them.

Usage

Parse feed items from remote pages

package main

import (
    "net/http"

    "github.com/n0madic/site2rss"
)

func rssRequest(w http.ResponseWriter, r *http.Request) {
    rss, err := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
        GetLinks("div.titletext > a").
        SetParseOptions(&site2rss.FindOnPage{
            Title:       ".article-title",
            Author:      ".author-name-name",
            Date:        ".author-name-date",
            DateFormat:  "02 Jan 2006",
            Description: ".article-fulltext",
        }).
        GetItemsFromLinks(site2rss.ParseItem).
        GetRSS()
    if err != nil {
        w.WriteHeader(http.StatusInternalServerError)
        w.Write([]byte(err.Error()))
    } else {
        w.Header().Set("Content-Type", "application/xml")
        w.Write([]byte(rss))
    }
}

func main() {
    http.HandleFunc("/", rssRequest)
    http.ListenAndServe(":3000", nil)
}

Parse remote pages with user-defined function

package main

import (
    "net/http"
    "strings"
    "time"

    "github.com/n0madic/site2rss"
)

func rssRequest(w http.ResponseWriter, r *http.Request) {
    rss, err := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
        GetLinks("div.titletext > a").
        GetItemsFromLinks(func(doc *site2rss.Document, opts *site2rss.FindOnPage) *site2rss.Item {
            author := doc.Find(".author-name-name").First().Text()
            title := doc.Find(".article-title").First().Text()
            created, _ := time.Parse("02 Jan 2006", strings.TrimSpace(doc.Find(".author-name-date").First().Text()))
            desc, _ := doc.Find(".article-fulltext").Html()
            return &site2rss.Item{
                Title:       title,
                Author:      &site2rss.Author{Name: author},
                Link:        &site2rss.Link{Href: doc.Url.String()},
                Id:          doc.Url.String(),
                Description: desc,
                Created:     created,
            }
        }).GetRSS()
    if err != nil {
        w.WriteHeader(http.StatusInternalServerError)
        w.Write([]byte(err.Error()))
    } else {
        w.Header().Set("Content-Type", "application/xml")
        w.Write([]byte(rss))
    }
}

func main() {
    http.HandleFunc("/", rssRequest)
    http.ListenAndServe(":3000", nil)
}

Parse feed items from source page

package main

import (
    "net/http"

    "github.com/n0madic/site2rss"
)

func rssRequest(w http.ResponseWriter, r *http.Request) {
    rss, err := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
        GetLinks(".titletext > a").
        SetParseOptions(&site2rss.FindOnPage{
            Title:       ".titletext",
            Author:      ".category",
            Date:        ".time",
            Image:       ".thumb-article-image > a > img",
            Description: ".introtext-feature",
        }).
        GetItemsFromSourcePage(site2rss.ParsePage).
        GetAtom()
    if err != nil {
        w.WriteHeader(http.StatusInternalServerError)
        w.Write([]byte(err.Error()))
    } else {
        w.Header().Set("Content-Type", "application/xml")
        w.Write([]byte(rss))
    }
}

func main() {
    http.HandleFunc("/", rssRequest)
    http.ListenAndServe(":3000", nil)
}

Parse feed items from a query by source page

package main

import (
    "net/http"

    "github.com/n0madic/site2rss"
)

func rssRequest(w http.ResponseWriter, r *http.Request) {
    rss, err := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
        SetParseOptions(&site2rss.FindOnPage{
            Title:       ".titletext",
            Date:        ".time",
            Description: ".introtext-feature",
            URL:         ".titletext > a",
        }).
        GetItemsFromQuery(".article-item", site2rss.ParseQuery).
        GetAtom()
    if err != nil {
        w.WriteHeader(http.StatusInternalServerError)
        w.Write([]byte(err.Error()))
    } else {
        w.Header().Set("Content-Type", "application/xml")
        w.Write([]byte(rss))
    }
}

func main() {
    http.HandleFunc("/", rssRequest)
    http.ListenAndServe(":3000", nil)
}

Or with a user-defined function:

package main

import (
    "net/http"

    "github.com/n0madic/site2rss"
)

func rssRequest(w http.ResponseWriter, r *http.Request) {
    rss, err := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
        GetItemsFromQuery(".article-item",
            func(doc *site2rss.Selection, opts *site2rss.FindOnPage) *site2rss.Item {
                url := "https://www.sciencealert.com" +
                    doc.Find(".titletext > a").First().AttrOr("href", "")
                desc, _ := doc.Find(".introtext-feature").Html()
                return &site2rss.Item{
                    Title:       doc.Find(".titletext").First().Text(),
                    Link:        &site2rss.Link{Href: url},
                    Id:          url,
                    Description: desc,
                    Created:     site2rss.HumanTimeParse(doc.Find(".time").First().Text()),
                }
            }).GetRSS()
    if err != nil {
        w.WriteHeader(http.StatusInternalServerError)
        w.Write([]byte(err.Error()))
    } else {
        w.Header().Set("Content-Type", "application/xml")
        w.Write([]byte(rss))
    }
}

func main() {
    http.HandleFunc("/", rssRequest)
    http.ListenAndServe(":3000", nil)
}

Filter items

You can skip feed items by words in the title or description, and strip content by text blocks or CSS selectors. FilterItems is a method on the *Site2RSS builder, so it fits into the same call chain:

feed.FilterItems(site2rss.Filters{
    Description: []string{"spam"},
    Selector:    []string{".comments"},
    Text:        []string{"See also:"},
    Title:       []string{"advertising"},
})
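
For example, a sketch combining the filter with the Science Alert pipeline from the README examples above; where exactly in the chain the filter is applied is an assumption, and it is shown here after the items have been collected:

package main

import (
    "fmt"
    "log"

    "github.com/n0madic/site2rss"
)

func main() {
    rss, err := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
        GetLinks("div.titletext > a").
        SetParseOptions(&site2rss.FindOnPage{
            Title:       ".article-title",
            Author:      ".author-name-name",
            Date:        ".author-name-date",
            DateFormat:  "02 Jan 2006",
            Description: ".article-fulltext",
        }).
        GetItemsFromLinks(site2rss.ParseItem).
        FilterItems(site2rss.Filters{
            Title:    []string{"advertising"},
            Selector: []string{".comments"},
        }).
        GetRSS()
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(rss)
}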

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ConvertToUTF8

func ConvertToUTF8(str string, origEncoding string) string

ConvertToUTF8 converts a string from the given encoding to UTF-8.
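
A minimal sketch, assuming the input was read from a windows-1251 page; the encoding label and sample bytes are illustrative:

raw := "\xcf\xf0\xe8\xe2\xe5\xf2" // "Привет" encoded as windows-1251 bytes
title := site2rss.ConvertToUTF8(raw, "windows-1251")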

func HumanTimeParse

func HumanTimeParse(d string) time.Time

HumanTimeParse parses a human-readable date string into a time.Time.
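
A sketch, assuming a relative date string such as those found in the ".time" elements used in the README examples; the exact set of formats the parser accepts is not documented here:

created := site2rss.HumanTimeParse("2 hours ago") // returns a time.Time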

func TimeParse

func TimeParse(layout, dateStr string) time.Time

TimeParse parses a date string using the given layout.
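
The layout argument presumably follows Go's reference-time convention (Mon Jan 2 15:04:05 MST 2006), matching the DateFormat value used in the README example:

// Behaviour on parse failure is not documented here.
created := site2rss.TimeParse("02 Jan 2006", "21 Nov 2023")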

Types

type Author

type Author = feeds.Author

Author proxy type

type Document

type Document = goquery.Document

Document proxy type

type Enclosure

type Enclosure = feeds.Enclosure

Enclosure proxy type

type Feed

type Feed = feeds.Feed

Feed proxy type

type Filters

type Filters struct {
	// Skip item with the following words in the description
	Description []string
	// Remove the following selectors from content
	Selector []string
	// Remove blocks of text that contain the following words
	Text []string
	// Skip items with the following words in the title
	Title []string
}

Filters for item cleaning

type FindOnPage

type FindOnPage struct {
	Author      string
	Date        string
	DateFormat  string
	Description string
	Image       string
	Title       string
	URL         string
}

FindOnPage holds the selector settings used to parse a page into a feed item.

type Item

type Item = feeds.Item

Item proxy type

func ParseItem

func ParseItem(doc *Document, opts *FindOnPage) *Item

ParseItem is the default function for parsing an item from a remote page.

func ParseQuery

func ParseQuery(sel *Selection, opts *FindOnPage) *Item

ParseQuery is the default function for parsing items matched by a query on a single page.

type Link

type Link = feeds.Link

Link proxy type

type ParseResult

type ParseResult struct {
	Authors      []string
	Dates        []string
	Descriptions []string
	Images       []string
	Titles       []string
}

ParseResult holds the results of parsing a single page.

func ParsePage

func ParsePage(doc *Document, opts *FindOnPage) *ParseResult

ParsePage is the default function for parsing items from a single page.

type Selection

type Selection = goquery.Selection

Selection proxy type

type Site2RSS

type Site2RSS struct {
	Feed         *Feed
	Links        []string
	MaxFeedItems int

	SourceURL *url.URL
	// contains filtered or unexported fields
}

Site2RSS object

func NewFeed

func NewFeed(source string, title string) *Site2RSS

NewFeed returns a new Site2RSS feed object.

func (*Site2RSS) AbsoluteURL

func (s *Site2RSS) AbsoluteURL(rpath string) string

AbsoluteURL makes a relative URL absolute.
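
A sketch, assuming the relative path is resolved against the source URL passed to NewFeed using standard URL resolution:

s := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert")
abs := s.AbsoluteURL("/article/some-story")
// abs would be "https://www.sciencealert.com/article/some-story" under that assumption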

func (*Site2RSS) FilterItems

func (s *Site2RSS) FilterItems(filters Filters) *Site2RSS

FilterItems cleans feed items according to the given filters.

func (*Site2RSS) GetAtom

func (s *Site2RSS) GetAtom() (string, error)

GetAtom returns the feed as Atom XML.

func (*Site2RSS) GetItemsFromLinks

func (s *Site2RSS) GetItemsFromLinks(f itemCallback) *Site2RSS

GetItemsFromLinks extracts details from remote links using a user-defined function

func (*Site2RSS) GetItemsFromQuery

func (s *Site2RSS) GetItemsFromQuery(docPattern string, f queryCallback) *Site2RSS

GetItemsFromQuery extracts feed items matched by a query on the source page.

func (*Site2RSS) GetItemsFromSourcePage

func (s *Site2RSS) GetItemsFromSourcePage(f pageCallback) *Site2RSS

GetItemsFromSourcePage extracts feed items from the source page.

func (*Site2RSS) GetLinks

func (s *Site2RSS) GetLinks(linkPattern string) *Site2RSS

GetLinks collects a list of links matching the given selector pattern.

func (*Site2RSS) GetRSS

func (s *Site2RSS) GetRSS() (string, error)

GetRSS returns the feed as RSS XML.

func (*Site2RSS) MakeAllLinksAbsolute

func (s *Site2RSS) MakeAllLinksAbsolute(doc *Document)

MakeAllLinksAbsolute makes all links in the document absolute.
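
A sketch of calling it from a custom GetItemsFromLinks callback, so relative links in the article body become absolute before the HTML is extracted; the builder is kept in a variable so the closure can reach it:

s := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
    GetLinks("div.titletext > a")
rss, err := s.GetItemsFromLinks(func(doc *site2rss.Document, opts *site2rss.FindOnPage) *site2rss.Item {
    s.MakeAllLinksAbsolute(doc) // rewrite relative hrefs before extracting HTML
    desc, _ := doc.Find(".article-fulltext").Html()
    return &site2rss.Item{
        Title:       doc.Find(".article-title").First().Text(),
        Link:        &site2rss.Link{Href: doc.Url.String()},
        Id:          doc.Url.String(),
        Description: desc,
    }
}).GetRSS()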

func (*Site2RSS) SetMaxFeedItems

func (s *Site2RSS) SetMaxFeedItems(max int) *Site2RSS

SetMaxFeedItems sets the maximum number of feed items.
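
A sketch of chaining the limit into the builder; where in the chain it must be set is an assumption, and it is shown here before the links are collected:

rss, err := site2rss.NewFeed("https://www.sciencealert.com/the-latest", "Science Alert").
    SetMaxFeedItems(10).
    GetLinks("div.titletext > a").
    GetItemsFromLinks(site2rss.ParseItem).
    GetRSS()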

func (*Site2RSS) SetParseOptions

func (s *Site2RSS) SetParseOptions(opts *FindOnPage) *Site2RSS

SetParseOptions sets the options used when parsing pages.
