ce

package module

v0.0.0-...-b66592d Latest Latest Go to latest Published: Jun 10, 2021 License: Apache-2.0 Imports: 15 Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/crawlerclub/ce

Links

Open Source Insights

README ¶

Multilingual Web Page Content Extractor

Introduction

ce is a Go package for multilingual web page content extraction. It extracts the main content of article-type web pages, such as news stories and blog posts.

Basic usage

package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"strings"

	"github.com/crawlerclub/ce"
	"github.com/crawlerclub/dl"
)

// url is the address of the article page that the example downloads and
// parses.
var url = flag.String(
	"url",
	"http://china.huanqiu.com/article/2017-07/11034896.html",
	"news url",
)

// debug turns on debug mode; its value is handed to ce.ParsePro.
var debug = flag.Bool("debug", false, "debug mode")

// main downloads the page named by the -url flag, runs the extractor on the
// downloaded text and prints the resulting document to standard output as
// JSON. Download and encoding errors are printed and end the program early.
func main() {
	flag.Parse()

	res := dl.DownloadUrl(*url)
	if res.Error != nil {
		fmt.Println(res.Error)
		return
	}

	// Keep only what precedes the first ':' of the remote address
	// (NOTE(review): presumably "host:port" — confirm against dl).
	// strings.Split with a non-empty separator always returns at least one
	// element, so indexing [0] cannot panic and no length check is needed.
	ip := strings.Split(res.RemoteAddr, ":")[0]

	doc := ce.ParsePro(*url, res.Text, ip, *debug)

	// Report encoding failures instead of silently printing an empty line.
	out, err := json.Marshal(doc)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Println(string(out))
}

Fields

ce can extract the following fields from raw HTML pages:

title: the title of the article
text: the main content of the article in plain text
html: the main content of the article with basic HTML formatting, images included
published: the publish time of the article
language: the language of the article
location: the country code
author: the author of the article
images: the images used in the article

Documentation ¶

Index ¶

Constants
Variables
func FilterControlChar(in string) string
func InfoFromMeta(meta []map[string]string) (string, string, []string)
func ParseTime(tz string, s string) time.Time
func RawMeta(raw string) []map[string]string
func TextFromHTML(rawhtml string) string
type Doc
- func Parse(rawurl, rawHtml string) *Doc
- func ParsePro(rawurl, rawHtml, ip string, debug bool) *Doc
type Meta
- func GetMeta(meta []map[string]string) *Meta

Constants ¶

View Source

// BlocksWidth is a tuning constant of the extractor.
// NOTE(review): its exact role is not visible from this declaration.
const BlocksWidth = 3

// Threshold is a size limit expressed in bytes.
const Threshold = 500

Variables ¶

View Source

// Package-level regular expressions and lookup tables for working on raw
// HTML. Every pattern is compiled once at package initialization.
var (
	// ReIgnoreBlock maps a label to a pattern matching that kind of markup:
	// doctype declarations, comments, script/noscript/style elements and
	// link tags.
	// NOTE(review): presumably these spans are removed before extraction.
	ReIgnoreBlock = map[string]*regexp.Regexp{
		"doctype":  regexp.MustCompile(`(?ims)<!DOCTYPE.*?>`),
		"comment":  regexp.MustCompile(`(?ims)<!--.*?-->`),
		"script":   regexp.MustCompile(`(?ims)<script.*?>.*?</script>`),
		"noscript": regexp.MustCompile(`(?ims)<noscript.*?>.*?</noscript>`),
		"style":    regexp.MustCompile(`(?ims)<style.*?>.*?</style>`),
		"link":     regexp.MustCompile(`(?ims)<link.*?>`),
	}
	// ReNewLineBlock maps a tag label to a pattern matching tags that start
	// with that name (div, p, br, hr, li).
	// NOTE(review): presumably these tags are turned into line breaks.
	ReNewLineBlock = map[string]*regexp.Regexp{
		"<div>": regexp.MustCompile(`(?ims)<div.*?>`),
		"<p>":   regexp.MustCompile(`(?ims)<p.*?>`),
		"<br>":  regexp.MustCompile(`(?ims)<br.*?>`),
		"<hr>":  regexp.MustCompile(`(?ims)<hr.*?>`),
		"<li>":  regexp.MustCompile(`(?ims)<li.*?>`),
	}
	// ReMultiNewLine matches a run of one or more newline characters.
	ReMultiNewLine = regexp.MustCompile(`(?m)\n+`)
	// ReSpaces matches a run of one or more whitespace characters.
	ReSpaces = regexp.MustCompile(`(?m)\s+`)
	// ReTag matches any single <...> tag.
	ReTag = regexp.MustCompile(`(?ims)<.*?>`)
	// ReImg matches a single <img ...> tag.
	ReImg = regexp.MustCompile(`(?ims)<img.*?>`)
	// ReImgSrc captures the quoted value of an attribute whose name ends in
	// "src", or of data-original. Group 1 holds a double-quoted value and
	// group 2 a single-quoted one.
	//
	// The two quote styles are grouped after the '='. Without that group
	// the single-quoted alternative stood on its own and matched any
	// single-quoted string in the tag (for example alt='...').
	ReImgSrc = regexp.MustCompile(`(?ims)(?:.+?src|data-original)=\s*?(?:"(.+?)"|'(.+?)')`)
	// ReTitle captures the content of the <title> element.
	ReTitle = regexp.MustCompile(`(?ims)<title.*?>(.+?)</title>`)
	// ReH captures the content of a numbered heading element (<h1>, <h2>, ...).
	ReH = regexp.MustCompile(`(?ims)<h\d+.*?>(.*?)</h\d+>`)
	// ReHead captures the content of the <head> element.
	ReHead = regexp.MustCompile(`(?ims)<head.*?>(.*?)<\/head>`)

	// MonthStr is a pattern fragment matching an English month name by its
	// three-letter prefix followed by any further letters.
	MonthStr = `(?:(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*)`
	// ReDate matches a date, case-insensitively, in one of these layouts:
	// month-name day year, day month-name year, month-name day, numeric
	// year-month-day, numeric day-month-year (years starting with 19 or 20),
	// and the CJK form with an optional "年" year followed by "月" and "日".
	ReDate = regexp.MustCompile(`(?is)((?:` + MonthStr + `[\.,\-\s]*\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*(\d{4}))|` +
		`(?:\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*` + MonthStr + `[\.,\-\s]*(\d{4}))|` +
		MonthStr + `.\d{1,2}|` +
		`(?:(19|20)\d{2}[^0-9]\d{1,2}[^0-9]\d{1,2})|` +
		`(?:\d{1,2}[^0-9]\d{1,2}[^0-9](19|20)\d{2})|` +
		`(?:(\d{4}年){0,1}\d{1,2}月\d{1,2}日))`)

	// ReTime matches a clock time: hours and minutes separated by colons
	// with optional seconds and an optional am/pm marker, or hours and
	// minutes separated by dots or spaces followed by an am/pm marker.
	ReTime = regexp.MustCompile(`(?is)((?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d)?(?:\s*[,:.]*\s*(?:am|pm))?|` +
		`(?:0?|[12])\d\s*[.\s]+\s*[0-5]\d(?:\s*[,:.]*\s*(?:am|pm))+)`)

	// ReFavicon captures the href of a self-closed
	// <link rel="shortcut icon" href="..."> tag.
	ReFavicon = regexp.MustCompile(`(?ims)<link rel="shortcut icon" href="(.+?)".*?/>`)

	// ReTitleNoNoisy matches an optional prefix ending in a run of "——" or
	// "--", followed by a run of characters that contains none of the
	// separators | - / • — _.
	ReTitleNoNoisy = regexp.MustCompile(`(?ims)(.*?(——+|--+))?[^|\-/•—_]+`)

	// IgnoreImgs is a set of image sources given as base64 GIF data URIs.
	// NOTE(review): presumably placeholder images to skip — confirm at the
	// call site.
	IgnoreImgs = map[string]bool{
		"data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7": true,
		"data:image/gif;base64,R0lGODlhAQABAIAAAAUEBAAAACwAAAAAAQABAAACAkQBADs=":         true,
	}
)

View Source

// Patterns for locating <meta> tags and reading their attributes.
var (
	// ReMeta matches a single <meta ...> tag.
	ReMeta = regexp.MustCompile(`(?ims)<meta.*?>`)
	// ReKV matches one key=value attribute with a quoted value. Group 1 is
	// the key, group 2 a double-quoted value and group 3 a single-quoted
	// value.
	//
	// The two quote styles are grouped after the '='. Without that group
	// the single-quoted alternative stood on its own, so a single-quoted
	// value matched without its key.
	ReKV = regexp.MustCompile(`(?ims)([^\s]+?)\s*?=\s*?(?:"(.+?)"|'(.+?)')`)
)

Functions ¶

func FilterControlChar ¶

func FilterControlChar(in string) string

func InfoFromMeta ¶

func InfoFromMeta(meta []map[string]string) (string, string, []string)

func ParseTime ¶

func ParseTime(tz string, s string) time.Time

func RawMeta ¶

func RawMeta(raw string) []map[string]string

func TextFromHTML ¶

func TextFromHTML(rawhtml string) string

Types ¶

type Doc ¶

// Doc is the document type returned by Parse and ParsePro. Its fields carry
// JSON tags, so it can be encoded directly with encoding/json.
//
// According to the package README:
//   - Title is the title of the article.
//   - Text is the main content of the article in plain text.
//   - Html is the main content with basic HTML formatting, images included.
//   - Language is the language of the article.
//   - Location is the country code.
//   - Author is the author of the article.
//   - Images lists the images used in the article.
//
// NOTE(review): Published and PublishedParsed presumably hold the publish
// time as found on the page and its parsed form — confirm in the parser.
// Debug is left out of the JSON output when empty (omitempty).
type Doc struct {
	Url             string                 `json:"url"`
	From            string                 `json:"from"`
	CanonicalUrl    string                 `json:"canonical_url"`
	Title           string                 `json:"title"`
	Text            string                 `json:"text"`
	Html            string                 `json:"html"`
	Language        string                 `json:"language"`
	Location        string                 `json:"location"`
	Favicon         string                 `json:"favicon"`
	Images          []string               `json:"images"`
	Tags            string                 `json:"tags"`
	Author          string                 `json:"author"`
	Published       string                 `json:"published"`
	PublishedParsed time.Time              `json:"published_parsed"`
	Debug           map[string]interface{} `json:"debug,omitempty"`
}

func Parse ¶

func Parse(rawurl, rawHtml string) *Doc

func ParsePro ¶

func ParsePro(rawurl, rawHtml, ip string, debug bool) *Doc

type Meta ¶

// Meta groups the keywords, tags, description and author strings of a page.
// It is the type returned by GetMeta, which takes a list of string maps.
// NOTE(review): presumably each map holds the attributes of one <meta> tag —
// confirm against RawMeta.
type Meta struct {
	Keywords    string `json:"keywords"`
	Tags        string `json:"tags"`
	Description string `json:"description"`
	Author      string `json:"author"`
}

func GetMeta ¶

func GetMeta(meta []map[string]string) *Meta

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
cmd
opengraph
twitter

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL