ce

package module
v0.0.0-...-b66592d Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 10, 2021 License: Apache-2.0 Imports: 15 Imported by: 0

README

Multilingual Web Page Content Extractor

Introduction

ce is a Go package for multilingual web page content extraction. It extracts the content of article-type web pages, such as news stories and blog posts.

Basic usage

package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"strings"

	"github.com/crawlerclub/ce"
	"github.com/crawlerclub/dl"
)

// url is the -url flag: the address of the news page to download and parse.
var url = flag.String("url", "http://china.huanqiu.com/article/2017-07/11034896.html", "news url")

// debug is the -debug flag; its value is passed through to ce.ParsePro.
var debug = flag.Bool("debug", false, "debug mode")

// main downloads the page named by the -url flag, runs the content extractor
// on it and prints the resulting document as JSON on standard output.
// Download and encoding errors are printed and end the program early.
func main() {
	flag.Parse()
	res := dl.DownloadUrl(*url)
	if res.Error != nil {
		fmt.Println(res.Error)
		return
	}

	// Keep only the text before the first ':' of the remote address.
	// SplitN always yields at least one element for a non-empty separator,
	// so no length check is needed.
	// NOTE(review): presumably RemoteAddr is "host:port"; an IPv6 literal
	// would be truncated here - confirm against dl.
	ip := strings.SplitN(res.RemoteAddr, ":", 2)[0]

	doc := ce.ParsePro(*url, res.Text, ip, *debug)
	j, err := json.Marshal(doc)
	if err != nil {
		// Report the failure instead of printing an empty line.
		fmt.Println(err)
		return
	}
	fmt.Println(string(j))
}

Fields

ce can extract the following fields from raw HTML:

  • title: the title of article
  • text: the main content of article in plain text
  • html: the main content of article with basic html format, images included
  • publish_date: the publish time of article
  • language: the language of article
  • location: the country code
  • author: the author of article
  • images: the images used in the article

Documentation

Index

Constants

View Source
// BlocksWidth is a tuning constant of the extractor.
// NOTE(review): its exact role is not visible in this listing - confirm
// against the implementation.
const BlocksWidth = 3

// Threshold is a size limit expressed in bytes.
// NOTE(review): what it is compared against is not visible in this listing.
const Threshold = 500 // in bytes

Variables

View Source
var (
	// ReIgnoreBlock maps a label to a pattern matching markup that holds no
	// visible text: the doctype, comments, script/noscript/style elements
	// and link tags.
	ReIgnoreBlock = map[string]*regexp.Regexp{
		"doctype":  regexp.MustCompile(`(?ims)<!DOCTYPE.*?>`),
		"comment":  regexp.MustCompile(`(?ims)<!--.*?-->`),
		"script":   regexp.MustCompile(`(?ims)<script.*?>.*?</script>`),
		"noscript": regexp.MustCompile(`(?ims)<noscript.*?>.*?</noscript>`),
		"style":    regexp.MustCompile(`(?ims)<style.*?>.*?</style>`),
		"link":     regexp.MustCompile(`(?ims)<link.*?>`),
	}

	// ReNewLineBlock maps a tag label to a pattern matching the opening tag
	// of div, p, br, hr and li elements.
	// NOTE(review): presumably these tags are turned into line breaks when
	// text is extracted - confirm against the implementation.
	ReNewLineBlock = map[string]*regexp.Regexp{
		"<div>": regexp.MustCompile(`(?ims)<div.*?>`),
		"<p>":   regexp.MustCompile(`(?ims)<p.*?>`),
		"<br>":  regexp.MustCompile(`(?ims)<br.*?>`),
		"<hr>":  regexp.MustCompile(`(?ims)<hr.*?>`),
		"<li>":  regexp.MustCompile(`(?ims)<li.*?>`),
	}

	// ReMultiNewLine matches a run of one or more newline characters.
	ReMultiNewLine = regexp.MustCompile(`(?m)\n+`)

	// ReSpaces matches a run of one or more whitespace characters.
	ReSpaces = regexp.MustCompile(`(?m)\s+`)

	// ReTag matches any single tag, from '<' to the nearest '>'.
	ReTag = regexp.MustCompile(`(?ims)<.*?>`)

	// ReImg matches an img tag.
	ReImg = regexp.MustCompile(`(?ims)<img.*?>`)

	//ReImgSrc       = regexp.MustCompile(`(?ims)<img.+?src=\s*?"(.+?)"|'(.+?)'.*?>`)
	//ReImgSrc = regexp.MustCompile(`(?ims).+?src=\s*?"(.+?)"|'(.+?)'`)

	// ReImgSrc captures the value of a src (or data-original) attribute.
	// Group 1 holds a double-quoted value, group 2 a single-quoted one.
	// The two quote styles are grouped after the '=' so that the
	// single-quote alternative can only match the attribute value itself;
	// without the group, any single-quoted string in the tag (for example
	// alt='x') matched as if it were the image source.
	ReImgSrc = regexp.MustCompile(`(?ims)(?:.+?src|data-original)=\s*?(?:"(.+?)"|'(.+?)')`)

	// ReTitle captures the content of the title element.
	ReTitle = regexp.MustCompile(`(?ims)<title.*?>(.+?)</title>`)

	// ReH captures the content of a heading element (h followed by digits).
	ReH = regexp.MustCompile(`(?ims)<h\d+.*?>(.*?)</h\d+>`)

	// ReHead captures the content of the head element.
	ReHead = regexp.MustCompile(`(?ims)<head.*?>(.*?)<\/head>`)

	// MonthStr is a pattern fragment matching an English month name by its
	// three-letter prefix followed by any further letters.
	MonthStr = `(?:(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*)`

	// ReDate matches dates in several layouts: month-name day year,
	// day month-name year, month-name day, numeric year-month-day,
	// numeric day/month followed by a year, and the Chinese 年/月/日 form.
	ReDate = regexp.MustCompile(`(?is)((?:` + MonthStr + `[\.,\-\s]*\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*(\d{4}))|` +
		`(?:\d{1,2}(?:st|nd|rd|th)*[\.,\-\s]*` + MonthStr + `[\.,\-\s]*(\d{4}))|` +
		MonthStr + `.\d{1,2}|` +
		`(?:(19|20)\d{2}[^0-9]\d{1,2}[^0-9]\d{1,2})|` +
		`(?:\d{1,2}[^0-9]\d{1,2}[^0-9](19|20)\d{2})|` +
		`(?:(\d{4}年){0,1}\d{1,2}月\d{1,2}日))`)

	// ReTime matches clock times: hours and minutes separated by ':' with
	// optional seconds and an optional am/pm suffix, or hours and minutes
	// separated by '.' or whitespace when an am/pm suffix is present.
	ReTime = regexp.MustCompile(`(?is)((?:0?|[12])\d\s*:+\s*[0-5]\d(?:\s*:+\s*[0-5]\d)?(?:\s*[,:.]*\s*(?:am|pm))?|` +
		`(?:0?|[12])\d\s*[.\s]+\s*[0-5]\d(?:\s*[,:.]*\s*(?:am|pm))+)`)

	// ReFavicon captures the href of a self-closing
	// <link rel="shortcut icon" href="..."> tag written in exactly that
	// attribute order.
	ReFavicon = regexp.MustCompile(`(?ims)<link rel="shortcut icon" href="(.+?)".*?/>`)

	//ReTitleNoNoisy = regexp.MustCompile(`(?ims)^[^|\-/•—_]+`)
	//ReTitleNoNoisy = regexp.MustCompile(`(?ims).*?——+.*|^[^|\-/•—_]+`)

	// ReTitleNoNoisy matches an optional prefix ending in a run of two or
	// more '—' or '-' characters, followed by a run of characters that are
	// not one of | - / • — _.
	// NOTE(review): presumably used to cut site-name suffixes off page
	// titles - confirm against the implementation.
	ReTitleNoNoisy = regexp.MustCompile(`(?ims)(.*?(——+|--+))?[^|\-/•—_]+`)

	// IgnoreImgs is a set of image sources to skip.
	// NOTE(review): this listing showed two entries whose keys were both the
	// empty string, which Go rejects as a duplicate key in a map literal.
	// The original keys appear to have been lost; restore them from the
	// upstream source if they are available.
	IgnoreImgs = map[string]bool{
		"": true,
	}
)
View Source
var (
	// ReMeta matches a meta tag, from "<meta" to the nearest '>'.
	ReMeta = regexp.MustCompile(`(?ims)<meta.*?>`)

	// ReKV captures one key=value attribute pair. Group 1 is the key,
	// group 2 a double-quoted value and group 3 a single-quoted value.
	// The two quote styles are grouped after the '=' so that a single-quoted
	// value is matched together with its key; without the group, a
	// single-quoted string matched on its own and the key was left empty.
	ReKV = regexp.MustCompile(`(?ims)([^\s]+?)\s*?=\s*?(?:"(.+?)"|'(.+?)')`)
)

Functions

func FilterControlChar

func FilterControlChar(in string) string

func InfoFromMeta

func InfoFromMeta(meta []map[string]string) (string, string, []string)

func ParseTime

func ParseTime(tz string, s string) time.Time

func RawMeta

func RawMeta(raw string) []map[string]string

func TextFromHTML

func TextFromHTML(rawhtml string) string

Types

type Doc

// Doc is the result of extracting an article page. It is returned by Parse
// and ParsePro and serialises to JSON using the tags below.
type Doc struct {
	// Page addresses.
	// NOTE(review): the meaning of From is not visible in this listing.
	Url          string `json:"url"`
	From         string `json:"from"`
	CanonicalUrl string `json:"canonical_url"`

	// Title and main content: Text is plain text, Html keeps basic HTML
	// formatting.
	Title string `json:"title"`
	Text  string `json:"text"`
	Html  string `json:"html"`

	// Article attributes: language, country code (Location), favicon,
	// images used in the article, tags and author.
	Language string   `json:"language"`
	Location string   `json:"location"`
	Favicon  string   `json:"favicon"`
	Images   []string `json:"images"`
	Tags     string   `json:"tags"`
	Author   string   `json:"author"`

	// Publish time, as a string and as a time.Time value.
	Published       string    `json:"published"`
	PublishedParsed time.Time `json:"published_parsed"`

	// Debug is left out of the JSON output when empty.
	Debug map[string]interface{} `json:"debug,omitempty"`
}

func Parse

func Parse(rawurl, rawHtml string) *Doc

func ParsePro

func ParsePro(rawurl, rawHtml, ip string, debug bool) *Doc

type Meta

// Meta groups four string values returned by GetMeta. Each field serialises
// to JSON under the lower-case form of its name.
type Meta struct {
	// Keywords is serialised as "keywords".
	Keywords string `json:"keywords"`

	// Tags is serialised as "tags".
	Tags string `json:"tags"`

	// Description is serialised as "description".
	Description string `json:"description"`

	// Author is serialised as "author".
	Author string `json:"author"`
}

func GetMeta

func GetMeta(meta []map[string]string) *Meta

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL