gndoc

package module
v0.3.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 13, 2022 License: MIT Imports: 9 Imported by: 2

README

GNdoc

GNdoc is a library for extracting the content of a large variety of files into UTF8-encoded text format.

Install

go get github.com/gnames/gndoc

Usage

import (
  "fmt"
  "log"
  "path/filepath"
  "strings"

  "github.com/gnames/gndoc"
)

// Example extracts text from a PDF file, from a plain UTF8-encoded text file,
// and from a web page, printing after each extraction whether the returned
// text contains an expected phrase. It stops via log.Fatal on the first error.
//
// tikaURL is not declared in this snippet and must be provided by the
// surrounding package. The trailing Output comment is kept inside the
// function body so that go test runs the example and compares what it prints.
func Example() {
  gnd := gndoc.New(tikaURL)

  // Path elements are passed separately, the conventional filepath.Join form.
  path := filepath.Join("testdata", "file.pdf")
  // plainInput is false: the PDF is not plain UTF8-encoded text.
  txt, _, err := gnd.TextFromFile(path, false)
  if err != nil {
    log.Fatal(err)
  }
  hasText := strings.Contains(txt, "sabana de Bogotá")
  fmt.Printf("%v\n", hasText)

  path = filepath.Join("testdata", "utf8.txt")
  // plainInput is true: the file is already plain UTF8-encoded text.
  txt, _, err = gnd.TextFromFile(path, true)
  if err != nil {
    log.Fatal(err)
  }
  hasText = strings.Contains(txt, "Holarctic genus")
  fmt.Printf("%v\n", hasText)

  url := "https://example.org"
  txt, _, err = gnd.TextFromURL(url)
  if err != nil {
    log.Fatal(err)
  }
  hasText = strings.Contains(txt, "Example")
  fmt.Printf("%v\n", hasText)

  // Output:
  // true
  // true
  // true
}

Documentation

Overview

Example
package main

import (
	"fmt"
	"log"
	"path/filepath"
	"strings"

	"github.com/gnames/gndoc"
)

// tikaURL is the address handed to gndoc.New in main.
// NOTE(review): presumably the endpoint of a Tika conversion service — confirm
// against the gndoc documentation.
var tikaURL = "https://tika.globalnames.org"

// main builds a GNdoc from tikaURL and runs three extractions in order: a PDF
// file, a plain UTF8-encoded text file, and a web page. After each one it
// prints whether the extracted text contains an expected phrase; the first
// error terminates the program through log.Fatal.
func main() {
	extractor := gndoc.New(tikaURL)

	// report terminates the program on a non-nil error; otherwise it prints
	// whether body contains phrase.
	report := func(body string, err error, phrase string) {
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("%v\n", strings.Contains(body, phrase))
	}

	// plainInput is false: the PDF is not plain UTF8-encoded text.
	pdfText, _, err := extractor.TextFromFile(filepath.Join("testdata/file.pdf"), false)
	report(pdfText, err, "sabana de Bogotá")

	// plainInput is true: the file is already plain UTF8-encoded text.
	plainText, _, err := extractor.TextFromFile(filepath.Join("testdata/utf8.txt"), true)
	report(plainText, err, "Holarctic genus")

	pageText, _, err := extractor.TextFromURL("https://example.org")
	report(pageText, err, "Example")
}
Output:

true
true
true

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type GNdoc

// GNdoc is the main interface of the GNdoc library for converting a great
// variety of files into UTF8-encoded texts.
type GNdoc interface {
	// TextFromFile takes a path to a file and a boolean indicating whether the
	// file is plain UTF8-encoded text, and returns the converted UTF8-encoded
	// text, the elapsed time in seconds, or an error.
	TextFromFile(path string, plainInput bool) (string, float32, error)

	// TextFromURL takes a URL to a page, reads its content, and converts it
	// into plain UTF8-encoded text. If it succeeds, it returns the text, the
	// time it spent on conversion, and a nil error. If it does not succeed, it
	// returns an empty string and an error.
	TextFromURL(url string) (string, float32, error)

	// GetText takes an io.Reader (for example, an opened file) and returns
	// the UTF8-encoded textual content of the input.
	GetText(io.Reader) (string, error)

	// Text returns the UTF8-encoded textual content of a file, if it was
	// already received by running other methods.
	Text() string
}

GNdoc is the main interface of the GNdoc library for converting a great variety of files into UTF8-encoded texts.

func New

func New(tikaURL string) GNdoc

Directories

Path Synopsis
io

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL