rebuild

package
v0.2.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 25, 2021 License: MIT Imports: 23 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// DataSourcesInf supplies hard-coded metadata that is missing from the
// data_sources table. Keys are integer data-source IDs (presumably the same
// IDs as DataSource.ID -- confirm); values are DataSourceInf records. Fields
// omitted from an entry keep their zero value.
//
// OutlinkURL is a template with a '{}' placeholder. OutlinkID derives, from a
// NameInf, the string that is presumably substituted for that placeholder;
// the substitution itself happens outside this declaration.
//
// NOTE(review): several entries set IsOutlinkReady to true without providing
// OutlinkURL/OutlinkID (9, 11, 155, 172, 173, 179, 181, 183, 185, 187, 189).
// Confirm that consumers tolerate an empty template and a nil OutlinkID.
var DataSourcesInf = map[int]DataSourceInf{
	// Catalogue of Life: the outlink ID is the record ID.
	1: {
		Title:          "Catalogue of Life",
		TitleShort:     "Catalogue of Life",
		UUID:           "d4df2968-4257-4ad9-ab81-bedbbfb25e2a",
		HomeURL:        "https://www.catalogueoflife.org/",
		DataURL:        "http://www.catalogueoflife.org/DCA_Export/archive.php",
		IsOutlinkReady: true,
		OutlinkURL:     "https://www.catalogueoflife.org/data/taxon/{}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// Wikispecies: the outlink ID is the full canonical form with every space
	// replaced by an underscore.
	2: {
		TitleShort:     "Wikispecies",
		UUID:           "68923690-0727-473c-b7c5-2ae9e601e3fd",
		HomeURL:        "https://species.wikimedia.org/wiki/Main_Page",
		IsOutlinkReady: true,
		DataURL: "http://dumps.wikimedia.org/specieswiki/latest/" +
			"specieswiki-latest-pages-articles.xml.bz2",
		OutlinkURL: "http://species.wikimedia.org/wiki/{}",
		OutlinkID: func(n NameInf) string {
			return strings.ReplaceAll(n.CanonicalFull, " ", "_")
		},
	},
	// ITIS: the outlink ID is the record ID.
	3: {
		Title:          "Integrated Taxonomic Information System",
		TitleShort:     "ITIS",
		UUID:           "5d066e84-e512-4a2f-875c-0a605d3d9f35",
		HomeURL:        "https://www.itis.gov/",
		DataURL:        "https://www.itis.gov/downloads/itisMySQLTables.tar.gz",
		IsOutlinkReady: true,
		OutlinkURL:     "https://www.itis.gov/servlet/SingleRpt/SingleRpt?search_topic=TSN&search_value={}#null",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// NCBI: the outlink ID is the simple canonical form passed through
	// url.PathEscape.
	// NOTE(review): the placeholder sits in the query string ("name={}") but
	// the value is path-escaped; confirm url.QueryEscape is not what is wanted.
	4: {
		Title:          "National Center for Biotechnology Information",
		TitleShort:     "NCBI",
		UUID:           "97d7633b-5f79-4307-a397-3c29402d9311",
		HomeURL:        "https://www.ncbi.nlm.nih.gov/",
		DataURL:        "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz",
		IsOutlinkReady: true,
		OutlinkURL: "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?" +
			"mode=Undef&name={}&lvl=0&srchmode=1&keep=1&unlock",
		OutlinkID: func(n NameInf) string {
			return url.PathEscape(n.Canonical)
		},
	},
	// Index Fungorum: the outlink ID is the record ID. No DataURL is given.
	5: {
		Title:          "Index Fungorum: Species Fungorum",
		TitleShort:     "Index Fungorum",
		UUID:           "af06816a-0b28-4a09-8219-bd1d63289858",
		HomeURL:        "http://www.speciesfungorum.org",
		IsOutlinkReady: true,
		OutlinkURL:     "http://www.indexfungorum.org/Names/NamesRecord.asp?RecordID={}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// IRMNG (old): IsOutlinkReady is left at its zero value (false).
	8: {
		TitleShort: "IRMNG (old)",
		UUID:       "f8e586aa-876e-4b0a-ab89-da0b4a64c19a",
		HomeURL:    "https://irmng.org/",
	},
	// WoRMS: flagged outlink-ready, but no outlink template is defined here.
	9: {
		TitleShort:     "WoRMS",
		UUID:           "bf077d91-673a-4be4-8af9-76db45d07e98",
		IsOutlinkReady: true,
		HomeURL:        "https://marinespecies.org",
	},
	// Freebase.
	// NOTE(review): this UUID is identical to the one of entry 155 (FishBase);
	// confirm which entry it really belongs to.
	10: {
		TitleShort: "Freebase",
		UUID:       "bacd21f0-44e0-43e2-914c-70929916f257",
	},
	// GBIF Backbone Taxonomy: flagged outlink-ready, but no outlink template
	// is defined here.
	11: {
		Title:          "Global Biodiversity Information Facility Backbone Taxonomy",
		TitleShort:     "GBIF Backbone Taxonomy",
		UUID:           "eebb6f49-e1a1-4f42-b9d5-050844c893cd",
		IsOutlinkReady: true,
		HomeURL:        "https://www.gbif.org/dataset/d7dddbf4-2cf0-4f39-9b2a-bb099caae36c",
	},
	// EOL: the outlink ID is the record ID.
	12: {
		TitleShort:     "EOL",
		UUID:           "dba5f880-a40d-479b-a1ad-a646835edde4",
		HomeURL:        "https://eol.org",
		DataURL:        "https://eol.org/data/provider_ids.csv.gz",
		IsOutlinkReady: true,
		OutlinkURL:     "https://eol.org/pages/{}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// Entries 113-149 carry titles only.
	113: {
		Title:      "Zoological names",
		TitleShort: "Zoological names",
	},
	// NOTE(review): "Tansania" here and in entry 119 may be a misspelling of
	// "Tanzania"; the strings are runtime data and are left unchanged.
	117: {
		Title:      "Birds of Tansania",
		TitleShort: "Birds of Tansania",
	},
	119: {
		Title:      "Tansania Plant Specimens",
		TitleShort: "Tansania Plant Specimens",
	},
	142: {
		Title:      "The Clements Checklist of Birds of the World",
		TitleShort: "The Clements Checklist of Birds",
	},
	147: {
		TitleShort: "VASCAN",
	},
	149: {
		Title:      "Ocean Biodiversity Information System",
		TitleShort: "OBIS",
	},
	// FishBase: flagged outlink-ready, but no outlink template is defined here.
	// NOTE(review): this UUID is identical to the one of entry 10 (Freebase);
	// confirm which entry it really belongs to.
	155: {
		TitleShort:     "FishBase",
		UUID:           "bacd21f0-44e0-43e2-914c-70929916f257",
		IsOutlinkReady: true,
		HomeURL:        "https://www.fishbase.in/home.htm",
	},
	// Tropicos: carries a Description but no UUID, HomeURL or DataURL. The
	// outlink ID is the record ID.
	165: {
		TitleShort: "Tropicos",
		Description: "The Tropicos database links over 1.33M scientific names " +
			"with over 4.87M specimens and over 685K digital images. The data " +
			"includes over 150K references from over 52.6K publications offered " +
			"as a free service to the world’s scientific community.",
		IsOutlinkReady: true,
		OutlinkURL:     "https://tropicos.org/name/{}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// IPNI: the outlink ID is the record ID.
	167: {
		TitleShort:     "IPNI",
		UUID:           "6b3905ce-5025-49f3-9697-ddd5bdfb4ff0",
		HomeURL:        "https://www.ipni.org/",
		IsOutlinkReady: true,
		OutlinkURL:     "https://www.ipni.org/n/{}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// ION: the outlink ID is the record ID.
	168: {
		TitleShort:     "ION",
		UUID:           "1137dfa3-5b8c-487d-b497-dc0938605864",
		HomeURL:        "http://organismnames.com/",
		IsOutlinkReady: true,
		OutlinkURL:     "http://www.organismnames.com/details.htm?lsid={}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// Arctos: the outlink ID is the simple canonical form passed through
	// url.QueryEscape.
	// NOTE(review): the placeholder is a path segment ("/name/{}") but the
	// value is query-escaped, whereas entry 186 uses url.PathEscape for a
	// template of the same shape; confirm which escaping is intended.
	170: {
		TitleShort:     "Arctos",
		UUID:           "eea8315d-a244-4625-859a-226675622312",
		HomeURL:        "https://arctosdb.org/",
		IsOutlinkReady: true,
		OutlinkURL:     "https://arctos.database.museum/name/{}",
		OutlinkID: func(n NameInf) string {
			return url.QueryEscape(n.Canonical)
		},
	},
	// PaleoBioDB: flagged outlink-ready, but no outlink template is defined
	// here.
	172: {
		TitleShort:     "PaleoBioDB",
		UUID:           "fad9970e-c358-4e1b-8cc3-f9ad2582751f",
		HomeURL:        "https://paleobiodb.org/#/",
		IsOutlinkReady: true,
	},
	// The Reptile DataBase: flagged outlink-ready, but no outlink template is
	// defined here.
	173: {
		TitleShort:     "The Reptile DataBase",
		UUID:           "c24e0905-4980-4e1d-aff2-ee0ef54ea1f8",
		HomeURL:        "http://reptile-database.org/",
		IsOutlinkReady: true,
	},
	// Mammal Species of the World: the outlink ID is the local ID.
	174: {
		TitleShort:     "Mammal Species of the World",
		UUID:           "464dafec-1037-432d-8449-c0b309e0a030",
		HomeURL:        "https://www.departments.bucknell.edu/biology/resources/msw3/",
		DataURL:        "https://www.departments.bucknell.edu/biology/resources/msw3/export.asp",
		IsOutlinkReady: true,
		OutlinkURL:     "https://www.departments.bucknell.edu/biology/resources/msw3/browse.asp?s=y&id={}",
		OutlinkID: func(n NameInf) string {
			return n.LocalID
		},
	},
	// BirdLife International: the outlink ID is the simple canonical form
	// passed through url.PathEscape.
	// NOTE(review): the placeholder sits in the query string ("kw={}") but the
	// value is path-escaped; confirm url.QueryEscape is not what is wanted.
	175: {
		TitleShort:     "BirdLife International",
		UUID:           "b1d8de7a-ab96-455f-acd8-f3fff2d7d169",
		HomeURL:        "http://www.birdlife.org/",
		DataURL:        "http://datazone.birdlife.org/species/taxonomy",
		IsOutlinkReady: true,
		OutlinkURL:     "http://datazone.birdlife.org/species/results?thrlev1=&thrlev2=&kw={}",
		OutlinkID: func(n NameInf) string {
			return url.PathEscape(n.Canonical)
		},
	},
	// Open Tree of Life: flagged outlink-ready, but no outlink template is
	// defined here.
	179: {
		TitleShort:     "Open Tree of Life",
		UUID:           "e10865e2-cdd9-4f97-912f-08f3d5ef49f7",
		IsOutlinkReady: true,
		HomeURL:        "https://tree.opentreeoflife.org/",
		DataURL:        "https://files.opentreeoflife.org/ott/",
	},
	// IRMNG: flagged outlink-ready, but no outlink template is defined here.
	181: {
		TitleShort:     "IRMNG",
		UUID:           "417454fa-a0a1-4b9c-814d-edc0f4f25ad8",
		IsOutlinkReady: true,
		HomeURL:        "https://irmng.org/",
		DataURL:        "https://irmng.org/export/",
	},
	// Sherborn Index Animalium: flagged outlink-ready, but no outlink template
	// is defined here.
	183: {
		TitleShort:     "Sherborn Index Animalium",
		UUID:           "05ad6ca2-fc37-47f4-983a-72e535420e28",
		IsOutlinkReady: true,
		HomeURL:        "https://www.sil.si.edu/DigitalCollections/indexanimalium/taxonomicnames/",
		DataURL: "https://www.sil.si.edu/DigitalCollections/indexanimalium/" +
			"Datasets/2006.01.06.TaxonomicData.csv",
	},
	// ASM Mammal Diversity DB: the outlink ID is the accepted record ID, not
	// the record ID of the name itself.
	184: {
		TitleShort:     "ASM Mammal Diversity DB",
		UUID:           "94270cdd-5424-4bb1-8324-46ccc5386dc7",
		HomeURL:        "https://mammaldiversity.org/",
		DataURL:        "https://mammaldiversity.org/",
		IsOutlinkReady: true,
		OutlinkURL:     "https://mammaldiversity.org/species-account/species-id={}",
		OutlinkID: func(n NameInf) string {
			return n.AcceptedRecordID
		},
	},
	// IOC World Bird List: flagged outlink-ready, but no outlink template is
	// defined here.
	185: {
		TitleShort:     "IOC World Bird List",
		UUID:           "6421ffec-38e3-40fb-a6d9-af27238a47a1",
		IsOutlinkReady: true,
		HomeURL:        "https://www.worldbirdnames.org/",
		DataURL:        "https://www.worldbirdnames.org/ioc-lists/master-list-2/",
	},
	// MCZbase: the outlink ID is the simple canonical form passed through
	// url.PathEscape; the placeholder is a path segment.
	186: {
		TitleShort:     "MCZbase",
		UUID:           "c79d055b-211b-40de-8e27-618011656265",
		IsOutlinkReady: true,
		HomeURL:        "https://mczbase.mcz.harvard.edu/",
		OutlinkURL:     "https://mczbase.mcz.harvard.edu/name/{}",
		OutlinkID: func(n NameInf) string {
			return url.PathEscape(n.Canonical)
		},
	},
	// Clements' Birds of the World: flagged outlink-ready, but no outlink
	// template is defined here.
	187: {
		TitleShort:     "Clements' Birds of the World",
		UUID:           "577c0b56-4a3c-4314-8724-14b304f601de",
		IsOutlinkReady: true,
		HomeURL:        "https://www.birds.cornell.edu/clementschecklist/",
		DataURL:        "https://www.birds.cornell.edu/clementschecklist/download/",
	},
	// American Ornithological Society: the outlink ID is the record ID.
	188: {
		TitleShort:     "American Ornithological Society",
		UUID:           "91d38806-8435-479f-a18d-705e5cb0767c",
		HomeURL:        "https://americanornithology.org/",
		IsOutlinkReady: true,
		DataURL:        "https://checklist.americanornithology.org/taxa.csv",
		OutlinkURL:     "https://checklist.americanornithology.org/taxa/{}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
	// Howard & Moore Birds of the World: flagged outlink-ready, but no outlink
	// template is defined here.
	189: {
		TitleShort:     "Howard & Moore Birds of the World",
		UUID:           "85023fe5-bf2a-486b-bdae-3e61cefd41fd",
		HomeURL:        "https://www.howardandmoore.org/",
		IsOutlinkReady: true,
		DataURL:        "https://www.howardandmoore.org/howard-and-moore-database/",
	},
	// Plazi: the outlink ID is the local ID.
	194: {
		TitleShort:     "Plazi",
		UUID:           "68938dc9-b93d-43bc-9d51-5c2a632f136f",
		HomeURL:        "https://www.plazi.org/",
		IsOutlinkReady: true,
		DataURL:        "http://tb.plazi.org/GgServer/xml.rss.xml",
		OutlinkURL:     "http://tb.plazi.org/GgServer/html/{}",
		OutlinkID: func(n NameInf) string {
			return n.LocalID
		},
	},
	// AlgaeBase: the outlink ID is the record ID.
	195: {
		TitleShort:     "AlgaeBase",
		UUID:           "a5869bfb-7cbf-40f2-88d3-962922dac43f",
		HomeURL:        "https://www.algaebase.org/",
		IsOutlinkReady: true,
		OutlinkURL:     "https://www.algaebase.org/search/species/detail/?species_id={}",
		OutlinkID: func(n NameInf) string {
			return n.RecordID
		},
	},
}

DataSourcesInf provides missing data for data_sources table.

Functions

func QuoteString

func QuoteString(s string) string

QuoteString makes a string value compatible with SQL syntax by wrapping it in quotes and escaping internal quotes.

Types

type Canonical

// Canonical is a 'simple' canonical form of a name-string.
type Canonical struct {
	// ID is the UUID v5 generated for the simple canonical form. It is the
	// primary key and is not auto-incremented.
	ID string `gorm:"type:uuid;primary_key;auto_increment:false"`
	// Name is the canonical name-string. It is required (not null) and is
	// covered by the index named canonical_name.
	Name string `gorm:"type:varchar(255);index:canonical_name;not null"`
}

Canonical is a 'simple' canonical form.

type CanonicalData

// CanonicalData groups data about the various canonical forms of a single
// name-string.
//
// NOTE(review): the field descriptions below are inferred from the field
// names and from the Canonical, CanonicalFull and CanonicalStem types;
// confirm against the code that fills this struct.
type CanonicalData struct {
	// ID is presumably the UUID of the simple canonical form.
	ID string
	// Value is presumably the simple canonical form itself.
	Value string
	// FullID is presumably the UUID of the full canonical form.
	FullID string
	// FullValue is presumably the full canonical form itself.
	FullValue string
	// StemID is presumably the UUID of the stemmed canonical form.
	StemID string
	// StemValue is presumably the stemmed canonical form itself.
	StemValue string
	// Cardinality is presumably the number of elements in the name
	// (see NameString.Cardinality).
	Cardinality int
}

CanonicalData provides data about various canonical forms of a name-string.

type CanonicalFull

// CanonicalFull is a full canonical form of a name-string.
type CanonicalFull struct {
	// ID is the UUID v5 generated for the 'full' canonical form (with
	// infraspecific ranks and hybrid signs for named hybrids). It is the
	// primary key and is not auto-incremented.
	ID string `gorm:"type:uuid;primary_key;auto_increment:false"`
	// Name is the full canonical name-string; it is required (not null).
	Name string `gorm:"type:varchar(255);not null"`
}

CanonicalFull is a full canonical form.

type CanonicalStem

// CanonicalStem is a stemmed derivative of a simple canonical form.
type CanonicalStem struct {
	// ID is the UUID v5 for the stemmed derivative of a simple canonical
	// form. It is the primary key and is not auto-incremented.
	ID string `gorm:"type:uuid;primary_key;auto_increment:false"`
	// Name is the stemmed canonical name-string; it is required (not null).
	Name string `gorm:"type:varchar(255);not null"`
}

CanonicalStem is a stemmed derivative of a simple canonical form.

type DataSource

// DataSource describes the metadata of a dataset.
type DataSource struct {
	// ID is a hard-coded identifier that corresponds to the historic IDs
	// given by old versions of resolver. It is the primary key and is not
	// auto-incremented.
	ID int `gorm:"type:smallint;primary_key;auto_increment:false"`
	// UUID is assigned to the resource during creation. It is not displayed
	// to users, but is important for data import from DwCA files. The column
	// defaults to the all-zero UUID.
	UUID string `gorm:"type:uuid;default:'00000000-0000-0000-0000-000000000000'"`
	// Title is the long title; it tries to follow the name of the dataset
	// given by its creators.
	Title string `gorm:"type:varchar(255)"`
	// TitleShort is a shortened/abbreviated title.
	TitleShort string `gorm:"type:varchar(50)"`
	// Version is the dataset version; only some datasets have versions.
	Version string `gorm:"type:varchar(50)"`
	// RevisionDate is the time when the dataset was created.
	// It follows the format 'YYYY-MM-DD' || 'YYYY-MM' || 'YYYY'.
	RevisionDate string
	// DOI is the DOI of the dataset (if it exists).
	DOI string `gorm:"type:varchar(50)"`
	// Citation is a reference that can be used to cite the dataset.
	Citation string
	// Authors lists the authors of the dataset.
	Authors string
	// Description describes the dataset. It might include unstructured
	// metainformation as well.
	Description string
	// WebsiteURL is the home URL for the dataset.
	WebsiteURL string `gorm:"type:varchar(255)"`
	// DataURL is the original URL used to download the dataset.
	DataURL string `gorm:"type:varchar(255)"`
	// OutlinkURL is a template for creating an outlink for a dataset record.
	// It contains a placeholder '{}' for the record's OutlinkID.
	OutlinkURL string
	// IsOutlinkReady means that the data-source has enough metainformation,
	// URLs and harvests to be generally good to be pointed out as a 'mature'
	// data-source at gnames. Resources that were harvested too long ago or
	// that do not have WebsiteURL/OutlinkURLs would normally have this flag
	// set to false.
	IsOutlinkReady bool
	// IsCurated is true if a dataset undergoes significant manual curation.
	IsCurated bool
	// IsAutoCurated is true if a dataset undergoes significant automatic
	// curation by scripts.
	IsAutoCurated bool
	// RecordCount is the number of records in a dataset.
	RecordCount int
	// UpdatedAt is the timestamp of the last import of the dataset. It
	// usually does not correspond to the time when the dataset was created.
	UpdatedAt time.Time `gorm:"type:timestamp without time zone"`
}

DataSource describes metadata of a dataset.

type DataSourceInf

// DataSourceInf provides fields associated with a DataSource. It is the value
// type of the DataSourcesInf map.
type DataSourceInf struct {
	// Title is the long title of the dataset.
	Title string
	// TitleShort is the shortened/abbreviated title.
	TitleShort string
	// Description is a free-text description of the dataset.
	Description string
	// UUID is the UUID of the dataset.
	UUID string
	// HomeURL is the home page of the dataset.
	// NOTE(review): presumably copied into DataSource.WebsiteURL -- confirm.
	HomeURL string
	// DataURL is the URL the dataset is downloaded from.
	DataURL string
	// IsOutlinkReady carries the same flag as DataSource.IsOutlinkReady.
	IsOutlinkReady bool
	// OutlinkURL is an outlink template containing a '{}' placeholder.
	OutlinkURL string
	// OutlinkID derives from a NameInf the identifier used for the outlink.
	// It is nil for entries that do not define it.
	OutlinkID func(n NameInf) string
}

DataSourceInf provides fields associated with a DataSource

type NameInf

// NameInf provides fields associated with a name-string in a particular data
// source. It is the argument of DataSourceInf.OutlinkID.
//
// NOTE(review): the field descriptions mirror the identically named fields
// of NameStringIndex and the canonical types; confirm against the code that
// fills this struct.
type NameInf struct {
	// RecordID is the ID of the record in the data source.
	RecordID string
	// AcceptedRecordID is the RecordID of the currently accepted name.
	AcceptedRecordID string
	// LocalID is the local ID from the dataset.
	LocalID string
	// GlobalID is the global ID from the dataset.
	GlobalID string
	// Canonical is the simple canonical form of the name.
	Canonical string
	// CanonicalFull is the full canonical form of the name.
	CanonicalFull string
}

NameInf provides fields associated with a name-string in a particular data source.

type NameString

// NameString is a name-string extracted from a dataset.
type NameString struct {
	// ID is the UUID v5 generated from the name-string using
	// DNS:"globalnames.org" as a seed. It is the primary key and is not
	// auto-incremented.
	ID string `gorm:"type:uuid;primary_key;auto_increment:false"`
	// Name is the name-string with authorships and annotations as it is given
	// by a dataset. Sometimes an authorship is concatenated with a
	// name-string by our import scripts.
	Name string `gorm:"type:varchar(255);not null"`
	// Year is the year when a name was published; it is nullable.
	Year sql.NullInt16 `gorm:"type:int"`
	// Cardinality is the number of elements in a 'classic' Linnaean name:
	// 0 - unknown, not available, 1 - uninomial, 2 - binomial,
	// 3 - trinomial etc.
	// Cardinality can be used to filter out surrogates and hybrid formulas --
	// they would have cardinality 0.
	Cardinality sql.NullInt32 `gorm:"type:int"`
	// CanonicalID is the UUID v5 generated for the simple canonical form.
	CanonicalID sql.NullString `gorm:"type:uuid;index:canonical"`
	// CanonicalFullID is the UUID v5 generated for the 'full' canonical form
	// (with infraspecific ranks and hybrid signs for named hybrids).
	CanonicalFullID sql.NullString `gorm:"type:uuid;index:canonical_full"`
	// CanonicalStemID is the UUID v5 for the stemmed derivative of a simple
	// canonical form.
	CanonicalStemID sql.NullString `gorm:"type:uuid;index:canonical_stem"`
	// Virus indicates if a name-string seems to be virus-like.
	Virus bool `gorm:"type:bool"`
	// Bacteria is true if the parser marks a name as belonging to the
	// Bacterial Code.
	Bacteria bool `gorm:"type:bool;not null;default:false"`
	// Surrogate indicates if a name-string is a surrogate name.
	Surrogate bool `gorm:"type:bool"`
	// ParseQuality is a numeric representation of the quality of parsing:
	// 0 - no parse, 1 - clear parse, 2 - some problems, 3 - big problems.
	ParseQuality int `gorm:"type:int;not null;default:0"`
}

NameString is a name-string extracted from a dataset.

type NameStringIndex

// NameStringIndex relates a name-string to a dataset. DataSourceID, RecordID
// and NameStringID are all tagged as primary-key columns.
type NameStringIndex struct {
	// DataSourceID is the dataset ID.
	DataSourceID int `gorm:"primary_key;auto_increment:false"`
	// RecordID is the unique ID of the record. We do our best to get it from
	// the record IDs, either global or local, but if all fails, the ID is
	// assigned by gnames in the format 'gn_{int}'.
	RecordID string `gorm:"type:varchar(255);primary_key;auto_increment:false"`
	// NameStringID is the UUID5 of a full name-string from the dataset.
	NameStringID string `gorm:"type:uuid;index:name_string_id;primary_key;auto_increment:false"`
	// OutlinkID is the ID used to create an outlink.
	OutlinkID string `gorm:"type:varchar(255)"`
	// GlobalID is the global ID from the dataset.
	GlobalID string `gorm:"type:varchar(255)"`
	// LocalID is the local ID from the dataset.
	LocalID string `gorm:"type:varchar(255)"`
	// CodeID is the nomenclatural code ID:
	// 0 - no info, 1 - ICZN, 2 - ICN, 3 - ICNP, 4 - ICTV.
	CodeID int `gorm:"type:smallint"`
	// Rank is the rank of the name.
	Rank string `gorm:"type:varchar(255)"`
	// AcceptedRecordID is the RecordID of the currently accepted name-string
	// for the taxon.
	AcceptedRecordID string `gorm:"type:varchar(255);index:accepted_record_id"`
	// Classification is a pipe-delimited string containing the classification
	// supplied with the resource.
	Classification string
	// ClassificationIDs holds the RecordIDs of the classification elements
	// (if given).
	ClassificationIDs string
	// ClassificationRanks holds the ranks of the classification elements.
	ClassificationRanks string
}

NameStringIndex describes the relation of a name-string to a dataset.

type ParsedData added in v0.2.0

// ParsedData holds an ID together with the simple and full canonical forms.
//
// NOTE(review): the package documentation gives no description for this
// type; the field notes below are inferred from the names only -- confirm.
type ParsedData struct {
	// ID is presumably the identifier of the parsed name-string.
	ID string
	// CanonicalSimple is presumably the simple canonical form.
	CanonicalSimple string
	// CanonicalFull is presumably the full canonical form.
	CanonicalFull string
}

type PgDB

// PgDB holds the settings used by its methods (Migrate, NewDb, NewDbGorm,
// ResetDB) to reach the database.
//
// NOTE(review): the field meanings below are inferred from the names only;
// confirm against the code that builds the connection string.
type PgDB struct {
	// PgHost is presumably the database host.
	PgHost string
	// PgUser is presumably the database user.
	PgUser string
	// PgPass is presumably the password of PgUser.
	PgPass string
	// PgDB is presumably the name of the database.
	PgDB string
}

func (PgDB) Migrate

func (pdb PgDB) Migrate() error

Migrate creates all the tables and indices in the database.

func (PgDB) NewDb

func (pdb PgDB) NewDb() *sql.DB

NewDb creates a database handler from the standard sql package. We use it to speed up import of the data.

func (PgDB) NewDbGorm

func (pdb PgDB) NewDbGorm() *gorm.DB

NewDbGorm creates a database handler from the GORM library. We use it to simplify the migration process.

func (PgDB) ResetDB

func (pdb PgDB) ResetDB() error

ResetDB deletes old database and its public schema and sets up a new schema with correct owner.

type Rebuild

// Rebuild provides configuration for the database rebuilding process.
//
// NOTE(review): the field meanings below are inferred from the names only;
// confirm against NewRebuild and the Upload* methods.
type Rebuild struct {
	// PgDB is embedded, so its fields and methods are promoted to Rebuild.
	PgDB
	// DumpDir is presumably the directory holding the data dump to import.
	DumpDir string
	// ParserKeyValDir is presumably the directory of a key-value store used
	// for parsed names.
	ParserKeyValDir string
	// JobsNum is presumably the number of concurrent jobs.
	JobsNum int
	// Batch is presumably the number of records handled per batch.
	Batch int
}

Rebuild provides configuration for the database rebuilding process.

func NewRebuild

func NewRebuild(pgDB PgDB, inputDir string, jobsNum int) Rebuild

NewRebuild creates new Rebuild structure for rebuilding process.

func (Rebuild) CreateWords added in v0.2.0

func (rb Rebuild) CreateWords()

func (Rebuild) RemoveOrphans added in v0.2.0

func (rb Rebuild) RemoveOrphans()

func (Rebuild) UploadDataSources

func (rb Rebuild) UploadDataSources() error

UploadDataSources populates data_sources table with data.

func (Rebuild) UploadNameString

func (rb Rebuild) UploadNameString() error

UploadNameString constructs data for name_strings, canonicals, canonical_fulls, canonical_stems tables and uploads these data to the database.

func (Rebuild) UploadNameStringIndices

func (rb Rebuild) UploadNameStringIndices()

UploadNameStringIndices constructs data for name_string_indices table and uploads them to the database.

func (Rebuild) VerificationView added in v0.2.0

func (rb Rebuild) VerificationView()

VerificationView creates data for a materialized view.

type Word added in v0.2.0

// Word is a word from a name-string. ID and Normalized are both tagged as
// primary-key columns.
type Word struct {
	// ID is a UUID5 generated from the modified word and the word type
	// (converted to an integer) joined together with a pipe.
	// For example: "alb|2"
	ID string `gorm:"primary_key;type:uuid;auto_increment:false"`
	// Normalized is the word normalized by GNparser. This field is used
	// for sorting results.
	Normalized string `gorm:"type:varchar(255);primary_key;auto_increment:false"`
	// Modified is a heavy-normalized word. This field is used for matching.
	Modified string `gorm:"type:varchar(255);not null;index:words_modified"`
	// TypeID is the integer representation of parsed.WordType
	// from GNparser.
	TypeID int
}

Word is a word from a name-string.

type WordNameString added in v0.2.0

// WordNameString is the meaning of a word in a name-string. WordID and
// NameStringID are both tagged as primary-key columns.
type WordNameString struct {
	// WordID is the identifier of a word.
	WordID string `gorm:"primary_key;type:uuid;auto_increment:false"`
	// NameStringID is UUID5 of a full name-string from the dataset.
	NameStringID string `gorm:"primary_key;type:uuid;auto_increment:false"`
	// CanonicalID is UUID5 of a simple canonical form of a name. The tag uses
	// GORM's "not null" spelling, as the other models in this package do; the
	// previous "not_null" is not a recognised setting, so the constraint was
	// silently dropped from the migrated table.
	CanonicalID string `gorm:"type:uuid;not null"`
}

WordNameString is the meaning of a word in a name-string.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL