simulate

package
v1.0.1 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 1, 2024 License: BSD-3-Clause Imports: 18 Imported by: 0

Documentation

Overview

Package simulate contains functions for simulation of genomic data, including generating random sequences, variants, reads, or regions.

Index

Constants

This section is empty.

Variables

View Source
var BLOSUM = [][]float64{{0.288590604, 0.03087248322, 0.03087248322, 0.02953020134, 0.02147651007, 0.0255033557, 0.04026845638, 0.07785234899, 0.01476510067, 0.04295302013, 0.05906040268, 0.04429530201, 0.01744966443, 0.02147651007, 0.02953020134, 0.08456375839, 0.04966442953, 0.005369127517, 0.01744966443, 0.06845637584, 0.0},
	{0.04457364341, 0.3449612403, 0.03875968992, 0.03100775194, 0.007751937984, 0.0484496124, 0.0523255814, 0.03294573643, 0.02325581395, 0.02325581395, 0.04651162791, 0.1201550388, 0.01550387597, 0.01744186047, 0.01937984496, 0.04457364341, 0.03488372093, 0.005813953488, 0.01744186047, 0.03100775194, 0.0},
	{0.05122494432, 0.04454342984, 0.3140311804, 0.08240534521, 0.008908685969, 0.03340757238, 0.04899777283, 0.06458797327, 0.03118040089, 0.02227171492, 0.03118040089, 0.05345211581, 0.01113585746, 0.01781737194, 0.02004454343, 0.06904231626, 0.04899777283, 0.004454342984, 0.01559020045, 0.02672605791, 0.0},
	{0.04104477612, 0.02985074627, 0.06902985075, 0.3973880597, 0.007462686567, 0.02985074627, 0.09141791045, 0.04664179104, 0.01865671642, 0.0223880597, 0.02798507463, 0.0447761194, 0.009328358209, 0.01492537313, 0.0223880597, 0.05223880597, 0.03544776119, 0.003731343284, 0.01119402985, 0.02425373134, 0.0},
	{0.06504065041, 0.0162601626, 0.0162601626, 0.0162601626, 0.4837398374, 0.01219512195, 0.0162601626, 0.0325203252, 0.008130081301, 0.04471544715, 0.06504065041, 0.02032520325, 0.0162601626, 0.02032520325, 0.0162601626, 0.0406504065, 0.03658536585, 0.00406504065, 0.01219512195, 0.05691056911, 0.0},
	{0.05588235294, 0.07352941176, 0.04411764706, 0.04705882353, 0.008823529412, 0.2147058824, 0.1029411765, 0.04117647059, 0.02941176471, 0.02647058824, 0.04705882353, 0.09117647059, 0.02058823529, 0.01470588235, 0.02352941176, 0.05588235294, 0.04117647059, 0.005882352941, 0.02058823529, 0.03529411765, 0.0},
	{0.05524861878, 0.04972375691, 0.04051565378, 0.09023941068, 0.007366482505, 0.06445672192, 0.2965009208, 0.0349907919, 0.02578268877, 0.02209944751, 0.03683241252, 0.07550644567, 0.01289134438, 0.01657458564, 0.02578268877, 0.05524861878, 0.03683241252, 0.005524861878, 0.01657458564, 0.03130755064, 0.0},
	{0.07827260459, 0.02294197031, 0.03913630229, 0.03373819163, 0.01079622132, 0.01889338731, 0.02564102564, 0.5101214575, 0.01349527665, 0.01889338731, 0.02834008097, 0.03373819163, 0.009446693657, 0.01619433198, 0.01889338731, 0.05128205128, 0.02968960864, 0.005398110661, 0.01079622132, 0.02429149798, 0.0},
	{0.04198473282, 0.04580152672, 0.0534351145, 0.03816793893, 0.007633587786, 0.03816793893, 0.0534351145, 0.03816793893, 0.3549618321, 0.02290076336, 0.03816793893, 0.04580152672, 0.01526717557, 0.03053435115, 0.01908396947, 0.04198473282, 0.02671755725, 0.007633587786, 0.0572519084, 0.02290076336, 0.0},
	{0.0471281296, 0.0176730486, 0.0147275405, 0.0176730486, 0.01620029455, 0.01325478645, 0.0176730486, 0.0206185567, 0.0088365243, 0.2709867452, 0.1678939617, 0.0235640648, 0.03681885125, 0.0441826215, 0.0147275405, 0.02503681885, 0.03976435935, 0.0058910162, 0.0206185567, 0.176730486, 0.0},
	{0.04453441296, 0.02429149798, 0.01417004049, 0.01518218623, 0.01619433198, 0.01619433198, 0.02024291498, 0.02125506073, 0.01012145749, 0.1153846154, 0.3755060729, 0.02530364372, 0.0495951417, 0.05465587045, 0.01417004049, 0.02429149798, 0.03340080972, 0.007085020243, 0.02226720648, 0.09615384615, 0.0},
	{0.05699481865, 0.1070811744, 0.0414507772, 0.0414507772, 0.008635578584, 0.05354058722, 0.07081174439, 0.04317789292, 0.0207253886, 0.02763385147, 0.04317789292, 0.2780656304, 0.01554404145, 0.01554404145, 0.02763385147, 0.05354058722, 0.03972366149, 0.00518134715, 0.01727115717, 0.03281519862, 0.0},
	{0.05220883534, 0.03212851406, 0.02008032129, 0.02008032129, 0.01606425703, 0.0281124498, 0.0281124498, 0.0281124498, 0.01606425703, 0.1004016064, 0.1967871486, 0.03614457831, 0.1606425703, 0.04819277108, 0.01606425703, 0.03614457831, 0.04016064257, 0.008032128514, 0.02409638554, 0.09236947791, 0.0},
	{0.03382663848, 0.01902748414, 0.01691331924, 0.01691331924, 0.01057082452, 0.01057082452, 0.01902748414, 0.02536997886, 0.01691331924, 0.06342494715, 0.1141649049, 0.01902748414, 0.02536997886, 0.3868921776, 0.01057082452, 0.02536997886, 0.02536997886, 0.01691331924, 0.088794926, 0.05496828753, 0.0},
	{0.05684754522, 0.02583979328, 0.02325581395, 0.03100775194, 0.01033591731, 0.02067183463, 0.03617571059, 0.03617571059, 0.01291989664, 0.02583979328, 0.03617571059, 0.04134366925, 0.01033591731, 0.01291989664, 0.4935400517, 0.04392764858, 0.03617571059, 0.002583979328, 0.01291989664, 0.03100775194, 0.0},
	{0.109947644, 0.04013961606, 0.05410122164, 0.04886561955, 0.01745200698, 0.03315881326, 0.05235602094, 0.06631762653, 0.01919720768, 0.02966841187, 0.04188481675, 0.05410122164, 0.01570680628, 0.02094240838, 0.02966841187, 0.219895288, 0.08202443281, 0.005235602094, 0.01745200698, 0.04188481675, 0.0},
	{0.07297830375, 0.03550295858, 0.04339250493, 0.03747534517, 0.01775147929, 0.02761341223, 0.03944773176, 0.04339250493, 0.01380670611, 0.05325443787, 0.0650887574, 0.04536489152, 0.01972386588, 0.02366863905, 0.02761341223, 0.09270216963, 0.2465483235, 0.005917159763, 0.01775147929, 0.07100591716, 0.0},
	{0.0303030303, 0.02272727273, 0.01515151515, 0.01515151515, 0.007575757576, 0.01515151515, 0.02272727273, 0.0303030303, 0.01515151515, 0.0303030303, 0.05303030303, 0.02272727273, 0.01515151515, 0.06060606061, 0.007575757576, 0.02272727273, 0.02272727273, 0.4924242424, 0.06818181818, 0.0303030303, 0.0},
	{0.04049844237, 0.02803738318, 0.02180685358, 0.01869158879, 0.009345794393, 0.02180685358, 0.02803738318, 0.02492211838, 0.04672897196, 0.04361370717, 0.06853582555, 0.03115264798, 0.01869158879, 0.1308411215, 0.01557632399, 0.03115264798, 0.02803738318, 0.02803738318, 0.3177570093, 0.04672897196, 0.0},
	{0.06995884774, 0.0219478738, 0.01646090535, 0.01783264746, 0.01920438957, 0.01646090535, 0.02331961591, 0.02469135802, 0.008230452675, 0.1646090535, 0.1303155007, 0.02606310014, 0.03155006859, 0.03566529492, 0.01646090535, 0.0329218107, 0.04938271605, 0.00548696845, 0.02057613169, 0.268861454, 0.0},
	{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}

BLOSUM matrix for amino acid switching probabilities normalized to 0-1.

View Source
var GC float64 = 0.42

GC is the expected GC-content

Functions

func BaseExtToBases

func BaseExtToBases(seq []BaseExt) []dna.Base

BaseExtToBases converts a slice of BaseExt to a slice of dna.Base.

func CheckExon

func CheckExon(gene genePred.GenePred, position int) (bool, int)

CheckExon takes in a genePred and a position and determines if the base at that position is in an exon. It returns a bool and the int that refers to the exon the base is in.

func CheckStart

func CheckStart(gene genePred.GenePred, codon CodonExt) bool

CheckStart determines if a base is the beginning of a start codon by comparing it to the start position of the genePred CDS.

func CheckStop

func CheckStop(gene genePred.GenePred, codon CodonExt) bool

CheckStop determines if a base is the beginning of a stop codon by comparing it to the end position of the genePred CDS.

func ChooseRandomBase

func ChooseRandomBase(GCcontent float64) dna.Base

ChooseRandomBase chooses one of the four bases according to the GC content provided.

func CodonExtToCodon

func CodonExtToCodon(cE CodonExt) dna.Codon

CodonExtToCodon converts a single CodonExt to a single dna.Codon.

func CodonExtsToCodons

func CodonExtsToCodons(cE []CodonExt) []dna.Codon

CodonExtsToCodons converts a slice of CodonExt to a slice of dna.Codon.

func ETree added in v1.0.1

func ETree(numNodes int, gammaAlpha float64, gammaBeta float64) *expandedTree.ETree

ETree produces a phylogenetic tree with a user-specified number of nodes and random gamma-distributed branch lengths, determined by user-specified alpha and beta parameters.

func GoSimulateBed

func GoSimulateBed(searchSpace []bed.Bed, regionCount int, regionLength int) <-chan bed.Bed

GoSimulateBed takes a searchSpace (represented by a noGap.bed input file, here as a parsed struct) and generates a number of regions (regionCount) of a specified length (regionLength) and sends the simulated regions to an output chan.

func IlluminaPairedSam

func IlluminaPairedSam(refName string, ref []dna.Base, numPairs, readLen, avgFragmentSize int, avgFragmentStdDev float64, flatErrorRate float64, ancientErrorRate float64, flatBinomialAlias numbers.BinomialAlias, ancientBinomialAlias numbers.BinomialAlias, geometricParam float64, out *fileio.EasyWriter, bw *sam.BamWriter, bamOutput bool, deaminationDistributionSlice []int)

IlluminaPairedSam generates pairs of sam reads randomly distributed across the input DNA sequence. The inputs are the name of the input DNA sequence, the sequence itself, the number of read pairs to generate, the length of each read, the average fragment size, the standard deviation of fragment sizes, the error rate where a base in the read will not match the input DNA, a numbers.binomialAlias that is used to speed up calculations, and output file handles for sam, bam, and a bool denoting if bam (or sam) should be the output. Whichever handle (sam or bam) is not being used can be nil.

func MutateGene

func MutateGene(inputSeq []dna.Base, branchLength float64, geneFile string, deletions bool) []dna.Base

MutateGene takes a starting sequence, a branch length, and the gene structure of the starting sequence in the form of a genePred file, along with a flag for if deletions should be allowed along with substitutions. The function returns the evolved sequence.

func NonCoding added in v1.0.1

func NonCoding(root *expandedTree.ETree, substitutionMatrixFile string, unitBranchLength float64) *expandedTree.ETree

func OrderBaseExtBySeqPos

func OrderBaseExtBySeqPos(unordered []BaseExt) []dna.Base

OrderBaseExtBySeqPos orders a slice of BaseExt by seq position.

func ParseSubstitutionMatrix added in v1.0.1

func ParseSubstitutionMatrix(filename string) [][]float64

ParseSubstitutionMatrix reads a substitution matrix from an input file and returns it as a [][]float64.

func RandGene

func RandGene(name string, length int, GCcontent float64) []fasta.Fasta

RandGene takes a gene name, length (in bp) and expected GC content and makes a random gene with start and stop codons. Length must be a multiple of 3 and it returns the gene as a slice of fasta.Fasta with a single entry that is the gene.

func RandIntergenicSeq

func RandIntergenicSeq(GcContent float64, lenSeq int) []dna.Base

RandIntergenicSeq makes a randomly generated DNA sequence by drawing from a distribution with a specified GC content. Unlike RandGene, it does not have to be divisible by 3. The inputs are the expected GC content and the desired length of the output sequence.

func RemoveAncestors

func RemoveAncestors(filename string, tree *expandedTree.ETree, outputFilename string)

RemoveAncestors takes an input fasta filename and a pointer to a tree, along with an output file. All sequences in the input file that match the name of a leaf node in the tree will be written to the output file.

func Simulate

func Simulate(randSeqFilename string, root *expandedTree.ETree, gene string, deletions bool)

Simulate takes a filename of a fasta file that will be the starting sequence at the root node, a pointer to a phylogenetic tree, a genePred filename related to the starting sequence, and a flag for whether deletions should be allowed along with substitutions. The starting sequence will then be evolved according to the neutral tree provided and each node in the tree will be assigned a DNA sequence.

func SingleVcf

func SingleVcf(alpha float64, numAlleles int, boundAlpha float64, boundBeta float64, boundMultiplier float64, pos int) vcf.Vcf

SingleVcf returns a single simulated Vcf record for a user-specified selection parameter alpha and genomic position. Parameters for the bounding function are also required; alpha, beta, and multiplier values of 0.001, 0.001, and 10000 are good for most applications.

func VcfToFile

func VcfToFile(alpha float64, numAlleles int, numSites int, outFile string, boundAlpha float64, boundBeta float64, boundMultiplier float64)

VcfToFile generates simulated VCF data. The inputs are alpha (the selection parameter), the number of sites, the output filename, along with parameters for the bounding function for sampling. Reasonable parameter choices for boundAlpha, boundBeta, and boundMultiplier are 0.001, 0.001, and 10000.

func WithIndels added in v1.0.1

func WithIndels(fastaFile string, branchLength float64, propIndel float64, lambda float64, gcContent float64, transitionBias float64, vcfOutFile string, qName string) []fasta.Fasta

WithIndels takes an input fastaFile, which must contain a single fasta entry, and simulates a mutated sequence. The output sequence is provided in a multiFa alignment, aligned to the initial sequence. branchLength (a float from 0 to 1) specifies the expected value of the proportion of sites in the input sequence that will be mutated. propIndel (a float from 0 to 1) specifies the expected value of the proportion of indels in the output sequence. lambda specifies the rate parameter for an exponential distribution, from which INDEL sizes will be sampled. gcContent specifies the expected value of GC content for inserted sequences. vcfOutFile specifies an optional (empty string disables this option) return that records all variants made during the simulated mutation process. transitionBias specifies the expected value of the ratio of transitions to transversions in the output sequence. qName sets the suffix for the output query fasta name.

Types

type BaseExt

type BaseExt struct {
	Base   dna.Base
	SeqPos int
}

BaseExt holds the original position of a base in the starting Seq and the base at that position.

func BasesToBaseExt

func BasesToBaseExt(seq []dna.Base) []BaseExt

BasesToBaseExt converts a slice of dna.Base to a slice of BaseExt.

func CodonExtToBaseExt

func CodonExtToBaseExt(allCodons []CodonExt) []BaseExt

CodonExtToBaseExt converts a slice of CodonExt to a slice of BaseExt.

type CodonExt

type CodonExt struct {
	Seq []BaseExt
}

CodonExt holds a slice of 3 bases and their original positions.

func CreateCodons

func CreateCodons(seq []BaseExt, gene genePred.GenePred, exon int) []CodonExt

CreateCodons takes a sequence and its gene structure in genePred format, compensates for non-zero exon frame and returns all codons that correspond to a single exon.

func PickStop

func PickStop(codon CodonExt) CodonExt

PickStop randomly selects one of the three stop codons to be used in the MutateGene output.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL