avx2

package

Version: v0.0.0-...-3878f85 (latest) | Published: Jul 23, 2017 | License: MIT | Imports: 1 | Imported by: 0

Details

Valid go.mod file

The Go module system was introduced in Go 1.11 and is the official dependency management solution for Go.
Redistributable license

Redistributable licenses place minimal restrictions on how software can be used, modified, and redistributed.
Tagged version

Modules with tagged versions give importers more predictable builds.
Stable version

When a project reaches major version v1 it is considered stable.
Learn more about best practices

Repository

github.com/klauspost/intrinsics

Links

Open Source Insights

Documentation ¶

Overview ¶

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index ¶

func BlendEpi32(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)
func BroadcastbEpi8(a x86.M128i) (dst x86.M128i)
func BroadcastdEpi32(a x86.M128i) (dst x86.M128i)
func BroadcastqEpi64(a x86.M128i) (dst x86.M128i)
func BroadcastsdPd(a x86.M128d) (dst x86.M128d)
func BroadcastssPs(a x86.M128) (dst x86.M128)
func BroadcastwEpi16(a x86.M128i) (dst x86.M128i)
func I32gatherEpi32(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)
func I32gatherEpi64(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)
func I64gatherEpi32(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)
func I64gatherEpi64(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)
func M256AbsEpi16(a x86.M256i) (dst x86.M256i)
func M256AbsEpi32(a x86.M256i) (dst x86.M256i)
func M256AbsEpi8(a x86.M256i) (dst x86.M256i)
func M256AddEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AddEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AddEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AddEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AddsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AddsEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AddsEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AddsEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AlignrEpi8(a x86.M256i, b x86.M256i, count int) (dst x86.M256i)
func M256AndSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AndnotSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AvgEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256AvgEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256BlendEpi16(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
func M256BlendEpi32(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
func M256BlendvEpi8(a x86.M256i, b x86.M256i, mask x86.M256i) (dst x86.M256i)
func M256BroadcastbEpi8(a x86.M128i) (dst x86.M256i)
func M256BroadcastdEpi32(a x86.M128i) (dst x86.M256i)
func M256BroadcastqEpi64(a x86.M128i) (dst x86.M256i)
func M256BroadcastsdPd(a x86.M128d) (dst x86.M256d)
func M256Broadcastsi128Si256(a x86.M128i) (dst x86.M256i)
func M256BroadcastssPs(a x86.M128) (dst x86.M256)
func M256BroadcastwEpi16(a x86.M128i) (dst x86.M256i)
func M256BslliEpi128(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256BsrliEpi128(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256CmpeqEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256CmpeqEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256CmpeqEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256CmpeqEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256CmpgtEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256CmpgtEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256CmpgtEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256CmpgtEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256Cvtepi16Epi32(a x86.M128i) (dst x86.M256i)
func M256Cvtepi16Epi64(a x86.M128i) (dst x86.M256i)
func M256Cvtepi32Epi64(a x86.M128i) (dst x86.M256i)
func M256Cvtepi8Epi16(a x86.M128i) (dst x86.M256i)
func M256Cvtepi8Epi32(a x86.M128i) (dst x86.M256i)
func M256Cvtepi8Epi64(a x86.M128i) (dst x86.M256i)
func M256Cvtepu16Epi32(a x86.M128i) (dst x86.M256i)
func M256Cvtepu16Epi64(a x86.M128i) (dst x86.M256i)
func M256Cvtepu32Epi64(a x86.M128i) (dst x86.M256i)
func M256Cvtepu8Epi16(a x86.M128i) (dst x86.M256i)
func M256Cvtepu8Epi32(a x86.M128i) (dst x86.M256i)
func M256Cvtepu8Epi64(a x86.M128i) (dst x86.M256i)
func M256Extracti128Si256(a x86.M256i, imm8 byte) (dst x86.M128i)
func M256HaddEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256HaddEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256HaddsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256HsubEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256HsubEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256HsubsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256I32gatherEpi32(base_addr *int, vindex x86.M256i, scale int) (dst x86.M256i)
func M256I32gatherEpi64(base_addr *int, vindex x86.M128i, scale int) (dst x86.M256i)
func M256I64gatherEpi32(base_addr *int, vindex x86.M256i, scale int) (dst x86.M128i)
func M256I64gatherEpi64(base_addr *int, vindex x86.M256i, scale int) (dst x86.M256i)
func M256Inserti128Si256(a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)
func M256MaddEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MaddubsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MaskI32gatherEpi32(src x86.M256i, base_addr *int, vindex x86.M256i, mask x86.M256i, scale int) (dst x86.M256i)
func M256MaskI32gatherEpi64(src x86.M256i, base_addr *int, vindex x86.M128i, mask x86.M256i, scale int) (dst x86.M256i)
func M256MaskI64gatherEpi32(src x86.M128i, base_addr *int, vindex x86.M256i, mask x86.M128i, scale int) (dst x86.M128i)
func M256MaskI64gatherEpi64(src x86.M256i, base_addr *int, vindex x86.M256i, mask x86.M256i, scale int) (dst x86.M256i)
func M256MaskloadEpi32(mem_addr *int, mask x86.M256i) (dst x86.M256i)
func M256MaskloadEpi64(mem_addr *int, mask x86.M256i) (dst x86.M256i)
func M256MaskstoreEpi32(mem_addr *int, mask x86.M256i, a x86.M256i)
func M256MaskstoreEpi64(mem_addr *int64, mask x86.M256i, a x86.M256i)
func M256MaxEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MaxEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MaxEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MaxEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MaxEpu32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MaxEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MinEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MinEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MinEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MinEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MinEpu32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MinEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MovemaskEpi8(a x86.M256i) int
func M256MpsadbwEpu8(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
func M256MulEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MulEpu32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MulhiEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MulhiEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MulhrsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MulloEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256MulloEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256OrSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256PacksEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256PacksEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256PackusEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256PackusEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256Permute2x128Si256(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)
func M256Permute4x64Epi64(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256Permute4x64Pd(a x86.M256d, imm8 byte) (dst x86.M256d)
func M256Permutevar8x32Epi32(a x86.M256i, idx x86.M256i) (dst x86.M256i)
func M256Permutevar8x32Ps(a x86.M256, idx x86.M256i) (dst x86.M256)
func M256SadEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256ShuffleEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256ShuffleEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256ShufflehiEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256ShuffleloEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SignEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SignEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SignEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SllEpi16(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SllEpi32(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SllEpi64(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SlliEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SlliEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SlliEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SlliSi256(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SllvEpi32(a x86.M256i, count x86.M256i) (dst x86.M256i)
func M256SllvEpi64(a x86.M256i, count x86.M256i) (dst x86.M256i)
func M256SraEpi16(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SraEpi32(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SraiEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SraiEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SravEpi32(a x86.M256i, count x86.M256i) (dst x86.M256i)
func M256SrlEpi16(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SrlEpi32(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SrlEpi64(a x86.M256i, count x86.M128i) (dst x86.M256i)
func M256SrliEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SrliEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SrliEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SrliSi256(a x86.M256i, imm8 byte) (dst x86.M256i)
func M256SrlvEpi32(a x86.M256i, count x86.M256i) (dst x86.M256i)
func M256SrlvEpi64(a x86.M256i, count x86.M256i) (dst x86.M256i)
func M256StreamLoadSi256(mem_addr *x86.M256iConst) (dst x86.M256i)
func M256SubEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SubEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SubEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SubEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SubsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SubsEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SubsEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256SubsEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackhiEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackhiEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackhiEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackhiEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackloEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackloEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackloEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256UnpackloEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)
func M256XorSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)
func MaskI32gatherEpi32(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)
func MaskI32gatherEpi64(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)
func MaskI64gatherEpi32(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)
func MaskI64gatherEpi64(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)
func MaskloadEpi32(mem_addr *int, mask x86.M128i) (dst x86.M128i)
func MaskloadEpi64(mem_addr *int, mask x86.M128i) (dst x86.M128i)
func MaskstoreEpi32(mem_addr *int, mask x86.M128i, a x86.M128i)
func MaskstoreEpi64(mem_addr *int64, mask x86.M128i, a x86.M128i)
func SllvEpi32(a x86.M128i, count x86.M128i) (dst x86.M128i)
func SllvEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)
func SravEpi32(a x86.M128i, count x86.M128i) (dst x86.M128i)
func SrlvEpi32(a x86.M128i, count x86.M128i) (dst x86.M128i)
func SrlvEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func BlendEpi32 ¶

func BlendEpi32(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

BlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'imm8', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	IF imm8[j%8]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBLENDD'. Intrinsic: '_mm_blend_epi32'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func BroadcastbEpi8 ¶

func BroadcastbEpi8(a x86.M128i) (dst x86.M128i)

BroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_broadcastb_epi8'. Requires AVX2.

func BroadcastdEpi32 ¶

func BroadcastdEpi32(a x86.M128i) (dst x86.M128i)

BroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm_broadcastd_epi32'. Requires AVX2.

func BroadcastqEpi64 ¶

func BroadcastqEpi64(a x86.M128i) (dst x86.M128i)

BroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm_broadcastq_epi64'. Requires AVX2.

func BroadcastsdPd ¶

func BroadcastsdPd(a x86.M128d) (dst x86.M128d)

BroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'MOVDDUP'. Intrinsic: '_mm_broadcastsd_pd'. Requires AVX2.

func BroadcastssPs ¶

func BroadcastssPs(a x86.M128) (dst x86.M128)

BroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm_broadcastss_ps'. Requires AVX2.

func BroadcastwEpi16 ¶

func BroadcastwEpi16(a x86.M128i) (dst x86.M128i)

BroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_broadcastw_epi16'. Requires AVX2.

func I32gatherEpi32 ¶

func I32gatherEpi32(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)

I32gatherEpi32: Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPGATHERDD'. Intrinsic: '_mm_i32gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func I32gatherEpi64 ¶

func I32gatherEpi64(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)

I32gatherEpi64: Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 1
	i := j*64
	m := j*32
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPGATHERDQ'. Intrinsic: '_mm_i32gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func I64gatherEpi32 ¶

func I64gatherEpi32(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)

I64gatherEpi32: Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 1
	i := j*32
	m := j*64
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPGATHERQD'. Intrinsic: '_mm_i64gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func I64gatherEpi64 ¶

func I64gatherEpi64(base_addr *int, vindex x86.M128i, scale int) (dst x86.M128i)

I64gatherEpi64: Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPGATHERQQ'. Intrinsic: '_mm_i64gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256AbsEpi16 ¶

func M256AbsEpi16(a x86.M256i) (dst x86.M256i)

M256AbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm256_abs_epi16'. Requires AVX2.

func M256AbsEpi32 ¶

func M256AbsEpi32(a x86.M256i) (dst x86.M256i)

M256AbsEpi32: Compute the absolute value of packed 32-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ABS(a[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSD'. Intrinsic: '_mm256_abs_epi32'. Requires AVX2.

func M256AbsEpi8 ¶

func M256AbsEpi8(a x86.M256i) (dst x86.M256i)

M256AbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm256_abs_epi8'. Requires AVX2.

func M256AddEpi16 ¶

func M256AddEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm256_add_epi16'. Requires AVX2.

func M256AddEpi32 ¶

func M256AddEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddEpi32: Add packed 32-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDD'. Intrinsic: '_mm256_add_epi32'. Requires AVX2.

func M256AddEpi64 ¶

func M256AddEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddEpi64: Add packed 64-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] + b[i+63:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDQ'. Intrinsic: '_mm256_add_epi64'. Requires AVX2.

func M256AddEpi8 ¶

func M256AddEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm256_add_epi8'. Requires AVX2.

func M256AddsEpi16 ¶

func M256AddsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm256_adds_epi16'. Requires AVX2.

func M256AddsEpi8 ¶

func M256AddsEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm256_adds_epi8'. Requires AVX2.

func M256AddsEpu16 ¶

func M256AddsEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm256_adds_epu16'. Requires AVX2.

func M256AddsEpu8 ¶

func M256AddsEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm256_adds_epu8'. Requires AVX2.

func M256AlignrEpi8 ¶

func M256AlignrEpi8(a x86.M256i, b x86.M256i, count int) (dst x86.M256i)

M256AlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst'.

FOR j := 0 to 1
	i := j*128
	tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
	dst[i+127:i] := tmp[127:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm256_alignr_epi8'. Requires AVX2.

func M256AndSi256 ¶

func M256AndSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AndSi256: Compute the bitwise AND of 256 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[255:0] := (a[255:0] AND b[255:0])
dst[MAX:256] := 0

Instruction: 'VPAND'. Intrinsic: '_mm256_and_si256'. Requires AVX2.

func M256AndnotSi256 ¶

func M256AndnotSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AndnotSi256: Compute the bitwise AND NOT of 256 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[255:0] := ((NOT a[255:0]) AND b[255:0])
dst[MAX:256] := 0

Instruction: 'VPANDN'. Intrinsic: '_mm256_andnot_si256'. Requires AVX2.

func M256AvgEpu16 ¶

func M256AvgEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm256_avg_epu16'. Requires AVX2.

func M256AvgEpu8 ¶

func M256AvgEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256AvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm256_avg_epu8'. Requires AVX2.

func M256BlendEpi16 ¶

func M256BlendEpi16(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256BlendEpi16: Blend packed 16-bit integers from 'a' and 'b' using control mask 'imm8', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF imm8[j%8]
		dst[i+15:i] := b[i+15:i]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDW'. Intrinsic: '_mm256_blend_epi16'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256BlendEpi32 ¶

func M256BlendEpi32(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256BlendEpi32: Blend packed 32-bit integers from 'a' and 'b' using control mask 'imm8', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF imm8[j%8]
		dst[i+31:i] := b[i+31:i]
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDD'. Intrinsic: '_mm256_blend_epi32'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256BlendvEpi8 ¶

func M256BlendvEpi8(a x86.M256i, b x86.M256i, mask x86.M256i) (dst x86.M256i)

M256BlendvEpi8: Blend packed 8-bit integers from 'a' and 'b' using 'mask', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	IF mask[i+7]
		dst[i+7:i] := b[i+7:i]
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDVB'. Intrinsic: '_mm256_blendv_epi8'. Requires AVX2.

func M256BroadcastbEpi8 ¶

func M256BroadcastbEpi8(a x86.M128i) (dst x86.M256i)

M256BroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_broadcastb_epi8'. Requires AVX2.

func M256BroadcastdEpi32 ¶

func M256BroadcastdEpi32(a x86.M128i) (dst x86.M256i)

M256BroadcastdEpi32: Broadcast the low packed 32-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTD'. Intrinsic: '_mm256_broadcastd_epi32'. Requires AVX2.

func M256BroadcastqEpi64 ¶

func M256BroadcastqEpi64(a x86.M128i) (dst x86.M256i)

M256BroadcastqEpi64: Broadcast the low packed 64-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTQ'. Intrinsic: '_mm256_broadcastq_epi64'. Requires AVX2.

func M256BroadcastsdPd ¶

func M256BroadcastsdPd(a x86.M128d) (dst x86.M256d)

M256BroadcastsdPd: Broadcast the low double-precision (64-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[63:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSD'. Intrinsic: '_mm256_broadcastsd_pd'. Requires AVX2.

func M256Broadcastsi128Si256 ¶

func M256Broadcastsi128Si256(a x86.M128i) (dst x86.M256i)

M256Broadcastsi128Si256: Broadcast 128 bits of integer data from 'a' to all 128-bit lanes in 'dst'.

dst[127:0] := a[127:0]
dst[255:128] := a[127:0]
dst[MAX:256] := 0

Instruction: 'VBROADCASTI128'. Intrinsic: '_mm256_broadcastsi128_si256'. Requires AVX2.

func M256BroadcastssPs ¶

func M256BroadcastssPs(a x86.M128) (dst x86.M256)

M256BroadcastssPs: Broadcast the low single-precision (32-bit) floating-point element from 'a' to all elements of 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VBROADCASTSS'. Intrinsic: '_mm256_broadcastss_ps'. Requires AVX2.

func M256BroadcastwEpi16 ¶

func M256BroadcastwEpi16(a x86.M128i) (dst x86.M256i)

M256BroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_broadcastw_epi16'. Requires AVX2.

func M256BslliEpi128 ¶

func M256BslliEpi128(a x86.M256i, imm8 byte) (dst x86.M256i)

M256BslliEpi128: Shift 128-bit lanes in 'a' left by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.

tmp := imm8[7:0]
IF tmp > 15
	tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
dst[255:128] := a[255:128] << (tmp*8)
dst[MAX:256] := 0

Instruction: 'VPSLLDQ'. Intrinsic: '_mm256_bslli_epi128'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256BsrliEpi128 ¶

func M256BsrliEpi128(a x86.M256i, imm8 byte) (dst x86.M256i)

M256BsrliEpi128: Shift 128-bit lanes in 'a' right by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.

tmp := imm8[7:0]
IF tmp > 15
	tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
dst[255:128] := a[255:128] >> (tmp*8)
dst[MAX:256] := 0

Instruction: 'VPSRLDQ'. Intrinsic: '_mm256_bsrli_epi128'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256CmpeqEpi16 ¶

func M256CmpeqEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpeqEpi16: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := ( a[i+15:i] == b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPEQW'. Intrinsic: '_mm256_cmpeq_epi16'. Requires AVX2.

func M256CmpeqEpi32 ¶

func M256CmpeqEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpeqEpi32: Compare packed 32-bit integers in 'a' and 'b' for equality, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPEQD'. Intrinsic: '_mm256_cmpeq_epi32'. Requires AVX2.

func M256CmpeqEpi64 ¶

func M256CmpeqEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpeqEpi64: Compare packed 64-bit integers in 'a' and 'b' for equality, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ( a[i+63:i] == b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPEQQ'. Intrinsic: '_mm256_cmpeq_epi64'. Requires AVX2.

func M256CmpeqEpi8 ¶

func M256CmpeqEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpeqEpi8: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := ( a[i+7:i] == b[i+7:i] ) ? 0xFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPEQB'. Intrinsic: '_mm256_cmpeq_epi8'. Requires AVX2.

func M256CmpgtEpi16 ¶

func M256CmpgtEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpgtEpi16: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := ( a[i+15:i] > b[i+15:i] ) ? 0xFFFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPGTW'. Intrinsic: '_mm256_cmpgt_epi16'. Requires AVX2.

func M256CmpgtEpi32 ¶

func M256CmpgtEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpgtEpi32: Compare packed 32-bit integers in 'a' and 'b' for greater-than, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPGTD'. Intrinsic: '_mm256_cmpgt_epi32'. Requires AVX2.

func M256CmpgtEpi64 ¶

func M256CmpgtEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpgtEpi64: Compare packed 64-bit integers in 'a' and 'b' for greater-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ( a[i+63:i] > b[i+63:i] ) ? 0xFFFFFFFFFFFFFFFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPGTQ'. Intrinsic: '_mm256_cmpgt_epi64'. Requires AVX2.

func M256CmpgtEpi8 ¶

func M256CmpgtEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256CmpgtEpi8: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := ( a[i+7:i] > b[i+7:i] ) ? 0xFF : 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPCMPGTB'. Intrinsic: '_mm256_cmpgt_epi8'. Requires AVX2.

func M256Cvtepi16Epi32 ¶

func M256Cvtepi16Epi32(a x86.M128i) (dst x86.M256i)

M256Cvtepi16Epi32: Sign extend packed 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[i+31:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWD'. Intrinsic: '_mm256_cvtepi16_epi32'. Requires AVX2.

func M256Cvtepi16Epi64 ¶

func M256Cvtepi16Epi64(a x86.M128i) (dst x86.M256i)

M256Cvtepi16Epi64: Sign extend packed 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 16*j
	dst[i+63:i] := SignExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXWQ'. Intrinsic: '_mm256_cvtepi16_epi64'. Requires AVX2.

func M256Cvtepi32Epi64 ¶

func M256Cvtepi32Epi64(a x86.M128i) (dst x86.M256i)

M256Cvtepi32Epi64: Sign extend packed 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 32*j
	dst[i+63:i] := SignExtend(a[k+31:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXDQ'. Intrinsic: '_mm256_cvtepi32_epi64'. Requires AVX2.

func M256Cvtepi8Epi16 ¶

func M256Cvtepi8Epi16(a x86.M128i) (dst x86.M256i)

M256Cvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*8
	l := j*16
	dst[l+15:l] := SignExtend(a[i+7:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm256_cvtepi8_epi16'. Requires AVX2.

func M256Cvtepi8Epi32 ¶

func M256Cvtepi8Epi32(a x86.M128i) (dst x86.M256i)

M256Cvtepi8Epi32: Sign extend packed 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[i+31:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBD'. Intrinsic: '_mm256_cvtepi8_epi32'. Requires AVX2.

func M256Cvtepi8Epi64 ¶

func M256Cvtepi8Epi64(a x86.M128i) (dst x86.M256i)

M256Cvtepi8Epi64: Sign extend packed 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[i+63:i] := SignExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBQ'. Intrinsic: '_mm256_cvtepi8_epi64'. Requires AVX2.

func M256Cvtepu16Epi32 ¶

func M256Cvtepu16Epi32(a x86.M128i) (dst x86.M256i)

M256Cvtepu16Epi32: Zero extend packed unsigned 16-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 16*j
	dst[i+31:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWD'. Intrinsic: '_mm256_cvtepu16_epi32'. Requires AVX2.

func M256Cvtepu16Epi64 ¶

func M256Cvtepu16Epi64(a x86.M128i) (dst x86.M256i)

M256Cvtepu16Epi64: Zero extend packed unsigned 16-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j:= 0 to 3
	i := 64*j
	k := 16*j
	dst[i+63:i] := ZeroExtend(a[k+15:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXWQ'. Intrinsic: '_mm256_cvtepu16_epi64'. Requires AVX2.

func M256Cvtepu32Epi64 ¶

func M256Cvtepu32Epi64(a x86.M128i) (dst x86.M256i)

M256Cvtepu32Epi64: Zero extend packed unsigned 32-bit integers in 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j:= 0 to 3
	i := 64*j
	k := 32*j
	dst[i+63:i] := ZeroExtend(a[k+31:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXDQ'. Intrinsic: '_mm256_cvtepu32_epi64'. Requires AVX2.

func M256Cvtepu8Epi16 ¶

func M256Cvtepu8Epi16(a x86.M128i) (dst x86.M256i)

M256Cvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*8
	l := j*16
	dst[l+15:l] := ZeroExtend(a[i+7:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm256_cvtepu8_epi16'. Requires AVX2.

func M256Cvtepu8Epi32 ¶

func M256Cvtepu8Epi32(a x86.M128i) (dst x86.M256i)

M256Cvtepu8Epi32: Zero extend packed unsigned 8-bit integers in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 7
	i := 32*j
	k := 8*j
	dst[i+31:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBD'. Intrinsic: '_mm256_cvtepu8_epi32'. Requires AVX2.

func M256Cvtepu8Epi64 ¶

func M256Cvtepu8Epi64(a x86.M128i) (dst x86.M256i)

M256Cvtepu8Epi64: Zero extend packed unsigned 8-bit integers in the low 8 bytes of 'a' to packed 64-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 64*j
	k := 8*j
	dst[i+63:i] := ZeroExtend(a[k+7:k])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBQ'. Intrinsic: '_mm256_cvtepu8_epi64'. Requires AVX2.

func M256Extracti128Si256 ¶

func M256Extracti128Si256(a x86.M256i, imm8 byte) (dst x86.M128i)

M256Extracti128Si256: Extract 128 bits (composed of integer data) from 'a', selected with 'imm8', and store the result in 'dst'.

CASE imm8[7:0] of
0: dst[127:0] := a[127:0]
1: dst[127:0] := a[255:128]
ESAC
dst[MAX:128] := 0

Instruction: 'VEXTRACTI128'. Intrinsic: '_mm256_extracti128_si256'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256HaddEpi16 ¶

func M256HaddEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256HaddEpi16: Horizontally add adjacent pairs of 16-bit integers in 'a' and 'b', and pack the signed 16-bit results in 'dst'.

dst[15:0] := a[31:16] + a[15:0]
dst[31:16] := a[63:48] + a[47:32]
dst[47:32] := a[95:80] + a[79:64]
dst[63:48] := a[127:112] + a[111:96]
dst[79:64] := b[31:16] + b[15:0]
dst[95:80] := b[63:48] + b[47:32]
dst[111:96] := b[95:80] + b[79:64]
dst[127:112] := b[127:112] + b[111:96]
dst[143:128] := a[159:144] + a[143:128]
dst[159:144] := a[191:176] + a[175:160]
dst[175:160] := a[223:208] + a[207:192]
dst[191:176] := a[255:240] + a[239:224]
dst[207:192] := b[159:144] + b[143:128]
dst[223:208] := b[191:176] + b[175:160]
dst[239:224] := b[223:208] + b[207:192]
dst[255:240] := b[255:240] + b[239:224]
dst[MAX:256] := 0

Instruction: 'VPHADDW'. Intrinsic: '_mm256_hadd_epi16'. Requires AVX2.

func M256HaddEpi32 ¶

func M256HaddEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256HaddEpi32: Horizontally add adjacent pairs of 32-bit integers in 'a' and 'b', and pack the signed 32-bit results in 'dst'.

dst[31:0] := a[63:32] + a[31:0]
dst[63:32] := a[127:96] + a[95:64]
dst[95:64] := b[63:32] + b[31:0]
dst[127:96] := b[127:96] + b[95:64]
dst[159:128] := a[191:160] + a[159:128]
dst[191:160] := a[255:224] + a[223:192]
dst[223:192] := b[191:160] + b[159:128]
dst[255:224] := b[255:224] + b[223:192]
dst[MAX:256] := 0

Instruction: 'VPHADDD'. Intrinsic: '_mm256_hadd_epi32'. Requires AVX2.

func M256HaddsEpi16 ¶

func M256HaddsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256HaddsEpi16: Horizontally add adjacent pairs of 16-bit integers in 'a' and 'b' using saturation, and pack the signed 16-bit results in 'dst'.

dst[15:0]= Saturate_To_Int16(a[31:16] + a[15:0])
dst[31:16] = Saturate_To_Int16(a[63:48] + a[47:32])
dst[47:32] = Saturate_To_Int16(a[95:80] + a[79:64])
dst[63:48] = Saturate_To_Int16(a[127:112] + a[111:96])
dst[79:64] = Saturate_To_Int16(b[31:16] + b[15:0])
dst[95:80] = Saturate_To_Int16(b[63:48] + b[47:32])
dst[111:96] = Saturate_To_Int16(b[95:80] + b[79:64])
dst[127:112] = Saturate_To_Int16(b[127:112] + b[111:96])
dst[143:128] = Saturate_To_Int16(a[159:144] + a[143:128])
dst[159:144] = Saturate_To_Int16(a[191:176] + a[175:160])
dst[175:160] = Saturate_To_Int16( a[223:208] + a[207:192])
dst[191:176] = Saturate_To_Int16(a[255:240] + a[239:224])
dst[207:192] = Saturate_To_Int16(b[159:144] + b[143:128])
dst[223:208] = Saturate_To_Int16(b[191:176] + b[175:160])
dst[239:224] = Saturate_To_Int16(b[223:208] + b[207:192])
dst[255:240] = Saturate_To_Int16(b[255:240] + b[239:224])
dst[MAX:256] := 0

Instruction: 'VPHADDSW'. Intrinsic: '_mm256_hadds_epi16'. Requires AVX2.

func M256HsubEpi16 ¶

func M256HsubEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256HsubEpi16: Horizontally subtract adjacent pairs of 16-bit integers in 'a' and 'b', and pack the signed 16-bit results in 'dst'.

dst[15:0] := a[15:0] - a[31:16]
dst[31:16] := a[47:32] - a[63:48]
dst[47:32] := a[79:64] - a[95:80]
dst[63:48] := a[111:96] - a[127:112]
dst[79:64] := b[15:0] - b[31:16]
dst[95:80] := b[47:32] - b[63:48]
dst[111:96] := b[79:64] - b[95:80]
dst[127:112] := b[111:96] - b[127:112]
dst[143:128] := a[143:128] - a[159:144]
dst[159:144] := a[175:160] - a[191:176]
dst[175:160] := a[207:192] - a[223:208]
dst[191:176] := a[239:224] - a[255:240]
dst[207:192] := b[143:128] - b[159:144]
dst[223:208] := b[175:160] - b[191:176]
dst[239:224] := b[207:192] - b[223:208]
dst[255:240] := b[239:224] - b[255:240]
dst[MAX:256] := 0

Instruction: 'VPHSUBW'. Intrinsic: '_mm256_hsub_epi16'. Requires AVX2.

func M256HsubEpi32 ¶

func M256HsubEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256HsubEpi32: Horizontally subtract adjacent pairs of 32-bit integers in 'a' and 'b', and pack the signed 32-bit results in 'dst'.

dst[31:0] := a[31:0] - a[63:32]
dst[63:32] := a[95:64] - a[127:96]
dst[95:64] := b[31:0] - b[63:32]
dst[127:96] := b[95:64] - b[127:96]
dst[159:128] := a[159:128] - a[191:160]
dst[191:160] := a[223:192] - a[255:224]
dst[223:192] := b[159:128] - b[191:160]
dst[255:224] := b[223:192] - b[255:224]
dst[MAX:256] := 0

Instruction: 'VPHSUBD'. Intrinsic: '_mm256_hsub_epi32'. Requires AVX2.

func M256HsubsEpi16 ¶

func M256HsubsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256HsubsEpi16: Horizontally subtract adjacent pairs of 16-bit integers in 'a' and 'b' using saturation, and pack the signed 16-bit results in 'dst'.

dst[15:0]= Saturate_To_Int16(a[15:0] - a[31:16])
dst[31:16] = Saturate_To_Int16(a[47:32] - a[63:48])
dst[47:32] = Saturate_To_Int16(a[79:64] - a[95:80])
dst[63:48] = Saturate_To_Int16(a[111:96] - a[127:112])
dst[79:64] = Saturate_To_Int16(b[15:0] - b[31:16])
dst[95:80] = Saturate_To_Int16(b[47:32] - b[63:48])
dst[111:96] = Saturate_To_Int16(b[79:64] - b[95:80])
dst[127:112] = Saturate_To_Int16(b[111:96] - b[127:112])
dst[143:128]= Saturate_To_Int16(a[143:128] - a[159:144])
dst[159:144] = Saturate_To_Int16(a[175:160] - a[191:176])
dst[175:160] = Saturate_To_Int16(a[207:192] - a[223:208])
dst[191:176] = Saturate_To_Int16(a[239:224] - a[255:240])
dst[207:192] = Saturate_To_Int16(b[143:128] - b[159:144])
dst[223:208] = Saturate_To_Int16(b[175:160] - b[191:176])
dst[239:224] = Saturate_To_Int16(b[207:192] - b[223:208])
dst[255:240] = Saturate_To_Int16(b[239:224] - b[255:240])
dst[MAX:256] := 0

Instruction: 'VPHSUBSW'. Intrinsic: '_mm256_hsubs_epi16'. Requires AVX2.

func M256I32gatherEpi32 ¶

func M256I32gatherEpi32(base_addr *int, vindex x86.M256i, scale int) (dst x86.M256i)

M256I32gatherEpi32: Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPGATHERDD'. Intrinsic: '_mm256_i32gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256I32gatherEpi64 ¶

func M256I32gatherEpi64(base_addr *int, vindex x86.M128i, scale int) (dst x86.M256i)

M256I32gatherEpi64: Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*64
	m := j*32
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPGATHERDQ'. Intrinsic: '_mm256_i32gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256I64gatherEpi32 ¶

func M256I64gatherEpi32(base_addr *int, vindex x86.M256i, scale int) (dst x86.M128i)

M256I64gatherEpi32: Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*32
	m := j*64
	dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPGATHERQD'. Intrinsic: '_mm256_i64gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256I64gatherEpi64 ¶

func M256I64gatherEpi64(base_addr *int, vindex x86.M256i, scale int) (dst x86.M256i)

M256I64gatherEpi64: Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst'. 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPGATHERQQ'. Intrinsic: '_mm256_i64gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256Inserti128Si256 ¶

func M256Inserti128Si256(a x86.M256i, b x86.M128i, imm8 byte) (dst x86.M256i)

M256Inserti128Si256: Copy 'a' to 'dst', then insert 128 bits (composed of integer data) from 'b' into 'dst' at the location specified by 'imm8'.

dst[255:0] := a[255:0]
CASE (imm8[1:0]) of
0: dst[127:0] := b[127:0]
1: dst[255:128] := b[127:0]
ESAC
dst[MAX:256] := 0

Instruction: 'VINSERTI128'. Intrinsic: '_mm256_inserti128_si256'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256MaddEpi16 ¶

func M256MaddEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaddEpi16: Multiply packed signed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm256_madd_epi16'. Requires AVX2.

func M256MaddubsEpi16 ¶

func M256MaddubsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaddubsEpi16: Vertically multiply each unsigned 8-bit integer from 'a' with the corresponding signed 8-bit integer from 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm256_maddubs_epi16'. Requires AVX2.

func M256MaskI32gatherEpi32 ¶

func M256MaskI32gatherEpi32(src x86.M256i, base_addr *int, vindex x86.M256i, mask x86.M256i, scale int) (dst x86.M256i)

M256MaskI32gatherEpi32: Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

Instruction: 'VPGATHERDD'. Intrinsic: '_mm256_mask_i32gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaskI32gatherEpi64 ¶

func M256MaskI32gatherEpi64(src x86.M256i, base_addr *int, vindex x86.M128i, mask x86.M256i, scale int) (dst x86.M256i)

M256MaskI32gatherEpi64: Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*64
	m := j*32
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

Instruction: 'VPGATHERDQ'. Intrinsic: '_mm256_mask_i32gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaskI64gatherEpi32 ¶

func M256MaskI64gatherEpi32(src x86.M128i, base_addr *int, vindex x86.M256i, mask x86.M128i, scale int) (dst x86.M128i)

M256MaskI64gatherEpi32: Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*32
	m := j*64
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

Instruction: 'VPGATHERQD'. Intrinsic: '_mm256_mask_i64gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaskI64gatherEpi64 ¶

func M256MaskI64gatherEpi64(src x86.M256i, base_addr *int, vindex x86.M256i, mask x86.M256i, scale int) (dst x86.M256i)

M256MaskI64gatherEpi64: Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:256] := 0
dst[MAX:256] := 0

Instruction: 'VPGATHERQQ'. Intrinsic: '_mm256_mask_i64gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaskloadEpi32 ¶

func M256MaskloadEpi32(mem_addr *int, mask x86.M256i) (dst x86.M256i)

M256MaskloadEpi32: Load packed 32-bit integers from memory into 'dst' using 'mask' (elements are zeroed out when the highest bit is not set in the corresponding element).

FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMASKMOVD'. Intrinsic: '_mm256_maskload_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaskloadEpi64 ¶

func M256MaskloadEpi64(mem_addr *int, mask x86.M256i) (dst x86.M256i)

M256MaskloadEpi64: Load packed 64-bit integers from memory into 'dst' using 'mask' (elements are zeroed out when the highest bit is not set in the corresponding element).

FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMASKMOVQ'. Intrinsic: '_mm256_maskload_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaskstoreEpi32 ¶

func M256MaskstoreEpi32(mem_addr *int, mask x86.M256i, a x86.M256i)

M256MaskstoreEpi32: Store packed 32-bit integers from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element).

FOR j := 0 to 7
	i := j*32
	IF mask[i+31]
		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
	FI
ENDFOR

Instruction: 'VPMASKMOVD'. Intrinsic: '_mm256_maskstore_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaskstoreEpi64 ¶

func M256MaskstoreEpi64(mem_addr *int64, mask x86.M256i, a x86.M256i)

M256MaskstoreEpi64: Store packed 64-bit integers from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element).

FOR j := 0 to 3
	i := j*64
	IF mask[i+63]
		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
	FI
ENDFOR

Instruction: 'VPMASKMOVQ'. Intrinsic: '_mm256_maskstore_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256MaxEpi16 ¶

func M256MaxEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm256_max_epi16'. Requires AVX2.

func M256MaxEpi32 ¶

func M256MaxEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] > b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSD'. Intrinsic: '_mm256_max_epi32'. Requires AVX2.

func M256MaxEpi8 ¶

func M256MaxEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm256_max_epi8'. Requires AVX2.

func M256MaxEpu16 ¶

func M256MaxEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm256_max_epu16'. Requires AVX2.

func M256MaxEpu32 ¶

func M256MaxEpu32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] > b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUD'. Intrinsic: '_mm256_max_epu32'. Requires AVX2.

func M256MaxEpu8 ¶

func M256MaxEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm256_max_epu8'. Requires AVX2.

func M256MinEpi16 ¶

func M256MinEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm256_min_epi16'. Requires AVX2.

func M256MinEpi32 ¶

func M256MinEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpi32: Compare packed 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] < b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSD'. Intrinsic: '_mm256_min_epi32'. Requires AVX2.

func M256MinEpi8 ¶

func M256MinEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm256_min_epi8'. Requires AVX2.

func M256MinEpu16 ¶

func M256MinEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm256_min_epu16'. Requires AVX2.

func M256MinEpu32 ¶

func M256MinEpu32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpu32: Compare packed unsigned 32-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF a[i+31:i] < b[i+31:i]
		dst[i+31:i] := a[i+31:i]
	ELSE
		dst[i+31:i] := b[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUD'. Intrinsic: '_mm256_min_epu32'. Requires AVX2.

func M256MinEpu8 ¶

func M256MinEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 31
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm256_min_epu8'. Requires AVX2.

func M256MovemaskEpi8 ¶

func M256MovemaskEpi8(a x86.M256i) int

M256MovemaskEpi8: Create mask from the most significant bit of each 8-bit element in 'a', and store the result in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[j] := a[i+7]
ENDFOR

Instruction: 'VPMOVMSKB'. Intrinsic: '_mm256_movemask_epi8'. Requires AVX2.

func M256MpsadbwEpu8 ¶

func M256MpsadbwEpu8(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MpsadbwEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst'.

Eight SADs are performed for each 128-bit lane using one quadruplet from 'b' and eight quadruplets from 'a'. One quadruplet is selected from 'b' starting at the offset specified in 'imm8'. Eight quadruplets are formed from sequential 8-bit integers selected from 'a' starting at the offset specified in 'imm8'.

MPSADBW(a[127:0], b[127:0], imm8[2:0]) {
	a_offset := imm8[2]*32
	b_offset := imm8[1:0]*32
	FOR j := 0 to 7
		i := j*8
		k := a_offset+i
		l := b_offset
		tmp[i*2+15:i*2] := ABS(a[k+7:k] - b[l+7:l]) + ABS(a[k+15:k+8] - b[l+15:l+8]) + ABS(a[k+23:k+16] - b[l+23:l+16]) + ABS(a[k+31:k+24] - b[l+31:l+24])
	ENDFOR
	RETURN tmp[127:0]
}

dst[127:0] := MPSADBW(a[127:0], b[127:0], imm8[2:0])
dst[255:128] := MPSADBW(a[255:128], b[255:128], imm8[5:3])
dst[MAX:256] := 0

Instruction: 'VMPSADBW'. Intrinsic: '_mm256_mpsadbw_epu8'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256MulEpi32 ¶

func M256MulEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulEpi32: Multiply the low 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the signed 64-bit results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULDQ'. Intrinsic: '_mm256_mul_epi32'. Requires AVX2.

func M256MulEpu32 ¶

func M256MulEpu32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulEpu32: Multiply the low unsigned 32-bit integers from each packed 64-bit element in 'a' and 'b', and store the unsigned 64-bit results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+31:i] * b[i+31:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULUDQ'. Intrinsic: '_mm256_mul_epu32'. Requires AVX2.

func M256MulhiEpi16 ¶

func M256MulhiEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 15
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm256_mulhi_epi16'. Requires AVX2.

func M256MulhiEpu16 ¶

func M256MulhiEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 15
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm256_mulhi_epu16'. Requires AVX2.

func M256MulhrsEpi16 ¶

func M256MulhrsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst'.

FOR j := 0 to 15
	i := j*16
	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
	dst[i+15:i] := tmp[16:1]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm256_mulhrs_epi16'. Requires AVX2.

func M256MulloEpi16 ¶

func M256MulloEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 15
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[15:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm256_mullo_epi16'. Requires AVX2.

func M256MulloEpi32 ¶

func M256MulloEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MulloEpi32: Multiply the packed 32-bit integers in 'a' and 'b', producing intermediate 64-bit integers, and store the low 32 bits of the intermediate integers in 'dst'.

FOR j := 0 to 7
	i := j*32
	tmp[63:0] := a[i+31:i] * b[i+31:i]
	dst[i+31:i] := tmp[31:0]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLD'. Intrinsic: '_mm256_mullo_epi32'. Requires AVX2.

func M256OrSi256 ¶

func M256OrSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256OrSi256: Compute the bitwise OR of 256 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[255:0] := (a[255:0] OR b[255:0])
dst[MAX:256] := 0

Instruction: 'VPOR'. Intrinsic: '_mm256_or_si256'. Requires AVX2.

func M256PacksEpi16 ¶

func M256PacksEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256PacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst'.

dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
dst[MAX:256] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm256_packs_epi16'. Requires AVX2.

func M256PacksEpi32 ¶

func M256PacksEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256PacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst'.

dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
dst[MAX:256] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm256_packs_epi32'. Requires AVX2.

func M256PackusEpi16 ¶

func M256PackusEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256PackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst'.

dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
dst[MAX:256] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm256_packus_epi16'. Requires AVX2.

func M256PackusEpi32 ¶

func M256PackusEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256PackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst'.

dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
dst[MAX:256] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm256_packus_epi32'. Requires AVX2.

func M256Permute2x128Si256 ¶

func M256Permute2x128Si256(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256Permute2x128Si256: Shuffle 128-bits (composed of integer data) selected by 'imm8' from 'a' and 'b', and store the results in 'dst'.

SELECT4(src1, src2, control){
	CASE(control[1:0])
	0:	tmp[127:0] := src1[127:0]
	1:	tmp[127:0] := src1[255:128]
	2:	tmp[127:0] := src2[127:0]
	3:	tmp[127:0] := src2[255:128]
	ESAC
	IF control[3]
		tmp[127:0] := 0
	FI
	RETURN tmp[127:0]
}

dst[127:0] := SELECT4(a[255:0], b[255:0], imm8[3:0])
dst[255:128] := SELECT4(a[255:0], b[255:0], imm8[7:4])
dst[MAX:256] := 0

Instruction: 'VPERM2I128'. Intrinsic: '_mm256_permute2x128_si256'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256Permute4x64Epi64 ¶

func M256Permute4x64Epi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256Permute4x64Epi64: Shuffle 64-bit integers in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0

Instruction: 'VPERMQ'. Intrinsic: '_mm256_permute4x64_epi64'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256Permute4x64Pd ¶

func M256Permute4x64Pd(a x86.M256d, imm8 byte) (dst x86.M256d)

M256Permute4x64Pd: Shuffle double-precision (64-bit) floating-point elements in 'a' across lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[63:0] := src[63:0]
	1:	tmp[63:0] := src[127:64]
	2:	tmp[63:0] := src[191:128]
	3:	tmp[63:0] := src[255:192]
	ESAC
	RETURN tmp[63:0]
}

dst[63:0] := SELECT4(a[255:0], imm8[1:0])
dst[127:64] := SELECT4(a[255:0], imm8[3:2])
dst[191:128] := SELECT4(a[255:0], imm8[5:4])
dst[255:192] := SELECT4(a[255:0], imm8[7:6])
dst[MAX:256] := 0

Instruction: 'VPERMPD'. Intrinsic: '_mm256_permute4x64_pd'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256Permutevar8x32Epi32 ¶

func M256Permutevar8x32Epi32(a x86.M256i, idx x86.M256i) (dst x86.M256i)

M256Permutevar8x32Epi32: Shuffle 32-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMD'. Intrinsic: '_mm256_permutevar8x32_epi32'. Requires AVX2.

func M256Permutevar8x32Ps ¶

func M256Permutevar8x32Ps(a x86.M256, idx x86.M256i) (dst x86.M256)

M256Permutevar8x32Ps: Shuffle single-precision (32-bit) floating-point elements in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	id := idx[i+2:i]*32
	dst[i+31:i] := a[id+31:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMPS'. Intrinsic: '_mm256_permutevar8x32_ps'. Requires AVX2.

func M256SadEpu8 ¶

func M256SadEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SadEpu8: Compute the absolute differences of packed unsigned 8-bit integers in 'a' and 'b', then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in 'dst'.

FOR j := 0 to 31
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
FOR j := 0 to 3
	i := j*64
	dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
	dst[i+63:i+16] := 0
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSADBW'. Intrinsic: '_mm256_sad_epu8'. Requires AVX2.

func M256ShuffleEpi32 ¶

func M256ShuffleEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256ShuffleEpi32: Shuffle 32-bit integers in 'a' within 128-bit lanes using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(a[127:0], imm8[5:4])
dst[127:96] := SELECT4(a[127:0], imm8[7:6])
dst[159:128] := SELECT4(a[255:128], imm8[1:0])
dst[191:160] := SELECT4(a[255:128], imm8[3:2])
dst[223:192] := SELECT4(a[255:128], imm8[5:4])
dst[255:224] := SELECT4(a[255:128], imm8[7:6])
dst[MAX:256] := 0

Instruction: 'VPSHUFD'. Intrinsic: '_mm256_shuffle_epi32'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256ShuffleEpi8 ¶

func M256ShuffleEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256ShuffleEpi8: Shuffle 8-bit integers in 'a' within 128-bit lanes according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*8
	IF b[i+7] == 1
		dst[i+7:i] := 0
	ELSE
		index[3:0] := b[i+3:i]
		dst[i+7:i] := a[index*8+7:index*8]
	FI
	IF b[128+i+7] == 1
		dst[128+i+7:128+i] := 0
	ELSE
		index[3:0] := b[128+i+3:128+i]
		dst[128+i+7:128+i] := a[128+index*8+7:128+index*8]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm256_shuffle_epi8'. Requires AVX2.

func M256ShufflehiEpi16 ¶

func M256ShufflehiEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)

M256ShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst'.

dst[63:0] := a[63:0]
dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
dst[191:128] := a[191:128]
dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
dst[MAX:256] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm256_shufflehi_epi16'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256ShuffleloEpi16 ¶

func M256ShuffleloEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)

M256ShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst'.

dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
dst[255:192] := a[255:192]
dst[MAX:256] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm256_shufflelo_epi16'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SignEpi16 ¶

func M256SignEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SignEpi16: Negate packed 16-bit integers in 'a' when the corresponding signed 16-bit integer in 'b' is negative, and store the results in 'dst'. Elements in 'dst' are zeroed out when the corresponding element in 'b' is zero.

FOR j := 0 to 15
	i := j*16
	IF b[i+15:i] < 0
		dst[i+15:i] := NEG(a[i+15:i])
	ELSE IF b[i+15:i] == 0
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSIGNW'. Intrinsic: '_mm256_sign_epi16'. Requires AVX2.

func M256SignEpi32 ¶

func M256SignEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SignEpi32: Negate packed 32-bit integers in 'a' when the corresponding signed 32-bit integer in 'b' is negative, and store the results in 'dst'. Elements in 'dst' are zeroed out when the corresponding element in 'b' is zero.

FOR j := 0 to 7
	i := j*32
	IF b[i+31:i] < 0
		dst[i+31:i] := NEG(a[i+31:i])
	ELSE IF b[i+31:i] == 0
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := a[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSIGND'. Intrinsic: '_mm256_sign_epi32'. Requires AVX2.

func M256SignEpi8 ¶

func M256SignEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SignEpi8: Negate packed 8-bit integers in 'a' when the corresponding signed 8-bit integer in 'b' is negative, and store the results in 'dst'. Elements in 'dst' are zeroed out when the corresponding element in 'b' is zero.

FOR j := 0 to 31
	i := j*8
	IF b[i+7:i] < 0
		dst[i+7:i] := NEG(a[i+7:i])
	ELSE IF b[i+7:i] == 0
		dst[i+7:i] := 0
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSIGNB'. Intrinsic: '_mm256_sign_epi8'. Requires AVX2.

func M256SllEpi16 ¶

func M256SllEpi16(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF count[63:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm256_sll_epi16'. Requires AVX2.

func M256SllEpi32 ¶

func M256SllEpi32(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SllEpi32: Shift packed 32-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] << count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_sll_epi32'. Requires AVX2.

func M256SllEpi64 ¶

func M256SllEpi64(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SllEpi64: Shift packed 64-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_sll_epi64'. Requires AVX2.

func M256SlliEpi16 ¶

func M256SlliEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF imm8[7:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm256_slli_epi16'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SlliEpi32 ¶

func M256SlliEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SlliEpi32: Shift packed 32-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF imm8[7:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLD'. Intrinsic: '_mm256_slli_epi32'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SlliEpi64 ¶

func M256SlliEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SlliEpi64: Shift packed 64-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLQ'. Intrinsic: '_mm256_slli_epi64'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SlliSi256 ¶

func M256SlliSi256(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SlliSi256: Shift 128-bit lanes in 'a' left by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.

tmp := imm8[7:0]
IF tmp > 15
	tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
dst[255:128] := a[255:128] << (tmp*8)
dst[MAX:256] := 0

Instruction: 'VPSLLDQ'. Intrinsic: '_mm256_slli_si256'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SllvEpi32 ¶

func M256SllvEpi32(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm256_sllv_epi32'. Requires AVX2.

func M256SllvEpi64 ¶

func M256SllvEpi64(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm256_sllv_epi64'. Requires AVX2.

func M256SraEpi16 ¶

func M256SraEpi16(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF count[63:0] > 15
		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0)
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm256_sra_epi16'. Requires AVX2.

func M256SraEpi32 ¶

func M256SraEpi32(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SraEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_sra_epi32'. Requires AVX2.

func M256SraiEpi16 ¶

func M256SraiEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF imm8[7:0] > 15
		dst[i+15:i] := (a[i+15] ? 0xFFFF : 0)
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm256_srai_epi16'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SraiEpi32 ¶

func M256SraiEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SraiEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF imm8[7:0] > 31
		dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0)
	ELSE
		dst[i+31:i] := SignExtend(a[i+31:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAD'. Intrinsic: '_mm256_srai_epi32'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SravEpi32 ¶

func M256SravEpi32(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm256_srav_epi32'. Requires AVX2.

func M256SrlEpi16 ¶

func M256SrlEpi16(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF count[63:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm256_srl_epi16'. Requires AVX2.

func M256SrlEpi32 ¶

func M256SrlEpi32(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SrlEpi32: Shift packed 32-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF count[63:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_srl_epi32'. Requires AVX2.

func M256SrlEpi64 ¶

func M256SrlEpi64(a x86.M256i, count x86.M128i) (dst x86.M256i)

M256SrlEpi64: Shift packed 64-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF count[63:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_srl_epi64'. Requires AVX2.

func M256SrliEpi16 ¶

func M256SrliEpi16(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF imm8[7:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm256_srli_epi16'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SrliEpi32 ¶

func M256SrliEpi32(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SrliEpi32: Shift packed 32-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	IF imm8[7:0] > 31
		dst[i+31:i] := 0
	ELSE
		dst[i+31:i] := ZeroExtend(a[i+31:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLD'. Intrinsic: '_mm256_srli_epi32'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SrliEpi64 ¶

func M256SrliEpi64(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SrliEpi64: Shift packed 64-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	IF imm8[7:0] > 63
		dst[i+63:i] := 0
	ELSE
		dst[i+63:i] := ZeroExtend(a[i+63:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLQ'. Intrinsic: '_mm256_srli_epi64'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SrliSi256 ¶

func M256SrliSi256(a x86.M256i, imm8 byte) (dst x86.M256i)

M256SrliSi256: Shift 128-bit lanes in 'a' right by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.

tmp := imm8[7:0]
IF tmp > 15
	tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
dst[255:128] := a[255:128] >> (tmp*8)
dst[MAX:256] := 0

Instruction: 'VPSRLDQ'. Intrinsic: '_mm256_srli_si256'. Requires AVX2.

FIXME: Requires compiler support (has immediate)

func M256SrlvEpi32 ¶

func M256SrlvEpi32(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm256_srlv_epi32'. Requires AVX2.

func M256SrlvEpi64 ¶

func M256SrlvEpi64(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm256_srlv_epi64'. Requires AVX2.

func M256StreamLoadSi256 ¶

func M256StreamLoadSi256(mem_addr *x86.M256iConst) (dst x86.M256i)

M256StreamLoadSi256: Load 256-bits of integer data from memory into 'dst' using a non-temporal memory hint.

'mem_addr' must be aligned on a 32-byte boundary or a general-protection exception may be generated.

dst[255:0] := MEM[mem_addr+255:mem_addr]
dst[MAX:256] := 0

Instruction: 'VMOVNTDQA'. Intrinsic: '_mm256_stream_load_si256'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func M256SubEpi16 ¶

func M256SubEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm256_sub_epi16'. Requires AVX2.

func M256SubEpi32 ¶

func M256SubEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubEpi32: Subtract packed 32-bit integers in 'b' from packed 32-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBD'. Intrinsic: '_mm256_sub_epi32'. Requires AVX2.

func M256SubEpi64 ¶

func M256SubEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubEpi64: Subtract packed 64-bit integers in 'b' from packed 64-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*64
	dst[i+63:i] := a[i+63:i] - b[i+63:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBQ'. Intrinsic: '_mm256_sub_epi64'. Requires AVX2.

func M256SubEpi8 ¶

func M256SubEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm256_sub_epi8'. Requires AVX2.

func M256SubsEpi16 ¶

func M256SubsEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm256_subs_epi16'. Requires AVX2.

func M256SubsEpi8 ¶

func M256SubsEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm256_subs_epi8'. Requires AVX2.

func M256SubsEpu16 ¶

func M256SubsEpu16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm256_subs_epu16'. Requires AVX2.

func M256SubsEpu8 ¶

func M256SubsEpu8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256SubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm256_subs_epu8'. Requires AVX2.

func M256UnpackhiEpi16 ¶

func M256UnpackhiEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm256_unpackhi_epi16'. Requires AVX2.

func M256UnpackhiEpi32 ¶

func M256UnpackhiEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackhiEpi32: Unpack and interleave 32-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKHDQ'. Intrinsic: '_mm256_unpackhi_epi32'. Requires AVX2.

func M256UnpackhiEpi64 ¶

func M256UnpackhiEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackhiEpi64: Unpack and interleave 64-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[127:64]
	dst[127:64] := src2[127:64]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKHQDQ'. Intrinsic: '_mm256_unpackhi_epi64'. Requires AVX2.

func M256UnpackhiEpi8 ¶

func M256UnpackhiEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm256_unpackhi_epi8'. Requires AVX2.

func M256UnpackloEpi16 ¶

func M256UnpackloEpi16(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm256_unpacklo_epi16'. Requires AVX2.

func M256UnpackloEpi32 ¶

func M256UnpackloEpi32(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackloEpi32: Unpack and interleave 32-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_DWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKLDQ'. Intrinsic: '_mm256_unpacklo_epi32'. Requires AVX2.

func M256UnpackloEpi64 ¶

func M256UnpackloEpi64(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackloEpi64: Unpack and interleave 64-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_QWORDS(src1[127:0], src2[127:0]){
	dst[63:0] := src1[63:0]
	dst[127:64] := src2[63:0]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_QWORDS(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKLQDQ'. Intrinsic: '_mm256_unpacklo_epi64'. Requires AVX2.

func M256UnpackloEpi8 ¶

func M256UnpackloEpi8(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256UnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
dst[MAX:256] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm256_unpacklo_epi8'. Requires AVX2.

func M256XorSi256 ¶

func M256XorSi256(a x86.M256i, b x86.M256i) (dst x86.M256i)

M256XorSi256: Compute the bitwise XOR of 256 bits (representing integer data) in 'a' and 'b', and store the result in 'dst'.

dst[255:0] := (a[255:0] XOR b[255:0])
dst[MAX:256] := 0

Instruction: 'VPXOR'. Intrinsic: '_mm256_xor_si256'. Requires AVX2.

func MaskI32gatherEpi32 ¶

func MaskI32gatherEpi32(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)

MaskI32gatherEpi32: Gather 32-bit integers from memory using 32-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[i+31:i])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

Instruction: 'VPGATHERDD'. Intrinsic: '_mm_mask_i32gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskI32gatherEpi64 ¶

func MaskI32gatherEpi64(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)

MaskI32gatherEpi64: Gather 64-bit integers from memory using 32-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 32-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 1
	i := j*64
	m := j*32
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[m+31:m])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

Instruction: 'VPGATHERDQ'. Intrinsic: '_mm_mask_i32gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskI64gatherEpi32 ¶

func MaskI64gatherEpi32(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)

MaskI64gatherEpi32: Gather 32-bit integers from memory using 64-bit indices. 32-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 1
	i := j*32
	m := j*64
	IF mask[i+31]
		dst[i+31:i] := MEM[base_addr + SignExtend(vindex[m+63:m])*scale]
		mask[i+31] := 0
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
mask[MAX:64] := 0
dst[MAX:64] := 0

Instruction: 'VPGATHERQD'. Intrinsic: '_mm_mask_i64gather_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskI64gatherEpi64 ¶

func MaskI64gatherEpi64(src x86.M128i, base_addr *int, vindex x86.M128i, mask x86.M128i, scale int) (dst x86.M128i)

MaskI64gatherEpi64: Gather 64-bit integers from memory using 64-bit indices. 64-bit elements are loaded from addresses starting at 'base_addr' and offset by each 64-bit element in 'vindex' (each index is scaled by the factor in 'scale'). Gathered elements are merged into 'dst' using 'mask' (elements are copied from 'src' when the highest bit is not set in the corresponding element). 'scale' should be 1, 2, 4 or 8.

FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[base_addr + SignExtend(vindex[i+63:i])*scale]
		mask[i+63] := 0
	ELSE
		dst[i+63:i] := src[i+63:i]
	FI
ENDFOR
mask[MAX:128] := 0
dst[MAX:128] := 0

Instruction: 'VPGATHERQQ'. Intrinsic: '_mm_mask_i64gather_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskloadEpi32 ¶

func MaskloadEpi32(mem_addr *int, mask x86.M128i) (dst x86.M128i)

MaskloadEpi32: Load packed 32-bit integers from memory into 'dst' using 'mask' (elements are zeroed out when the highest bit is not set in the corresponding element).

FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		dst[i+31:i] := MEM[mem_addr+i+31:mem_addr+i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMASKMOVD'. Intrinsic: '_mm_maskload_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskloadEpi64 ¶

func MaskloadEpi64(mem_addr *int, mask x86.M128i) (dst x86.M128i)

MaskloadEpi64: Load packed 64-bit integers from memory into 'dst' using 'mask' (elements are zeroed out when the highest bit is not set in the corresponding element).

FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		dst[i+63:i] := MEM[mem_addr+i+63:mem_addr+i]
	ELSE
		dst[i+63:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMASKMOVQ'. Intrinsic: '_mm_maskload_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskstoreEpi32 ¶

func MaskstoreEpi32(mem_addr *int, mask x86.M128i, a x86.M128i)

MaskstoreEpi32: Store packed 32-bit integers from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element).

FOR j := 0 to 3
	i := j*32
	IF mask[i+31]
		MEM[mem_addr+i+31:mem_addr+i] := a[i+31:i]
	FI
ENDFOR

Instruction: 'VPMASKMOVD'. Intrinsic: '_mm_maskstore_epi32'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaskstoreEpi64 ¶

func MaskstoreEpi64(mem_addr *int64, mask x86.M128i, a x86.M128i)

MaskstoreEpi64: Store packed 64-bit integers from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element).

FOR j := 0 to 1
	i := j*64
	IF mask[i+63]
		MEM[mem_addr+i+63:mem_addr+i] := a[i+63:i]
	FI
ENDFOR

Instruction: 'VPMASKMOVQ'. Intrinsic: '_mm_maskstore_epi64'. Requires AVX2.

FIXME: Will likely need to be reworked (has pointer parameter).

func SllvEpi32 ¶

func SllvEpi32(a x86.M128i, count x86.M128i) (dst x86.M128i)

SllvEpi32: Shift packed 32-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ZeroExtend(a[i+31:i] << count[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVD'. Intrinsic: '_mm_sllv_epi32'. Requires AVX2.

func SllvEpi64 ¶

func SllvEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)

SllvEpi64: Shift packed 64-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] << count[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVQ'. Intrinsic: '_mm_sllv_epi64'. Requires AVX2.

func SravEpi32 ¶

func SravEpi32(a x86.M128i, count x86.M128i) (dst x86.M128i)

SravEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SignExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVD'. Intrinsic: '_mm_srav_epi32'. Requires AVX2.

func SrlvEpi32 ¶

func SrlvEpi32(a x86.M128i, count x86.M128i) (dst x86.M128i)

SrlvEpi32: Shift packed 32-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ZeroExtend(a[i+31:i] >> count[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVD'. Intrinsic: '_mm_srlv_epi32'. Requires AVX2.

func SrlvEpi64 ¶

func SrlvEpi64(a x86.M128i, count x86.M128i) (dst x86.M128i)

SrlvEpi64: Shift packed 64-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ZeroExtend(a[i+63:i] >> count[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVQ'. Intrinsic: '_mm_srlv_epi64'. Requires AVX2.

Types ¶

This section is empty.

Source Files ¶

View all Source files

avx2.go

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL