avx512bw

package
v0.0.0-...-3878f85
Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Documentation

Overview

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CmpEpi16Mask

func CmpEpi16Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_cmp_epi16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
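
As a reference for the predicate table above, the following pure-Go sketch models the documented semantics on plain arrays. It is illustrative only — this package's intrinsics are non-working stubs — and the helper name cmpEpi16Mask is hypothetical.

func cmpEpi16Mask(a, b [8]int16, imm8 byte) (k uint8) {
	for j := 0; j < 8; j++ {
		var r bool
		switch imm8 & 7 { // the guide defines predicates 0 through 7 only
		case 0: // _MM_CMPINT_EQ
			r = a[j] == b[j]
		case 1: // _MM_CMPINT_LT
			r = a[j] < b[j]
		case 2: // _MM_CMPINT_LE
			r = a[j] <= b[j]
		case 3: // _MM_CMPINT_FALSE
			r = false
		case 4: // _MM_CMPINT_NEQ
			r = a[j] != b[j]
		case 5: // _MM_CMPINT_NLT
			r = a[j] >= b[j]
		case 6: // _MM_CMPINT_NLE
			r = a[j] > b[j]
		case 7: // _MM_CMPINT_TRUE
			r = true
		}
		if r {
			k |= 1 << uint(j) // element j sets bit j of the mask
		}
	}
	return k
}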

func CmpEpi8Mask

func CmpEpi8Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)

CmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_cmp_epi8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func CmpEpu16Mask

func CmpEpu16Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

CmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmp_epu16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func CmpEpu8Mask

func CmpEpu8Mask(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)

CmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmp_epu8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func CmpeqEpi16Mask

func CmpeqEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpeq_epi16_mask'. Requires AVX512BW.

func CmpeqEpi8Mask

func CmpeqEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpeq_epi8_mask'. Requires AVX512BW.
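
The mask layout is the same for all of the fixed-predicate compares that follow: element j of the vector sets bit j of the mask. A minimal scalar sketch, with a hypothetical helper name:

func cmpeqEpi8Mask(a, b [16]int8) (k uint16) {
	for j := 0; j < 16; j++ {
		if a[j] == b[j] {
			k |= 1 << uint(j) // one mask bit per byte element
		}
	}
	return k
}

For example, identical inputs yield k == 0xFFFF, and the bits of the mask register above bit 15 stay zero.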

func CmpeqEpu16Mask

func CmpeqEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpeq_epu16_mask'. Requires AVX512BW.

func CmpeqEpu8Mask

func CmpeqEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpeq_epu8_mask'. Requires AVX512BW.

func CmpgeEpi16Mask

func CmpgeEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpge_epi16_mask'. Requires AVX512BW.

func CmpgeEpi8Mask

func CmpgeEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpge_epi8_mask'. Requires AVX512BW.

func CmpgeEpu16Mask

func CmpgeEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpge_epu16_mask'. Requires AVX512BW.

func CmpgeEpu8Mask

func CmpgeEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpge_epu8_mask'. Requires AVX512BW.

func CmpgtEpi16Mask

func CmpgtEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpgt_epi16_mask'. Requires AVX512BW.

func CmpgtEpi8Mask

func CmpgtEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpgt_epi8_mask'. Requires AVX512BW.

func CmpgtEpu16Mask

func CmpgtEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpgt_epu16_mask'. Requires AVX512BW.

func CmpgtEpu8Mask

func CmpgtEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpgt_epu8_mask'. Requires AVX512BW.

func CmpleEpi16Mask

func CmpleEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_cmple_epi16_mask'. Requires AVX512BW.

func CmpleEpi8Mask

func CmpleEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_cmple_epi8_mask'. Requires AVX512BW.

func CmpleEpu16Mask

func CmpleEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmple_epu16_mask'. Requires AVX512BW.

func CmpleEpu8Mask

func CmpleEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmple_epu8_mask'. Requires AVX512BW.

func CmpltEpi16Mask

func CmpltEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_cmplt_epi16_mask'. Requires AVX512BW.

func CmpltEpi8Mask

func CmpltEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_cmplt_epi8_mask'. Requires AVX512BW.

func CmpltEpu16Mask

func CmpltEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmplt_epu16_mask'. Requires AVX512BW.

func CmpltEpu8Mask

func CmpltEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmplt_epu8_mask'. Requires AVX512BW.

func CmpneqEpi16Mask

func CmpneqEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_cmpneq_epi16_mask'. Requires AVX512BW.

func CmpneqEpi8Mask

func CmpneqEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_cmpneq_epi8_mask'. Requires AVX512BW.

func CmpneqEpu16Mask

func CmpneqEpu16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

CmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 7
	i := j*16
	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_cmpneq_epu16_mask'. Requires AVX512BW.

func CmpneqEpu8Mask

func CmpneqEpu8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

CmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*8
	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_cmpneq_epu8_mask'. Requires AVX512BW.

func Cvtepi16Epi8

func Cvtepi16Epi8(a x86.M128i) (dst x86.M128i)

Cvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm_cvtepi16_epi8'. Requires AVX512BW.
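
Truncation here keeps only the low byte of each 16-bit element, which is exactly Go's integer conversion; e.g. int16(300) truncates to int8(44) (0x2C). A hypothetical scalar sketch:

func cvtepi16Epi8(a [8]int16) (dst [8]int8) {
	for j, v := range a {
		dst[j] = int8(v) // Truncate_Int16_To_Int8: keep the low 8 bits
	}
	return dst
}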

func Cvtsepi16Epi8

func Cvtsepi16Epi8(a x86.M128i) (dst x86.M128i)

Cvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm_cvtsepi16_epi8'. Requires AVX512BW.

func Cvtusepi16Epi8

func Cvtusepi16Epi8(a x86.M128i) (dst x86.M128i)

Cvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm_cvtusepi16_epi8'. Requires AVX512BW.
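
The saturating conversions above differ from plain truncation only in clamping out-of-range values. A minimal sketch of the two clamp helpers, with hypothetical names; on 300 the signed clamp returns 127 where truncation returns 44.

func saturateInt16ToInt8(v int16) int8 {
	if v > 127 {
		return 127 // clamp to the int8 maximum
	}
	if v < -128 {
		return -128 // clamp to the int8 minimum
	}
	return int8(v)
}

func saturateUint16ToUint8(v uint16) uint8 {
	if v > 255 {
		return 255 // clamp to the uint8 maximum
	}
	return uint8(v)
}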

func DbsadEpu8

func DbsadEpu8(a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

DbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst'.

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])

FOR j := 0 to 1
	i := j*64
	dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR

dst[MAX:128] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
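
The selection and offset pattern above is easier to follow in scalar form. This hypothetical pure-Go model reproduces the documented 128-bit behaviour: each 2-bit field of 'imm8' picks one 32-bit quadruplet of 'b', and each 64-bit lane then computes four SADs at sliding byte offsets.

func dbsadEpu8(a, b [16]uint8, imm8 byte) (dst [8]uint16) {
	absDiff := func(x, y uint8) uint16 {
		if x >= y {
			return uint16(x - y)
		}
		return uint16(y - x)
	}
	// Gather the four quadruplets of b selected by successive 2-bit fields of imm8.
	var tmp [16]uint8
	for q := 0; q < 4; q++ {
		sel := int(imm8>>(2*uint(q))) & 3
		copy(tmp[4*q:4*q+4], b[4*sel:4*sel+4])
	}
	// Two 64-bit lanes; four SADs per lane.
	for lane := 0; lane < 2; lane++ {
		o := lane * 8
		for s := 0; s < 4; s++ {
			aOff := o + (s/2)*4 // SADs 0-1 read the low quadruplet of a, SADs 2-3 the high one
			tOff := o + s       // the tmp quadruplet slides by one byte per SAD
			var sum uint16
			for t := 0; t < 4; t++ {
				sum += absDiff(a[aOff+t], tmp[tOff+t])
			}
			dst[lane*4+s] = sum
		}
	}
	return dst
}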

func M256CmpEpi16Mask

func M256CmpEpi16Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)

M256CmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmp_epi16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256CmpEpi8Mask

func M256CmpEpi8Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)

M256CmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmp_epi8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256CmpEpu16Mask

func M256CmpEpu16Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)

M256CmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmp_epu16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256CmpEpu8Mask

func M256CmpEpu8Mask(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)

M256CmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmp_epu8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256CmpeqEpi16Mask

func M256CmpeqEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpeq_epi16_mask'. Requires AVX512BW.

func M256CmpeqEpi8Mask

func M256CmpeqEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpeq_epi8_mask'. Requires AVX512BW.

func M256CmpeqEpu16Mask

func M256CmpeqEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpeq_epu16_mask'. Requires AVX512BW.

func M256CmpeqEpu8Mask

func M256CmpeqEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpeq_epu8_mask'. Requires AVX512BW.

func M256CmpgeEpi16Mask

func M256CmpgeEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpge_epi16_mask'. Requires AVX512BW.

func M256CmpgeEpi8Mask

func M256CmpgeEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpge_epi8_mask'. Requires AVX512BW.

func M256CmpgeEpu16Mask

func M256CmpgeEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpge_epu16_mask'. Requires AVX512BW.

func M256CmpgeEpu8Mask

func M256CmpgeEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpge_epu8_mask'. Requires AVX512BW.

func M256CmpgtEpi16Mask

func M256CmpgtEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpgt_epi16_mask'. Requires AVX512BW.

func M256CmpgtEpi8Mask

func M256CmpgtEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpgt_epi8_mask'. Requires AVX512BW.

func M256CmpgtEpu16Mask

func M256CmpgtEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpgt_epu16_mask'. Requires AVX512BW.

func M256CmpgtEpu8Mask

func M256CmpgtEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpgt_epu8_mask'. Requires AVX512BW.

func M256CmpleEpi16Mask

func M256CmpleEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmple_epi16_mask'. Requires AVX512BW.

func M256CmpleEpi8Mask

func M256CmpleEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmple_epi8_mask'. Requires AVX512BW.

func M256CmpleEpu16Mask

func M256CmpleEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmple_epu16_mask'. Requires AVX512BW.

func M256CmpleEpu8Mask

func M256CmpleEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmple_epu8_mask'. Requires AVX512BW.

func M256CmpltEpi16Mask

func M256CmpltEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmplt_epi16_mask'. Requires AVX512BW.

func M256CmpltEpi8Mask

func M256CmpltEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmplt_epi8_mask'. Requires AVX512BW.

func M256CmpltEpu16Mask

func M256CmpltEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmplt_epu16_mask'. Requires AVX512BW.

func M256CmpltEpu8Mask

func M256CmpltEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmplt_epu8_mask'. Requires AVX512BW.

func M256CmpneqEpi16Mask

func M256CmpneqEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_cmpneq_epi16_mask'. Requires AVX512BW.

func M256CmpneqEpi8Mask

func M256CmpneqEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_cmpneq_epi8_mask'. Requires AVX512BW.

func M256CmpneqEpu16Mask

func M256CmpneqEpu16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256CmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 15
	i := j*16
	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_cmpneq_epu16_mask'. Requires AVX512BW.

func M256CmpneqEpu8Mask

func M256CmpneqEpu8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256CmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*8
	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_cmpneq_epu8_mask'. Requires AVX512BW.

func M256Cvtepi16Epi8

func M256Cvtepi16Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm256_cvtepi16_epi8'. Requires AVX512BW.

func M256Cvtsepi16Epi8

func M256Cvtsepi16Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm256_cvtsepi16_epi8'. Requires AVX512BW.

func M256Cvtusepi16Epi8

func M256Cvtusepi16Epi8(a x86.M256i) (dst x86.M128i)

M256Cvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm256_cvtusepi16_epi8'. Requires AVX512BW.

func M256DbsadEpu8

func M256DbsadEpu8(a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256DbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst'.

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

FOR j := 0 to 1
	i := j*128
	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR

FOR j := 0 to 3
	i := j*64
	dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm256_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256Mask2Permutex2varEpi16

func M256Mask2Permutex2varEpi16(a x86.M256i, idx x86.M256i, k x86.Mmask16, b x86.M256i) (dst x86.M256i)

M256Mask2Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		off := 16*idx[i+3:i]
		dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := idx[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2W'. Intrinsic: '_mm256_mask2_permutex2var_epi16'. Requires AVX512BW.
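
Each 16-bit index element acts as both selector and offset: bits 3:0 address one of sixteen elements, and bit 4 chooses between the two source tables. A hypothetical scalar model of the documented semantics:

func mask2Permutex2varEpi16(a [16]int16, idx [16]uint16, k uint16, b [16]int16) (dst [16]int16) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) == 0 {
			dst[j] = int16(idx[j]) // masked-off elements keep the index value (VPERMI2W overwrites idx)
			continue
		}
		off := int(idx[j] & 0xF) // idx[i+3:i] selects the element
		if idx[j]&0x10 != 0 {    // idx[i+4] selects the table
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return dst
}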

func M256MaskAbsEpi16

func M256MaskAbsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i) (dst x86.M256i)

M256MaskAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := ABS(a[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm256_mask_abs_epi16'. Requires AVX512BW.

func M256MaskAbsEpi8

func M256MaskAbsEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i) (dst x86.M256i)

M256MaskAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := ABS(a[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm256_mask_abs_epi8'. Requires AVX512BW.

func M256MaskAddEpi16

func M256MaskAddEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] + b[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm256_mask_add_epi16'. Requires AVX512BW.
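
The same writemask pattern recurs through all of the masked arithmetic below: lanes whose mask bit is set receive the computed value, and the rest are copied from 'src'. A hypothetical scalar sketch for the 16-bit add:

func maskAddEpi16(src [16]int16, k uint16, a, b [16]int16) (dst [16]int16) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[j] + b[j] // wraps on overflow, as VPADDW does
		} else {
			dst[j] = src[j] // masked-off lane: copy the src element
		}
	}
	return dst
}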

func M256MaskAddEpi8

func M256MaskAddEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] + b[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm256_mask_add_epi8'. Requires AVX512BW.

func M256MaskAddsEpi16

func M256MaskAddsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm256_mask_adds_epi16'. Requires AVX512BW.

func M256MaskAddsEpi8

func M256MaskAddsEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm256_mask_adds_epi8'. Requires AVX512BW.
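
Saturate_To_Int8 differs from the wrapping add above only in clamping the widened sum. A hypothetical per-element helper:

func addsEpi8(a, b int8) int8 {
	s := int16(a) + int16(b) // widen so the sum cannot wrap
	if s > 127 {
		return 127
	}
	if s < -128 {
		return -128
	}
	return int8(s)
}

For example, addsEpi8(100, 100) yields 127, where a wrapping add would give -56.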

func M256MaskAddsEpu16

func M256MaskAddsEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm256_mask_adds_epu16'. Requires AVX512BW.

func M256MaskAddsEpu8

func M256MaskAddsEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm256_mask_adds_epu8'. Requires AVX512BW.

func M256MaskAlignrEpi8

func M256MaskAlignrEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i, count int) (dst x86.M256i)

M256MaskAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*128
	tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
	tmp_dst[i+127:i] := tmp[127:0]
ENDFOR

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm256_mask_alignr_epi8'. Requires AVX512BW.
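
The concatenate-and-shift step operates on each 128-bit pair independently. A hypothetical byte-level model of one such pair, where 'count' bytes are shifted out of the 32-byte value formed by 'a' (high half) and 'b' (low half):

func alignr128(a, b [16]uint8, count uint) (dst [16]uint8) {
	for j := 0; j < 16; j++ {
		pos := int(count) + j // position in the 32-byte concatenation: b is bytes 0-15, a is bytes 16-31
		switch {
		case pos < 16:
			dst[j] = b[pos]
		case pos < 32:
			dst[j] = a[pos-16]
		default:
			dst[j] = 0 // shifted past the end of the concatenation
		}
	}
	return dst
}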

func M256MaskAvgEpu16

func M256MaskAvgEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm256_mask_avg_epu16'. Requires AVX512BW.

func M256MaskAvgEpu8

func M256MaskAvgEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm256_mask_avg_epu8'. Requires AVX512BW.
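
The '(a + b + 1) >> 1' form is a rounding average; widening before the add preserves the carry. Per element, as a hypothetical helper:

func avgEpu8(a, b uint8) uint8 {
	return uint8((uint16(a) + uint16(b) + 1) >> 1) // rounds halves up, e.g. avgEpu8(1, 2) == 2
}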

func M256MaskBlendEpi16

func M256MaskBlendEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskBlendEpi16: Blend packed 16-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := b[i+15:i]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDMW'. Intrinsic: '_mm256_mask_blend_epi16'. Requires AVX512BW.

func M256MaskBlendEpi8

func M256MaskBlendEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskBlendEpi8: Blend packed 8-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := b[i+7:i]
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBLENDMB'. Intrinsic: '_mm256_mask_blend_epi8'. Requires AVX512BW.
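
Blend is the masked select without a separate 'src' operand: the mask chooses between 'a' and 'b' per element. A hypothetical sketch:

func maskBlendEpi8(k uint32, a, b [32]int8) (dst [32]int8) {
	for j := 0; j < 32; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = b[j] // mask bit set: take b
		} else {
			dst[j] = a[j] // mask bit clear: take a
		}
	}
	return dst
}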

func M256MaskBroadcastbEpi8

func M256MaskBroadcastbEpi8(src x86.M256i, k x86.Mmask32, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_mask_broadcastb_epi8'. Requires AVX512BW.

func M256MaskBroadcastwEpi16

func M256MaskBroadcastwEpi16(src x86.M256i, k x86.Mmask16, a x86.M128i) (dst x86.M256i)

M256MaskBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_mask_broadcastw_epi16'. Requires AVX512BW.

func M256MaskCmpEpi16Mask

func M256MaskCmpEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)

M256MaskCmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmp_epi16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
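
The zeromasked compares below all follow one shape: an element can only set its result bit when the corresponding 'k1' bit is set, so the result is the unmasked compare ANDed with 'k1'. A hypothetical scalar sketch using the less-than predicate:

func maskCmpltEpi16Mask(k1 uint16, a, b [16]int16) (k uint16) {
	for j := 0; j < 16; j++ {
		if k1&(1<<uint(j)) != 0 && a[j] < b[j] {
			k |= 1 << uint(j) // masked-off elements stay zero
		}
	}
	return k
}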

func M256MaskCmpEpi8Mask

func M256MaskCmpEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)

M256MaskCmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmp_epi8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpEpu16Mask

func M256MaskCmpEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask16)

M256MaskCmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmp_epu16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpEpu8Mask

func M256MaskCmpEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.Mmask32)

M256MaskCmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmp_epu8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskCmpeqEpi16Mask

func M256MaskCmpeqEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpeq_epi16_mask'. Requires AVX512BW.

func M256MaskCmpeqEpi8Mask

func M256MaskCmpeqEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpeq_epi8_mask'. Requires AVX512BW.

func M256MaskCmpeqEpu16Mask

func M256MaskCmpeqEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpeq_epu16_mask'. Requires AVX512BW.

func M256MaskCmpeqEpu8Mask

func M256MaskCmpeqEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpeq_epu8_mask'. Requires AVX512BW.

func M256MaskCmpgeEpi16Mask

func M256MaskCmpgeEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpge_epi16_mask'. Requires AVX512BW.

func M256MaskCmpgeEpi8Mask

func M256MaskCmpgeEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpge_epi8_mask'. Requires AVX512BW.

func M256MaskCmpgeEpu16Mask

func M256MaskCmpgeEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpge_epu16_mask'. Requires AVX512BW.

func M256MaskCmpgeEpu8Mask

func M256MaskCmpgeEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpge_epu8_mask'. Requires AVX512BW.

func M256MaskCmpgtEpi16Mask

func M256MaskCmpgtEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpgt_epi16_mask'. Requires AVX512BW.

func M256MaskCmpgtEpi8Mask

func M256MaskCmpgtEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpgt_epi8_mask'. Requires AVX512BW.

func M256MaskCmpgtEpu16Mask

func M256MaskCmpgtEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpgt_epu16_mask'. Requires AVX512BW.

func M256MaskCmpgtEpu8Mask

func M256MaskCmpgtEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpgt_epu8_mask'. Requires AVX512BW.

func M256MaskCmpleEpi16Mask

func M256MaskCmpleEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmple_epi16_mask'. Requires AVX512BW.

func M256MaskCmpleEpi8Mask

func M256MaskCmpleEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmple_epi8_mask'. Requires AVX512BW.

func M256MaskCmpleEpu16Mask

func M256MaskCmpleEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmple_epu16_mask'. Requires AVX512BW.

func M256MaskCmpleEpu8Mask

func M256MaskCmpleEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmple_epu8_mask'. Requires AVX512BW.

func M256MaskCmpltEpi16Mask

func M256MaskCmpltEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmplt_epi16_mask'. Requires AVX512BW.

func M256MaskCmpltEpi8Mask

func M256MaskCmpltEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmplt_epi8_mask'. Requires AVX512BW.

func M256MaskCmpltEpu16Mask

func M256MaskCmpltEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmplt_epu16_mask'. Requires AVX512BW.

func M256MaskCmpltEpu8Mask

func M256MaskCmpltEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmplt_epu8_mask'. Requires AVX512BW.

func M256MaskCmpneqEpi16Mask

func M256MaskCmpneqEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm256_mask_cmpneq_epi16_mask'. Requires AVX512BW.

func M256MaskCmpneqEpi8Mask

func M256MaskCmpneqEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm256_mask_cmpneq_epi8_mask'. Requires AVX512BW.

func M256MaskCmpneqEpu16Mask

func M256MaskCmpneqEpu16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskCmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm256_mask_cmpneq_epu16_mask'. Requires AVX512BW.

func M256MaskCmpneqEpu8Mask

func M256MaskCmpneqEpu8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskCmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm256_mask_cmpneq_epu8_mask'. Requires AVX512BW.

func M256MaskCvtepi16Epi8

func M256MaskCvtepi16Epi8(src x86.M128i, k x86.Mmask16, a x86.M256i) (dst x86.M128i)

M256MaskCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm256_mask_cvtepi16_epi8'. Requires AVX512BW.

func M256MaskCvtepi8Epi16

func M256MaskCvtepi8Epi16(src x86.M256i, k x86.Mmask16, a x86.M128i) (dst x86.M256i)

M256MaskCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := SignExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm256_mask_cvtepi8_epi16'. Requires AVX512BW.

func M256MaskCvtepu8Epi16

func M256MaskCvtepu8Epi16(src x86.M256i, k x86.Mmask16, a x86.M128i) (dst x86.M256i)

M256MaskCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := ZeroExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm256_mask_cvtepu8_epi16'. Requires AVX512BW.

func M256MaskCvtsepi16Epi8

func M256MaskCvtsepi16Epi8(src x86.M128i, k x86.Mmask16, a x86.M256i) (dst x86.M128i)

M256MaskCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm256_mask_cvtsepi16_epi8'. Requires AVX512BW.

func M256MaskCvtusepi16Epi8

func M256MaskCvtusepi16Epi8(src x86.M128i, k x86.Mmask16, a x86.M256i) (dst x86.M128i)

M256MaskCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm256_mask_cvtusepi16_epi8'. Requires AVX512BW.

func M256MaskDbsadEpu8

func M256MaskDbsadEpu8(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

FOR j := 0 to 1
	i := j*128
	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR

FOR j := 0 to 3
	i := j*64
	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm256_mask_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
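
The 'select' helper in the pseudocode is not spelled out. Assuming it extracts the aligned 32-bit quadruplet of a 128-bit lane chosen by a 2-bit control field (the usual reading of VDBPSADBW), scalar sketches of the two building blocks look like:

// selectQuadruplet models select(lane, control): pick one of the four
// aligned byte quadruplets of a 128-bit lane, indexed by a 2-bit control.
func selectQuadruplet(lane [16]uint8, control uint8) [4]uint8 {
	off := int(control&3) * 4
	var quad [4]uint8
	copy(quad[:], lane[off:off+4])
	return quad
}

// absDiff is the ABS(x - y) step on unsigned bytes, widened so the
// four-term sums fit in 16 bits.
func absDiff(x, y uint8) uint16 {
	if x > y {
		return uint16(x - y)
	}
	return uint16(y - x)
}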

func M256MaskMaddEpi16

func M256MaskMaddEpi16(src x86.M256i, k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm256_mask_madd_epi16'. Requires AVX512BW.
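
Because the products are formed at 32-bit width before the horizontal add, no intermediate saturation is needed. A scalar model of the masked form (illustrative names only):

// maddEpi16 models VPMADDWD with a writemask: each 32-bit result lane
// is the sum of two adjacent 16x16->32 products, or src when masked off.
func maddEpi16(src [8]int32, k uint8, a, b [16]int16) [8]int32 {
	dst := src
	for j := uint(0); j < 8; j++ {
		if k&(1<<j) != 0 {
			dst[j] = int32(a[2*j])*int32(b[2*j]) + int32(a[2*j+1])*int32(b[2*j+1])
		}
	}
	return dst
}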

func M256MaskMaddubsEpi16

func M256MaskMaddubsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm256_mask_maddubs_epi16'. Requires AVX512BW.

func M256MaskMaxEpi16

func M256MaskMaxEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm256_mask_max_epi16'. Requires AVX512BW.

func M256MaskMaxEpi8

func M256MaskMaxEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm256_mask_max_epi8'. Requires AVX512BW.

func M256MaskMaxEpu16

func M256MaskMaxEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm256_mask_max_epu16'. Requires AVX512BW.

func M256MaskMaxEpu8

func M256MaskMaxEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm256_mask_max_epu8'. Requires AVX512BW.

func M256MaskMinEpi16

func M256MaskMinEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm256_mask_min_epi16'. Requires AVX512BW.

func M256MaskMinEpi8

func M256MaskMinEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm256_mask_min_epi8'. Requires AVX512BW.

func M256MaskMinEpu16

func M256MaskMinEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm256_mask_min_epu16'. Requires AVX512BW.

func M256MaskMinEpu8

func M256MaskMinEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm256_mask_min_epu8'. Requires AVX512BW.

func M256MaskMovEpi16

func M256MaskMovEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i) (dst x86.M256i)

M256MaskMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQU16'. Intrinsic: '_mm256_mask_mov_epi16'. Requires AVX512BW.

func M256MaskMovEpi8

func M256MaskMovEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i) (dst x86.M256i)

M256MaskMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQU8'. Intrinsic: '_mm256_mask_mov_epi8'. Requires AVX512BW.

func M256MaskMulhiEpi16

func M256MaskMulhiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm256_mask_mulhi_epi16'. Requires AVX512BW.

func M256MaskMulhiEpu16

func M256MaskMulhiEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm256_mask_mulhi_epu16'. Requires AVX512BW.

func M256MaskMulhrsEpi16

func M256MaskMulhrsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
		dst[i+15:i] := tmp[16:1]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm256_mask_mulhrs_epi16'. Requires AVX512BW.
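
The mulhrs rounding can be read directly off the pseudocode: shift the 32-bit product down by 14 (leaving 18 significant bits), add 1 to round, then keep bits [16:1]. A one-lane scalar sketch:

// mulhrsLane models one lane of VPMULHRSW: round-to-nearest scaling of
// the Q15 product. Taking bits [16:1] is the same as a final >> 1.
func mulhrsLane(a, b int16) int16 {
	tmp := (int32(a)*int32(b))>>14 + 1
	return int16(tmp >> 1)
}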

func M256MaskMulloEpi16

func M256MaskMulloEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm256_mask_mullo_epi16'. Requires AVX512BW.

func M256MaskPacksEpi16

func M256MaskPacksEpi16(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm256_mask_packs_epi16'. Requires AVX512BW.
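
The pack operations lean on a signed-saturation helper; in Go it would look roughly like this (a sketch of the Saturate_Int16_To_Int8 step, not code from this package):

// saturateInt16ToInt8 clamps a 16-bit value into the int8 range,
// mirroring Saturate_Int16_To_Int8 in the pseudocode above.
func saturateInt16ToInt8(v int16) int8 {
	switch {
	case v > 127:
		return 127
	case v < -128:
		return -128
	default:
		return int8(v)
	}
}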

func M256MaskPacksEpi32

func M256MaskPacksEpi32(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm256_mask_packs_epi32'. Requires AVX512BW.

func M256MaskPackusEpi16

func M256MaskPackusEpi16(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm256_mask_packus_epi16'. Requires AVX512BW.

func M256MaskPackusEpi32

func M256MaskPackusEpi32(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm256_mask_packus_epi32'. Requires AVX512BW.

func M256MaskPermutex2varEpi16

func M256MaskPermutex2varEpi16(a x86.M256i, k x86.Mmask16, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		off := 16*idx[i+3:i]
		dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMT2W'. Intrinsic: '_mm256_mask_permutex2var_epi16'. Requires AVX512BW.

func M256MaskPermutexvarEpi16

func M256MaskPermutexvarEpi16(src x86.M256i, k x86.Mmask16, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	id := idx[i+3:i]*16
	IF k[j]
		dst[i+15:i] := a[id+15:id]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm256_mask_permutexvar_epi16'. Requires AVX512BW.

func M256MaskSet1Epi16

func M256MaskSet1Epi16(src x86.M256i, k x86.Mmask16, a int16) (dst x86.M256i)

M256MaskSet1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_mask_set1_epi16'. Requires AVX512BW.

func M256MaskSet1Epi8

func M256MaskSet1Epi8(src x86.M256i, k x86.Mmask32, a byte) (dst x86.M256i)

M256MaskSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_mask_set1_epi8'. Requires AVX512BW.

func M256MaskShuffleEpi8

func M256MaskShuffleEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF b[i+7] == 1
			dst[i+7:i] := 0
		ELSE
			index[3:0] := b[i+3:i]
			dst[i+7:i] := a[index*8+7:index*8]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm256_mask_shuffle_epi8'. Requires AVX512BW.
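
The shuffle control bytes behave as in SSSE3 PSHUFB, independently per 128-bit lane: bit 7 zeroes the destination byte, otherwise bits 3:0 index a byte within the same lane. A scalar model of one 16-byte lane (hypothetical helper):

// shuffleLane models VPSHUFB within one 128-bit lane: for each control
// byte, bit 7 forces zero, else bits 3:0 select a source byte.
func shuffleLane(a, b [16]uint8) [16]uint8 {
	var dst [16]uint8
	for j := 0; j < 16; j++ {
		if b[j]&0x80 == 0 {
			dst[j] = a[b[j]&0x0f]
		}
	}
	return dst
}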

func M256MaskShufflehiEpi16

func M256MaskShufflehiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm256_mask_shufflehi_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskShuffleloEpi16

func M256MaskShuffleloEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm256_mask_shufflelo_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskSllEpi16

func M256MaskSllEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm256_mask_sll_epi16'. Requires AVX512BW.
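
Note that 'count' here is a single 64-bit shift amount applied to every lane, and any count above 15 clears the lane rather than wrapping modulo the element width. A one-lane scalar sketch of that saturating behaviour:

// sllLane models one lane of VPSLLW with a register count: shift
// amounts above 15 produce zero instead of wrapping.
func sllLane(a uint16, count uint64) uint16 {
	if count > 15 {
		return 0
	}
	return a << count
}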

func M256MaskSlliEpi16

func M256MaskSlliEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm256_mask_slli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskSllvEpi16

func M256MaskSllvEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm256_mask_sllv_epi16'. Requires AVX512BW.

func M256MaskSraEpi16

func M256MaskSraEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm256_mask_sra_epi16'. Requires AVX512BW.

func M256MaskSraiEpi16

func M256MaskSraiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm256_mask_srai_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskSravEpi16

func M256MaskSravEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm256_mask_srav_epi16'. Requires AVX512BW.

func M256MaskSrlEpi16

func M256MaskSrlEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm256_mask_srl_epi16'. Requires AVX512BW.

func M256MaskSrliEpi16

func M256MaskSrliEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm256_mask_srli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskSrlvEpi16

func M256MaskSrlvEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm256_mask_srlv_epi16'. Requires AVX512BW.

func M256MaskSubEpi16

func M256MaskSubEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] - b[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm256_mask_sub_epi16'. Requires AVX512BW.

func M256MaskSubEpi8

func M256MaskSubEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] - b[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm256_mask_sub_epi8'. Requires AVX512BW.

func M256MaskSubsEpi16

func M256MaskSubsEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm256_mask_subs_epi16'. Requires AVX512BW.

func M256MaskSubsEpi8

func M256MaskSubsEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm256_mask_subs_epi8'. Requires AVX512BW.

func M256MaskSubsEpu16

func M256MaskSubsEpu16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm256_mask_subs_epu16'. Requires AVX512BW.

func M256MaskSubsEpu8

func M256MaskSubsEpu8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm256_mask_subs_epu8'. Requires AVX512BW.
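
Unsigned saturating subtract clamps at zero, which in Go falls out of a single comparison (a one-lane sketch with a hypothetical helper name):

// subsEpu8 models Saturate_To_UnsignedInt8(a - b) for one lane:
// unsigned subtraction clamps at 0 instead of wrapping.
func subsEpu8(a, b uint8) uint8 {
	if a < b {
		return 0
	}
	return a - b
}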

func M256MaskTestEpi16Mask

func M256MaskTestEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskTestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTMW'. Intrinsic: '_mm256_mask_test_epi16_mask'. Requires AVX512BW.

func M256MaskTestEpi8Mask

func M256MaskTestEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskTestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTMB'. Intrinsic: '_mm256_mask_test_epi8_mask'. Requires AVX512BW.

func M256MaskTestnEpi16Mask

func M256MaskTestnEpi16Mask(k1 x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256MaskTestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 15
	i := j*16
	IF k1[j]
		k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMW'. Intrinsic: '_mm256_mask_testn_epi16_mask'. Requires AVX512BW.

func M256MaskTestnEpi8Mask

func M256MaskTestnEpi8Mask(k1 x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256MaskTestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 31
	i := j*8
	IF k1[j]
		k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTNMB'. Intrinsic: '_mm256_mask_testn_epi8_mask'. Requires AVX512BW.
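
Despite the NAND wording, testn simply sets a mask bit where 'a' and 'b' share no set bits, gated by 'k1'. A scalar model (illustrative, not the intrinsic itself):

// testnEpi8Mask models VPTESTNMB: lane j sets its mask bit when k1
// selects it and a[j] AND b[j] is zero.
func testnEpi8Mask(k1 uint32, a, b [32]uint8) (k uint32) {
	for j := uint(0); j < 32; j++ {
		if k1&(1<<j) != 0 && a[j]&b[j] == 0 {
			k |= 1 << j
		}
	}
	return k
}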

func M256MaskUnpackhiEpi16

func M256MaskUnpackhiEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm256_mask_unpackhi_epi16'. Requires AVX512BW.

func M256MaskUnpackhiEpi8

func M256MaskUnpackhiEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm256_mask_unpackhi_epi8'. Requires AVX512BW.

func M256MaskUnpackloEpi16

func M256MaskUnpackloEpi16(src x86.M256i, k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm256_mask_unpacklo_epi16'. Requires AVX512BW.

func M256MaskUnpackloEpi8

func M256MaskUnpackloEpi8(src x86.M256i, k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm256_mask_unpacklo_epi8'. Requires AVX512BW.

func M256MaskzAbsEpi16

func M256MaskzAbsEpi16(k x86.Mmask16, a x86.M256i) (dst x86.M256i)

M256MaskzAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := ABS(a[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm256_maskz_abs_epi16'. Requires AVX512BW.

func M256MaskzAbsEpi8

func M256MaskzAbsEpi8(k x86.Mmask32, a x86.M256i) (dst x86.M256i)

M256MaskzAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := ABS(a[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm256_maskz_abs_epi8'. Requires AVX512BW.

func M256MaskzAddEpi16

func M256MaskzAddEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] + b[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm256_maskz_add_epi16'. Requires AVX512BW.

func M256MaskzAddEpi8

func M256MaskzAddEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] + b[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm256_maskz_add_epi8'. Requires AVX512BW.

func M256MaskzAddsEpi16

func M256MaskzAddsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm256_maskz_adds_epi16'. Requires AVX512BW.

func M256MaskzAddsEpi8

func M256MaskzAddsEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm256_maskz_adds_epi8'. Requires AVX512BW.

func M256MaskzAddsEpu16

func M256MaskzAddsEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm256_maskz_adds_epu16'. Requires AVX512BW.

func M256MaskzAddsEpu8

func M256MaskzAddsEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm256_maskz_adds_epu8'. Requires AVX512BW.

func M256MaskzAlignrEpi8

func M256MaskzAlignrEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i, count int) (dst x86.M256i)

M256MaskzAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 1
	i := j*128
	tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
	tmp_dst[i+127:i] := tmp[127:0]
ENDFOR

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm256_maskz_alignr_epi8'. Requires AVX512BW.
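
Each 128-bit lane is handled independently: the matching halves of 'a' and 'b' are concatenated and the 32-byte result is shifted right by whole bytes. A scalar sketch of one lane (the helper name is illustrative):

// alignrLane models VPALIGNR for one 128-bit lane: concatenate the
// a-lane above the b-lane (32 bytes total) and take 16 bytes starting
// at byte offset count. Offsets of 32 or more yield all zeros.
func alignrLane(a, b [16]uint8, count uint8) [16]uint8 {
	var concat [32]uint8 // b is the low half, a the high half
	copy(concat[:16], b[:])
	copy(concat[16:], a[:])
	var dst [16]uint8
	for j := 0; j < 16; j++ {
		if idx := int(count) + j; idx < 32 {
			dst[j] = concat[idx]
		}
	}
	return dst
}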

func M256MaskzAvgEpu16

func M256MaskzAvgEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm256_maskz_avg_epu16'. Requires AVX512BW.

func M256MaskzAvgEpu8

func M256MaskzAvgEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm256_maskz_avg_epu8'. Requires AVX512BW.
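
The average rounds up on ties because of the '+ 1' before the shift. A one-lane scalar sketch:

// avgEpu8 models VPAVGB for one lane: (a + b + 1) >> 1 computed at
// 16-bit width so the sum cannot overflow.
func avgEpu8(a, b uint8) uint8 {
	return uint8((uint16(a) + uint16(b) + 1) >> 1)
}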

func M256MaskzBroadcastbEpi8

func M256MaskzBroadcastbEpi8(k x86.Mmask32, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_maskz_broadcastb_epi8'. Requires AVX512BW.

func M256MaskzBroadcastwEpi16

func M256MaskzBroadcastwEpi16(k x86.Mmask16, a x86.M128i) (dst x86.M256i)

M256MaskzBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_maskz_broadcastw_epi16'. Requires AVX512BW.

func M256MaskzCvtepi16Epi8

func M256MaskzCvtepi16Epi8(k x86.Mmask16, a x86.M256i) (dst x86.M128i)

M256MaskzCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm256_maskz_cvtepi16_epi8'. Requires AVX512BW.

func M256MaskzCvtepi8Epi16

func M256MaskzCvtepi8Epi16(k x86.Mmask16, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := SignExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm256_maskz_cvtepi8_epi16'. Requires AVX512BW.

func M256MaskzCvtepu8Epi16

func M256MaskzCvtepu8Epi16(k x86.Mmask16, a x86.M128i) (dst x86.M256i)

M256MaskzCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := ZeroExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm256_maskz_cvtepu8_epi16'. Requires AVX512BW.

func M256MaskzCvtsepi16Epi8

func M256MaskzCvtsepi16Epi8(k x86.Mmask16, a x86.M256i) (dst x86.M128i)

M256MaskzCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm256_maskz_cvtsepi16_epi8'. Requires AVX512BW.

func M256MaskzCvtusepi16Epi8

func M256MaskzCvtusepi16Epi8(k x86.Mmask16, a x86.M256i) (dst x86.M128i)

M256MaskzCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm256_maskz_cvtusepi16_epi8'. Requires AVX512BW.

func M256MaskzDbsadEpu8

func M256MaskzDbsadEpu8(k x86.Mmask16, a x86.M256i, b x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

FOR j := 0 to 1
	i := j*128
	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR

FOR j := 0 to 3
	i := j*64
	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm256_maskz_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskzMaddEpi16

func M256MaskzMaddEpi16(k x86.Mmask8, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm256_maskz_madd_epi16'. Requires AVX512BW.

func M256MaskzMaddubsEpi16

func M256MaskzMaddubsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm256_maskz_maddubs_epi16'. Requires AVX512BW.

func M256MaskzMaxEpi16

func M256MaskzMaxEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm256_maskz_max_epi16'. Requires AVX512BW.
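
The same select-the-larger pattern, as a scalar Go sketch (names and types invented for illustration); the min and unsigned variants below differ only in the comparison direction or the element type:

func maskzMaxEpi16(k uint16, a, b [16]int16) (dst [16]int16) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) != 0 {
			if a[j] > b[j] {
				dst[j] = a[j]
			} else {
				dst[j] = b[j]
			}
		} // zeromask: unselected elements stay 0
	}
	return
}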

func M256MaskzMaxEpi8

func M256MaskzMaxEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm256_maskz_max_epi8'. Requires AVX512BW.

func M256MaskzMaxEpu16

func M256MaskzMaxEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm256_maskz_max_epu16'. Requires AVX512BW.

func M256MaskzMaxEpu8

func M256MaskzMaxEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm256_maskz_max_epu8'. Requires AVX512BW.

func M256MaskzMinEpi16

func M256MaskzMinEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm256_maskz_min_epi16'. Requires AVX512BW.

func M256MaskzMinEpi8

func M256MaskzMinEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm256_maskz_min_epi8'. Requires AVX512BW.

func M256MaskzMinEpu16

func M256MaskzMinEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm256_maskz_min_epu16'. Requires AVX512BW.

func M256MaskzMinEpu8

func M256MaskzMinEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm256_maskz_min_epu8'. Requires AVX512BW.

func M256MaskzMovEpi16

func M256MaskzMovEpi16(k x86.Mmask16, a x86.M256i) (dst x86.M256i)

M256MaskzMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQU16'. Intrinsic: '_mm256_maskz_mov_epi16'. Requires AVX512BW.

func M256MaskzMovEpi8

func M256MaskzMovEpi8(k x86.Mmask32, a x86.M256i) (dst x86.M256i)

M256MaskzMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VMOVDQU8'. Intrinsic: '_mm256_maskz_mov_epi8'. Requires AVX512BW.

func M256MaskzMulhiEpi16

func M256MaskzMulhiEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm256_maskz_mulhi_epi16'. Requires AVX512BW.

func M256MaskzMulhiEpu16

func M256MaskzMulhiEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm256_maskz_mulhi_epu16'. Requires AVX512BW.

func M256MaskzMulhrsEpi16

func M256MaskzMulhrsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
		dst[i+15:i] := tmp[16:1]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm256_maskz_mulhrs_epi16'. Requires AVX512BW.
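
The rounding in VPMULHRSW is easy to misread; a one-element Go sketch (helper name invented) spells it out:

func mulhrsEpi16(a, b int16) int16 {
	// 32-bit product, keep the top 18 bits, round by adding 1,
	// then return bits [16:1] of the rounded value.
	tmp := (int32(a)*int32(b))>>14 + 1
	return int16(tmp >> 1)
}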

func M256MaskzMulloEpi16

func M256MaskzMulloEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm256_maskz_mullo_epi16'. Requires AVX512BW.

func M256MaskzPacksEpi16

func M256MaskzPacksEpi16(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm256_maskz_packs_epi16'. Requires AVX512BW.
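
A scalar Go sketch of the saturation and the per-128-bit-lane ordering (helper names invented for illustration):

func satInt16ToInt8(v int16) int8 {
	// Saturate_Int16_To_Int8 from the pseudocode above.
	if v > 127 {
		return 127
	}
	if v < -128 {
		return -128
	}
	return int8(v)
}

func packsEpi16Lane(a, b [8]int16) (dst [16]int8) {
	// One 128-bit lane: eight saturated bytes from 'a',
	// then eight saturated bytes from 'b'.
	for j := 0; j < 8; j++ {
		dst[j] = satInt16ToInt8(a[j])
		dst[8+j] = satInt16ToInt8(b[j])
	}
	return
}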

func M256MaskzPacksEpi32

func M256MaskzPacksEpi32(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm256_maskz_packs_epi32'. Requires AVX512BW.

func M256MaskzPackusEpi16

func M256MaskzPackusEpi16(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm256_maskz_packus_epi16'. Requires AVX512BW.

func M256MaskzPackusEpi32

func M256MaskzPackusEpi32(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm256_maskz_packus_epi32'. Requires AVX512BW.

func M256MaskzPermutex2varEpi16

func M256MaskzPermutex2varEpi16(k x86.Mmask16, a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		off := 16*idx[i+3:i]
		dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm256_maskz_permutex2var_epi16'. Requires AVX512BW.
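
In scalar terms, each idx element is a 5-bit selector: bits [3:0] pick an element and bit 4 picks the source table. A Go sketch (names invented for illustration):

func maskzPermutex2varEpi16(k uint16, a, idx, b [16]uint16) (dst [16]uint16) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) == 0 {
			continue // zeromask: element stays 0
		}
		off := idx[j] & 0xF   // bits [3:0]: element index
		if idx[j]&0x10 != 0 { // bit 4: table select
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return
}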

func M256MaskzPermutexvarEpi16

func M256MaskzPermutexvarEpi16(k x86.Mmask16, idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256MaskzPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	id := idx[i+3:i]*16
	IF k[j]
		dst[i+15:i] := a[id+15:id]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm256_maskz_permutexvar_epi16'. Requires AVX512BW.

func M256MaskzSet1Epi16

func M256MaskzSet1Epi16(k x86.Mmask16, a int16) (dst x86.M256i)

M256MaskzSet1Epi16: Broadcast 16-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm256_maskz_set1_epi16'. Requires AVX512BW.

func M256MaskzSet1Epi8

func M256MaskzSet1Epi8(k x86.Mmask32, a byte) (dst x86.M256i)

M256MaskzSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm256_maskz_set1_epi8'. Requires AVX512BW.

func M256MaskzShuffleEpi8

func M256MaskzShuffleEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		IF b[i+7] == 1
			dst[i+7:i] := 0
		ELSE
			index[3:0] := b[i+3:i]
			dst[i+7:i] := a[index*8+7:index*8]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm256_maskz_shuffle_epi8'. Requires AVX512BW.
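
A Go sketch of one 128-bit lane (the 256-bit form applies this independently per lane; names invented for illustration):

func maskzShuffleEpi8Lane(k uint16, a, b [16]uint8) (dst [16]uint8) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) == 0 || b[j]&0x80 != 0 {
			continue // clear mask bit or control bit 7 zeroes the byte
		}
		dst[j] = a[b[j]&0x0F] // bits [3:0] index within the lane
	}
	return
}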

func M256MaskzShufflehiEpi16

func M256MaskzShufflehiEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm256_maskz_shufflehi_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskzShuffleloEpi16

func M256MaskzShuffleloEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm256_maskz_shufflelo_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskzSllEpi16

func M256MaskzSllEpi16(k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm256_maskz_sll_epi16'. Requires AVX512BW.

func M256MaskzSlliEpi16

func M256MaskzSlliEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm256_maskz_slli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
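
Note that shift counts above 15 zero the element rather than being taken modulo 16; a scalar Go sketch (names invented for illustration):

func maskzSlliEpi16(k uint16, a [16]uint16, imm8 uint8) (dst [16]uint16) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) != 0 && imm8 <= 15 {
			dst[j] = a[j] << imm8
		} // out-of-range counts and clear mask bits yield 0
	}
	return
}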

func M256MaskzSllvEpi16

func M256MaskzSllvEpi16(k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm256_maskz_sllv_epi16'. Requires AVX512BW.

func M256MaskzSraEpi16

func M256MaskzSraEpi16(k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm256_maskz_sra_epi16'. Requires AVX512BW.
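
For the arithmetic shift, counts above 15 fill the element with copies of the sign bit; clamping the count to 15 reproduces that in a scalar Go sketch (names invented for illustration):

func maskzSraEpi16(k uint16, a [16]int16, count uint64) (dst [16]int16) {
	c := count
	if c > 15 {
		c = 15 // a>>15 leaves only replicated sign bits, i.e. SignBit
	}
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[j] >> c // Go sign-extends shifts of signed ints
		}
	}
	return
}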

func M256MaskzSraiEpi16

func M256MaskzSraiEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm256_maskz_srai_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskzSravEpi16

func M256MaskzSravEpi16(k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm256_maskz_srav_epi16'. Requires AVX512BW.

func M256MaskzSrlEpi16

func M256MaskzSrlEpi16(k x86.Mmask16, a x86.M256i, count x86.M128i) (dst x86.M256i)

M256MaskzSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm256_maskz_srl_epi16'. Requires AVX512BW.

func M256MaskzSrliEpi16

func M256MaskzSrliEpi16(k x86.Mmask16, a x86.M256i, imm8 byte) (dst x86.M256i)

M256MaskzSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm256_maskz_srli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M256MaskzSrlvEpi16

func M256MaskzSrlvEpi16(k x86.Mmask16, a x86.M256i, count x86.M256i) (dst x86.M256i)

M256MaskzSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm256_maskz_srlv_epi16'. Requires AVX512BW.

func M256MaskzSubEpi16

func M256MaskzSubEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] - b[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm256_maskz_sub_epi16'. Requires AVX512BW.

func M256MaskzSubEpi8

func M256MaskzSubEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] - b[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm256_maskz_sub_epi8'. Requires AVX512BW.

func M256MaskzSubsEpi16

func M256MaskzSubsEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm256_maskz_subs_epi16'. Requires AVX512BW.

func M256MaskzSubsEpi8

func M256MaskzSubsEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm256_maskz_subs_epi8'. Requires AVX512BW.

func M256MaskzSubsEpu16

func M256MaskzSubsEpu16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm256_maskz_subs_epu16'. Requires AVX512BW.

func M256MaskzSubsEpu8

func M256MaskzSubsEpu8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm256_maskz_subs_epu8'. Requires AVX512BW.
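
Unsigned saturating subtraction clamps at zero instead of wrapping; one element in Go (name invented for illustration):

func subsEpu8(a, b uint8) uint8 {
	if a < b {
		return 0 // Saturate_To_UnsignedInt8 of a negative difference
	}
	return a - b
}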

func M256MaskzUnpackhiEpi16

func M256MaskzUnpackhiEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm256_maskz_unpackhi_epi16'. Requires AVX512BW.

func M256MaskzUnpackhiEpi8

func M256MaskzUnpackhiEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm256_maskz_unpackhi_epi8'. Requires AVX512BW.

func M256MaskzUnpackloEpi16

func M256MaskzUnpackloEpi16(k x86.Mmask16, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm256_maskz_unpacklo_epi16'. Requires AVX512BW.

func M256MaskzUnpackloEpi8

func M256MaskzUnpackloEpi8(k x86.Mmask32, a x86.M256i, b x86.M256i) (dst x86.M256i)

M256MaskzUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm256_maskz_unpacklo_epi8'. Requires AVX512BW.

func M256Movepi16Mask

func M256Movepi16Mask(a x86.M256i) (dst x86.Mmask16)

M256Movepi16Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 16-bit integer in 'a'.

FOR j := 0 to 15
	i := j*16
	IF a[i+15]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPMOVW2M'. Intrinsic: '_mm256_movepi16_mask'. Requires AVX512BW.
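
A scalar Go sketch of gathering the sign bits into a mask (names invented for illustration):

func movepi16Mask(a [16]int16) (k uint16) {
	for j := 0; j < 16; j++ {
		if a[j] < 0 { // most significant bit of the element is set
			k |= 1 << uint(j)
		}
	}
	return
}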

func M256Movepi8Mask

func M256Movepi8Mask(a x86.M256i) (dst x86.Mmask32)

M256Movepi8Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 8-bit integer in 'a'.

FOR j := 0 to 31
	i := j*8
	IF a[i+7]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPMOVB2M'. Intrinsic: '_mm256_movepi8_mask'. Requires AVX512BW.

func M256MovmEpi16

func M256MovmEpi16(k x86.Mmask16) (dst x86.M256i)

M256MovmEpi16: Set each packed 16-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 15
	i := j*16
	IF k[j]
		dst[i+15:i] := 0xFFFF
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVM2W'. Intrinsic: '_mm256_movm_epi16'. Requires AVX512BW.

func M256MovmEpi8

func M256MovmEpi8(k x86.Mmask32) (dst x86.M256i)

M256MovmEpi8: Set each packed 8-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 31
	i := j*8
	IF k[j]
		dst[i+7:i] := 0xFF
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVM2B'. Intrinsic: '_mm256_movm_epi8'. Requires AVX512BW.

func M256Permutex2varEpi16

func M256Permutex2varEpi16(a x86.M256i, idx x86.M256i, b x86.M256i) (dst x86.M256i)

M256Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	off := 16*idx[i+3:i]
	dst[i+15:i] := idx[i+4] ? b[off+15:off] : a[off+15:off]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm256_permutex2var_epi16'. Requires AVX512BW.

func M256PermutexvarEpi16

func M256PermutexvarEpi16(idx x86.M256i, a x86.M256i) (dst x86.M256i)

M256PermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	id := idx[i+3:i]*16
	dst[i+15:i] := a[id+15:id]
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm256_permutexvar_epi16'. Requires AVX512BW.

func M256SllvEpi16

func M256SllvEpi16(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF count[i+15:i] < 16
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm256_sllv_epi16'. Requires AVX512BW.

func M256SravEpi16

func M256SravEpi16(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm256_srav_epi16'. Requires AVX512BW.

func M256SrlvEpi16

func M256SrlvEpi16(a x86.M256i, count x86.M256i) (dst x86.M256i)

M256SrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 15
	i := j*16
	IF count[i+15:i] < 16
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm256_srlv_epi16'. Requires AVX512BW.

func M256TestEpi16Mask

func M256TestEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256TestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 15
	i := j*16
	k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTMW'. Intrinsic: '_mm256_test_epi16_mask'. Requires AVX512BW.

func M256TestEpi8Mask

func M256TestEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256TestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 31
	i := j*8
	k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTMB'. Intrinsic: '_mm256_test_epi8_mask'. Requires AVX512BW.

func M256TestnEpi16Mask

func M256TestnEpi16Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask16)

M256TestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 15
	i := j*16
	k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMW'. Intrinsic: '_mm256_testn_epi16_mask'. Requires AVX512BW.

func M256TestnEpi8Mask

func M256TestnEpi8Mask(a x86.M256i, b x86.M256i) (dst x86.Mmask32)

M256TestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 31
	i := j*8
	k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTNMB'. Intrinsic: '_mm256_testn_epi8_mask'. Requires AVX512BW.

func M512AbsEpi16

func M512AbsEpi16(a x86.M512i) (dst x86.M512i)

M512AbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := ABS(a[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm512_abs_epi16'. Requires AVX512BW.

func M512AbsEpi8

func M512AbsEpi8(a x86.M512i) (dst x86.M512i)

M512AbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := ABS(a[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm512_abs_epi8'. Requires AVX512BW.

func M512AddEpi16

func M512AddEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := a[i+15:i] + b[i+15:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm512_add_epi16'. Requires AVX512BW.

func M512AddEpi8

func M512AddEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := a[i+7:i] + b[i+7:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm512_add_epi8'. Requires AVX512BW.

func M512AddsEpi16

func M512AddsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm512_adds_epi16'. Requires AVX512BW.

func M512AddsEpi8

func M512AddsEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm512_adds_epi8'. Requires AVX512BW.

func M512AddsEpu16

func M512AddsEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm512_adds_epu16'. Requires AVX512BW.

func M512AddsEpu8

func M512AddsEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm512_adds_epu8'. Requires AVX512BW.

func M512AlignrEpi8

func M512AlignrEpi8(a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512AlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst'.

FOR j := 0 to 3
	i := j*128
	tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
	dst[i+127:i] := tmp[127:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm512_alignr_epi8'. Requires AVX512BW.
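
A Go sketch of one 128-bit lane, with index 0 as the least significant byte (names invented for illustration; the 512-bit form repeats this for all four lanes):

func alignrLane(a, b [16]byte, count uint8) (dst [16]byte) {
	// Conceptually ((a << 128) | b) >> (count*8), keeping the low 16 bytes.
	var tmp [32]byte
	copy(tmp[:16], b[:]) // b is the low 128 bits
	copy(tmp[16:], a[:]) // a is the high 128 bits
	for j := 0; j < 16; j++ {
		if int(count)+j < 32 {
			dst[j] = tmp[int(count)+j]
		} // bytes shifted in past the top are zero
	}
	return
}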

func M512AvgEpu16

func M512AvgEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm512_avg_epu16'. Requires AVX512BW.

func M512AvgEpu8

func M512AvgEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512AvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm512_avg_epu8'. Requires AVX512BW.
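
The average rounds up on ties because of the +1; widening before the add avoids overflow in a scalar Go sketch (name invented for illustration), e.g. avgEpu8(1, 2) == 2:

func avgEpu8(a, b uint8) uint8 {
	return uint8((uint16(a) + uint16(b) + 1) >> 1)
}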

func M512BroadcastbEpi8

func M512BroadcastbEpi8(a x86.M128i) (dst x86.M512i)

M512BroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := a[7:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_broadcastb_epi8'. Requires AVX512BW.

func M512BroadcastwEpi16

func M512BroadcastwEpi16(a x86.M128i) (dst x86.M512i)

M512BroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := a[15:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_broadcastw_epi16'. Requires AVX512BW.

func M512BslliEpi128

func M512BslliEpi128(a x86.M512i, imm8 byte) (dst x86.M512i)

M512BslliEpi128: Shift 128-bit lanes in 'a' left by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.

tmp := imm8[7:0]
IF tmp > 15
	tmp := 16
FI
dst[127:0] := a[127:0] << (tmp*8)
dst[255:128] := a[255:128] << (tmp*8)
dst[383:256] := a[383:256] << (tmp*8)
dst[511:384] := a[511:384] << (tmp*8)
dst[MAX:512] := 0

Instruction: 'VPSLLDQ'. Intrinsic: '_mm512_bslli_epi128'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512BsrliEpi128

func M512BsrliEpi128(a x86.M512i, imm8 byte) (dst x86.M512i)

M512BsrliEpi128: Shift 128-bit lanes in 'a' right by 'imm8' bytes while shifting in zeros, and store the results in 'dst'.

tmp := imm8[7:0]
IF tmp > 15
	tmp := 16
FI
dst[127:0] := a[127:0] >> (tmp*8)
dst[255:128] := a[255:128] >> (tmp*8)
dst[383:256] := a[383:256] >> (tmp*8)
dst[511:384] := a[511:384] >> (tmp*8)
dst[MAX:512] := 0

Instruction: 'VPSRLDQ'. Intrinsic: '_mm512_bsrli_epi128'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512CmpEpi16Mask

func M512CmpEpi16Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)

M512CmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmp_epi16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
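
The eight imm8 predicates map onto ordinary comparisons; a one-element Go sketch (name invented for illustration):

func cmpEpi16(a, b int16, imm8 uint8) bool {
	switch imm8 & 7 {
	case 0:
		return a == b // _MM_CMPINT_EQ
	case 1:
		return a < b // _MM_CMPINT_LT
	case 2:
		return a <= b // _MM_CMPINT_LE
	case 3:
		return false // _MM_CMPINT_FALSE
	case 4:
		return a != b // _MM_CMPINT_NEQ
	case 5:
		return a >= b // _MM_CMPINT_NLT
	case 6:
		return a > b // _MM_CMPINT_NLE
	}
	return true // 7: _MM_CMPINT_TRUE
}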

func M512CmpEpi8Mask

func M512CmpEpi8Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)

M512CmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmp_epi8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512CmpEpu16Mask

func M512CmpEpu16Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)

M512CmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmp_epu16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512CmpEpu8Mask

func M512CmpEpu8Mask(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)

M512CmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k'.

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmp_epu8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512CmpeqEpi16Mask

func M512CmpeqEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpeq_epi16_mask'. Requires AVX512BW.

func M512CmpeqEpi8Mask

func M512CmpeqEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpeq_epi8_mask'. Requires AVX512BW.

func M512CmpeqEpu16Mask

func M512CmpeqEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpeq_epu16_mask'. Requires AVX512BW.

func M512CmpeqEpu8Mask

func M512CmpeqEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpeq_epu8_mask'. Requires AVX512BW.

func M512CmpgeEpi16Mask

func M512CmpgeEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpge_epi16_mask'. Requires AVX512BW.

func M512CmpgeEpi8Mask

func M512CmpgeEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpge_epi8_mask'. Requires AVX512BW.

func M512CmpgeEpu16Mask

func M512CmpgeEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpge_epu16_mask'. Requires AVX512BW.

func M512CmpgeEpu8Mask

func M512CmpgeEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpge_epu8_mask'. Requires AVX512BW.

func M512CmpgtEpi16Mask

func M512CmpgtEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpgt_epi16_mask'. Requires AVX512BW.

func M512CmpgtEpi8Mask

func M512CmpgtEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpgt_epi8_mask'. Requires AVX512BW.

func M512CmpgtEpu16Mask

func M512CmpgtEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpgt_epu16_mask'. Requires AVX512BW.

func M512CmpgtEpu8Mask

func M512CmpgtEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpgt_epu8_mask'. Requires AVX512BW.

func M512CmpleEpi16Mask

func M512CmpleEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmple_epi16_mask'. Requires AVX512BW.

func M512CmpleEpi8Mask

func M512CmpleEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmple_epi8_mask'. Requires AVX512BW.

func M512CmpleEpu16Mask

func M512CmpleEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmple_epu16_mask'. Requires AVX512BW.

func M512CmpleEpu8Mask

func M512CmpleEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmple_epu8_mask'. Requires AVX512BW.

func M512CmpltEpi16Mask

func M512CmpltEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmplt_epi16_mask'. Requires AVX512BW.

func M512CmpltEpi8Mask

func M512CmpltEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmplt_epi8_mask'. Requires AVX512BW.

func M512CmpltEpu16Mask

func M512CmpltEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmplt_epu16_mask'. Requires AVX512BW.

func M512CmpltEpu8Mask

func M512CmpltEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmplt_epu8_mask'. Requires AVX512BW.

func M512CmpneqEpi16Mask

func M512CmpneqEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_cmpneq_epi16_mask'. Requires AVX512BW.

func M512CmpneqEpi8Mask

func M512CmpneqEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_cmpneq_epi8_mask'. Requires AVX512BW.

func M512CmpneqEpu16Mask

func M512CmpneqEpu16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512CmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 31
	i := j*16
	k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_cmpneq_epu16_mask'. Requires AVX512BW.

func M512CmpneqEpu8Mask

func M512CmpneqEpu8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512CmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k'.

FOR j := 0 to 63
	i := j*8
	k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_cmpneq_epu8_mask'. Requires AVX512BW.
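
Each fixed-predicate compare above reduces to the same scalar pattern: compare one element pair per mask bit and leave the upper mask bits zero. A minimal pure-Go sketch of the '_mm512_cmpneq_epu8_mask' semantics (the helper name is illustrative, not part of this package):

func cmpneqEpu8Mask64(a, b [64]uint8) uint64 {
	var k uint64
	for j := 0; j < 64; j++ {
		if a[j] != b[j] {
			k |= 1 << uint(j) // k[j] := (a[j] != b[j]) ? 1 : 0
		}
	}
	return k
}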

func M512Cvtepi16Epi8

func M512Cvtepi16Epi8(a x86.M512i) (dst x86.M256i)

M512Cvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm512_cvtepi16_epi8'. Requires AVX512BW.

func M512Cvtepi8Epi16

func M512Cvtepi8Epi16(a x86.M256i) (dst x86.M512i)

M512Cvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	l := j*16
	dst[l+15:l] := SignExtend(a[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm512_cvtepi8_epi16'. Requires AVX512BW.

func M512Cvtepu8Epi16

func M512Cvtepu8Epi16(a x86.M256i) (dst x86.M512i)

M512Cvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*8
	l := j*16
	dst[l+15:l] := ZeroExtend(a[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm512_cvtepu8_epi16'. Requires AVX512BW.

func M512Cvtsepi16Epi8

func M512Cvtsepi16Epi8(a x86.M512i) (dst x86.M256i)

M512Cvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm512_cvtsepi16_epi8'. Requires AVX512BW.

func M512Cvtusepi16Epi8

func M512Cvtusepi16Epi8(a x86.M512i) (dst x86.M256i)

M512Cvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm512_cvtusepi16_epi8'. Requires AVX512BW.
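
The three 16-bit to 8-bit narrowings above ('VPMOVWB', 'VPMOVSWB', 'VPMOVUSWB') differ only in how out-of-range values are handled. A scalar sketch of the three modes, with illustrative helper names:

// Truncation keeps the low 8 bits; saturation clamps to the target range.
func truncInt16ToInt8(x int16) int8 { return int8(x) }

func satInt16ToInt8(x int16) int8 {
	if x > 127 {
		return 127
	}
	if x < -128 {
		return -128
	}
	return int8(x)
}

func satUint16ToUint8(x uint16) uint8 {
	if x > 255 {
		return 255
	}
	return uint8(x)
}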

func M512DbsadEpu8

func M512DbsadEpu8(a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512DbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst'.

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

FOR j := 0 to 3
	i := j*128
	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR

FOR j := 0 to 7
	i := j*64
	dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm512_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
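
All tmp offsets in the pseudocode stay within the current 64-bit lane, so each lane can be computed independently. A pure-Go sketch of one lane, assuming 'a' holds the lane's eight bytes of 'a' and 't' the corresponding eight bytes of the quadruplet-selected 'tmp' (helper names illustrative):

func dbsadLane(a, t [8]uint8) [4]uint16 {
	abs := func(x, y uint8) uint16 {
		if x > y {
			return uint16(x - y)
		}
		return uint16(y - x)
	}
	var d [4]uint16
	d[0] = abs(a[0], t[0]) + abs(a[1], t[1]) + abs(a[2], t[2]) + abs(a[3], t[3])
	d[1] = abs(a[0], t[1]) + abs(a[1], t[2]) + abs(a[2], t[3]) + abs(a[3], t[4])
	d[2] = abs(a[4], t[2]) + abs(a[5], t[3]) + abs(a[6], t[4]) + abs(a[7], t[5])
	d[3] = abs(a[4], t[3]) + abs(a[5], t[4]) + abs(a[6], t[5]) + abs(a[7], t[6])
	return d
}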

func M512Kunpackd

func M512Kunpackd(a x86.Mmask64, b x86.Mmask64) (dst x86.Mmask64)

M512Kunpackd: Unpack and interleave 32 bits from masks 'a' and 'b', and store the 64-bit result in 'k'.

k[31:0] := a[31:0]
k[63:32] := b[31:0]
k[MAX:64] := 0

Instruction: 'KUNPCKDQ'. Intrinsic: '_mm512_kunpackd'. Requires AVX512BW.

func M512Kunpackw

func M512Kunpackw(a x86.Mmask32, b x86.Mmask32) (dst x86.Mmask32)

M512Kunpackw: Unpack and interleave 16 bits from masks 'a' and 'b', and store the 32-bit result in 'k'.

k[15:0] := a[15:0]
k[31:16] := b[15:0]
k[MAX:32] := 0

Instruction: 'KUNPCKWD'. Intrinsic: '_mm512_kunpackw'. Requires AVX512BW.
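
Both unpacks are plain concatenation of the low halves of the two masks, which can be written directly on the mask words (a sketch, not package API):

func kunpackd(a, b uint64) uint64 {
	return a&0xFFFFFFFF | b<<32 // k[31:0] := a[31:0], k[63:32] := b[31:0]
}

func kunpackw(a, b uint32) uint32 {
	return a&0xFFFF | b<<16 // k[15:0] := a[15:0], k[31:16] := b[15:0]
}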

func M512MaddEpi16

func M512MaddEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst'.

FOR j := 0 to 15
	i := j*32
	dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm512_madd_epi16'. Requires AVX512BW.

func M512MaddubsEpi16

func M512MaddubsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaddubsEpi16: Vertically multiply each unsigned 8-bit integer from 'a' with the corresponding signed 8-bit integer from 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm512_maddubs_epi16'. Requires AVX512BW.
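
Per pair of elements, 'VPMADDWD' wraps (only a pair of -32768*-32768 products can overflow, yielding the int32 minimum), while 'VPMADDUBSW' saturates. A scalar sketch of one result element of each (helper names illustrative):

func maddPair(aLo, aHi, bLo, bHi int16) int32 {
	return int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo)
}

func maddubsPair(aLo, aHi uint8, bLo, bHi int8) int16 {
	s := int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo)
	if s > 32767 {
		return 32767
	}
	if s < -32768 {
		return -32768
	}
	return int16(s)
}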

func M512Mask2Permutex2varEpi16

func M512Mask2Permutex2varEpi16(a x86.M512i, idx x86.M512i, k x86.Mmask32, b x86.M512i) (dst x86.M512i)

M512Mask2Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		off := 16*idx[i+4:i]
		dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := idx[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2W'. Intrinsic: '_mm512_mask2_permutex2var_epi16'. Requires AVX512BW.
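
Reading the pseudocode: the low five bits of each index word select one of the 32 word positions, bit 5 selects the source table ('a' or 'b'), and lanes with a clear mask bit keep the index value, since this variant overwrites 'idx'. A scalar sketch (helper name illustrative):

func mask2Permutex2varEpi16(a, idx [32]uint16, k uint32, b [32]uint16) (dst [32]uint16) {
	for j := 0; j < 32; j++ {
		if k&(1<<uint(j)) == 0 {
			dst[j] = idx[j] // masked-off lanes keep the selector
			continue
		}
		off := idx[j] & 31  // idx[i+4:i]
		if idx[j]&32 != 0 { // idx[i+5] selects the table
			dst[j] = b[off]
		} else {
			dst[j] = a[off]
		}
	}
	return dst
}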

func M512MaskAbsEpi16

func M512MaskAbsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i) (dst x86.M512i)

M512MaskAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := ABS(a[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm512_mask_abs_epi16'. Requires AVX512BW.

func M512MaskAbsEpi8

func M512MaskAbsEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i) (dst x86.M512i)

M512MaskAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := ABS(a[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm512_mask_abs_epi8'. Requires AVX512BW.

func M512MaskAddEpi16

func M512MaskAddEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] + b[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm512_mask_add_epi16'. Requires AVX512BW.

func M512MaskAddEpi8

func M512MaskAddEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] + b[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm512_mask_add_epi8'. Requires AVX512BW.
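
Every '_mm512_mask_*' arithmetic variant in this package shares this merge pattern: lane j takes the computed value when bit j of 'k' is set and is otherwise copied from 'src'. A pure-Go sketch for the 16-bit add (illustrative only; Go's int16 addition wraps, like 'VPADDW'):

func maskAddEpi16(src [32]int16, k uint32, a, b [32]int16) (dst [32]int16) {
	for j := 0; j < 32; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[j] + b[j]
		} else {
			dst[j] = src[j]
		}
	}
	return dst
}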

func M512MaskAddsEpi16

func M512MaskAddsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm512_mask_adds_epi16'. Requires AVX512BW.

func M512MaskAddsEpi8

func M512MaskAddsEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm512_mask_adds_epi8'. Requires AVX512BW.

func M512MaskAddsEpu16

func M512MaskAddsEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm512_mask_adds_epu16'. Requires AVX512BW.

func M512MaskAddsEpu8

func M512MaskAddsEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm512_mask_adds_epu8'. Requires AVX512BW.
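
The saturating adds clamp the widened sum to the element range. Scalar sketches of the 8-bit signed and unsigned cases; the 16-bit forms are identical with wider types:

func addsInt8(a, b int8) int8 {
	s := int16(a) + int16(b)
	if s > 127 {
		return 127
	}
	if s < -128 {
		return -128
	}
	return int8(s)
}

func addsUint8(a, b uint8) uint8 {
	if s := uint16(a) + uint16(b); s <= 255 {
		return uint8(s)
	}
	return 255
}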

func M512MaskAlignrEpi8

func M512MaskAlignrEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*128
	tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
	tmp_dst[i+127:i] := tmp[127:0]
ENDFOR

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm512_mask_alignr_epi8'. Requires AVX512BW.
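
Ignoring the writemask, 'VPALIGNR' treats each 128-bit lane independently: concatenate the lane of 'a' above the lane of 'b' and take 16 bytes starting at byte 'count'. A byte-level sketch of one lane (helper name illustrative); counts of 32 or more produce zeros:

func alignrLane(a, b [16]uint8, count uint8) (dst [16]uint8) {
	var tmp [32]uint8
	copy(tmp[:16], b[:]) // b is the low half of the concatenation
	copy(tmp[16:], a[:]) // a is the high half
	for j := 0; j < 16; j++ {
		if int(count)+j < 32 {
			dst[j] = tmp[int(count)+j]
		}
	}
	return dst
}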

func M512MaskAvgEpu16

func M512MaskAvgEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm512_mask_avg_epu16'. Requires AVX512BW.

func M512MaskAvgEpu8

func M512MaskAvgEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm512_mask_avg_epu8'. Requires AVX512BW.
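
The average is a rounding one; widening before the add keeps the intermediate sum from overflowing. A scalar sketch of the byte case:

func avgEpu8(a, b uint8) uint8 {
	return uint8((uint16(a) + uint16(b) + 1) >> 1)
}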

func M512MaskBlendEpi16

func M512MaskBlendEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskBlendEpi16: Blend packed 16-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := b[i+15:i]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBLENDMW'. Intrinsic: '_mm512_mask_blend_epi16'. Requires AVX512BW.

func M512MaskBlendEpi8

func M512MaskBlendEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskBlendEpi8: Blend packed 8-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := b[i+7:i]
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBLENDMB'. Intrinsic: '_mm512_mask_blend_epi8'. Requires AVX512BW.
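
Blend takes no 'src' operand, so it is a plain two-input select on the mask. A sketch of the 8-bit case (helper name illustrative):

func blendEpi8(k uint64, a, b [64]int8) (dst [64]int8) {
	for j := 0; j < 64; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = b[j]
		} else {
			dst[j] = a[j]
		}
	}
	return dst
}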

func M512MaskBroadcastbEpi8

func M512MaskBroadcastbEpi8(src x86.M512i, k x86.Mmask64, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_mask_broadcastb_epi8'. Requires AVX512BW.

func M512MaskBroadcastwEpi16

func M512MaskBroadcastwEpi16(src x86.M512i, k x86.Mmask32, a x86.M128i) (dst x86.M512i)

M512MaskBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_mask_broadcastw_epi16'. Requires AVX512BW.

func M512MaskCmpEpi16Mask

func M512MaskCmpEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)

M512MaskCmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmp_epi16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpEpi8Mask

func M512MaskCmpEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)

M512MaskCmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmp_epi8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpEpu16Mask

func M512MaskCmpEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask32)

M512MaskCmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmp_epu16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpEpu8Mask

func M512MaskCmpEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.Mmask64)

M512MaskCmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmp_epu8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskCmpeqEpi16Mask

func M512MaskCmpeqEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpeq_epi16_mask'. Requires AVX512BW.

func M512MaskCmpeqEpi8Mask

func M512MaskCmpeqEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpeq_epi8_mask'. Requires AVX512BW.

func M512MaskCmpeqEpu16Mask

func M512MaskCmpeqEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpeq_epu16_mask'. Requires AVX512BW.

func M512MaskCmpeqEpu8Mask

func M512MaskCmpeqEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpeq_epu8_mask'. Requires AVX512BW.
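
Each masked compare in this group is simply the unmasked compare ANDed with 'k1': a result bit can be set only where 'k1' is set. A scalar sketch of the 16-bit equality form (helper name illustrative):

func maskCmpeqEpi16Mask(k1 uint32, a, b [32]int16) uint32 {
	var k uint32
	for j := 0; j < 32; j++ {
		if k1&(1<<uint(j)) != 0 && a[j] == b[j] {
			k |= 1 << uint(j)
		}
	}
	return k
}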

func M512MaskCmpgeEpi16Mask

func M512MaskCmpgeEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpge_epi16_mask'. Requires AVX512BW.

func M512MaskCmpgeEpi8Mask

func M512MaskCmpgeEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpge_epi8_mask'. Requires AVX512BW.

func M512MaskCmpgeEpu16Mask

func M512MaskCmpgeEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpge_epu16_mask'. Requires AVX512BW.

func M512MaskCmpgeEpu8Mask

func M512MaskCmpgeEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpge_epu8_mask'. Requires AVX512BW.

func M512MaskCmpgtEpi16Mask

func M512MaskCmpgtEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpgt_epi16_mask'. Requires AVX512BW.

func M512MaskCmpgtEpi8Mask

func M512MaskCmpgtEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpgt_epi8_mask'. Requires AVX512BW.

func M512MaskCmpgtEpu16Mask

func M512MaskCmpgtEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpgt_epu16_mask'. Requires AVX512BW.

func M512MaskCmpgtEpu8Mask

func M512MaskCmpgtEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpgt_epu8_mask'. Requires AVX512BW.

func M512MaskCmpleEpi16Mask

func M512MaskCmpleEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmple_epi16_mask'. Requires AVX512BW.

func M512MaskCmpleEpi8Mask

func M512MaskCmpleEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmple_epi8_mask'. Requires AVX512BW.

func M512MaskCmpleEpu16Mask

func M512MaskCmpleEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmple_epu16_mask'. Requires AVX512BW.

func M512MaskCmpleEpu8Mask

func M512MaskCmpleEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmple_epu8_mask'. Requires AVX512BW.

func M512MaskCmpltEpi16Mask

func M512MaskCmpltEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmplt_epi16_mask'. Requires AVX512BW.

func M512MaskCmpltEpi8Mask

func M512MaskCmpltEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmplt_epi8_mask'. Requires AVX512BW.

func M512MaskCmpltEpu16Mask

func M512MaskCmpltEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmplt_epu16_mask'. Requires AVX512BW.

func M512MaskCmpltEpu8Mask

func M512MaskCmpltEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmplt_epu8_mask'. Requires AVX512BW.

func M512MaskCmpneqEpi16Mask

func M512MaskCmpneqEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm512_mask_cmpneq_epi16_mask'. Requires AVX512BW.

func M512MaskCmpneqEpi8Mask

func M512MaskCmpneqEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm512_mask_cmpneq_epi8_mask'. Requires AVX512BW.

func M512MaskCmpneqEpu16Mask

func M512MaskCmpneqEpu16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskCmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm512_mask_cmpneq_epu16_mask'. Requires AVX512BW.

func M512MaskCmpneqEpu8Mask

func M512MaskCmpneqEpu8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskCmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm512_mask_cmpneq_epu8_mask'. Requires AVX512BW.

func M512MaskCvtepi16Epi8

func M512MaskCvtepi16Epi8(src x86.M256i, k x86.Mmask32, a x86.M512i) (dst x86.M256i)

M512MaskCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm512_mask_cvtepi16_epi8'. Requires AVX512BW.

func M512MaskCvtepi8Epi16

func M512MaskCvtepi8Epi16(src x86.M512i, k x86.Mmask32, a x86.M256i) (dst x86.M512i)

M512MaskCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := SignExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm512_mask_cvtepi8_epi16'. Requires AVX512BW.

func M512MaskCvtepu8Epi16

func M512MaskCvtepu8Epi16(src x86.M512i, k x86.Mmask32, a x86.M256i) (dst x86.M512i)

M512MaskCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := ZeroExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm512_mask_cvtepu8_epi16'. Requires AVX512BW.

func M512MaskCvtsepi16Epi8

func M512MaskCvtsepi16Epi8(src x86.M256i, k x86.Mmask32, a x86.M512i) (dst x86.M256i)

M512MaskCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm512_mask_cvtsepi16_epi8'. Requires AVX512BW.

func M512MaskCvtusepi16Epi8

func M512MaskCvtusepi16Epi8(src x86.M256i, k x86.Mmask32, a x86.M512i) (dst x86.M256i)

M512MaskCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm512_mask_cvtusepi16_epi8'. Requires AVX512BW.

func M512MaskDbsadEpu8

func M512MaskDbsadEpu8(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

FOR j := 0 to 3
	i := j*128
	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR

FOR j := 0 to 7
	i := j*64
	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm512_mask_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskMaddEpi16

func M512MaskMaddEpi16(src x86.M512i, k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm512_mask_madd_epi16'. Requires AVX512BW.

func M512MaskMaddubsEpi16

func M512MaskMaddubsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm512_mask_maddubs_epi16'. Requires AVX512BW.

func M512MaskMaxEpi16

func M512MaskMaxEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm512_mask_max_epi16'. Requires AVX512BW.

func M512MaskMaxEpi8

func M512MaskMaxEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm512_mask_max_epi8'. Requires AVX512BW.

func M512MaskMaxEpu16

func M512MaskMaxEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm512_mask_max_epu16'. Requires AVX512BW.

func M512MaskMaxEpu8

func M512MaskMaxEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm512_mask_max_epu8'. Requires AVX512BW.

func M512MaskMinEpi16

func M512MaskMinEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm512_mask_min_epi16'. Requires AVX512BW.

func M512MaskMinEpi8

func M512MaskMinEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm512_mask_min_epi8'. Requires AVX512BW.

func M512MaskMinEpu16

func M512MaskMinEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm512_mask_min_epu16'. Requires AVX512BW.

func M512MaskMinEpu8

func M512MaskMinEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm512_mask_min_epu8'. Requires AVX512BW.

func M512MaskMovEpi16

func M512MaskMovEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i) (dst x86.M512i)

M512MaskMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQU16'. Intrinsic: '_mm512_mask_mov_epi16'. Requires AVX512BW.

func M512MaskMovEpi8

func M512MaskMovEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i) (dst x86.M512i)

M512MaskMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQU8'. Intrinsic: '_mm512_mask_mov_epi8'. Requires AVX512BW.

func M512MaskMulhiEpi16

func M512MaskMulhiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm512_mask_mulhi_epi16'. Requires AVX512BW.

func M512MaskMulhiEpu16

func M512MaskMulhiEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm512_mask_mulhi_epu16'. Requires AVX512BW.

func M512MaskMulhrsEpi16

func M512MaskMulhrsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
		dst[i+15:i] := tmp[16:1]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm512_mask_mulhrs_epi16'. Requires AVX512BW.
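
One lane of 'VPMULHRSW' in scalar Go: scale the 32-bit product down by 14 bits, round by adding 1, then keep bits [16:1] (a sketch, helper name illustrative):

func mulhrsEpi16(a, b int16) int16 {
	tmp := (int32(a)*int32(b))>>14 + 1
	return int16(tmp >> 1) // bits [16:1] of tmp
}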

func M512MaskMulloEpi16

func M512MaskMulloEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm512_mask_mullo_epi16'. Requires AVX512BW.
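
The three 16-bit multiplies differ only in which half of the 32-bit product they keep; the low half is the same for signed and unsigned operands. Scalar sketches (helper names illustrative):

func mulhiEpi16(a, b int16) int16 {
	return int16((int32(a) * int32(b)) >> 16) // 'VPMULHW'
}

func mulhiEpu16(a, b uint16) uint16 {
	return uint16((uint32(a) * uint32(b)) >> 16) // 'VPMULHUW'
}

func mulloEpi16(a, b int16) int16 {
	return a * b // 'VPMULLW': low 16 bits, wrapping
}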

func M512MaskPacksEpi16

func M512MaskPacksEpi16(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm512_mask_packs_epi16'. Requires AVX512BW.
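
The long tmp_dst listing encodes a simple per-lane rule: each 128-bit lane packs eight saturated words of 'a' followed by eight saturated words of 'b', and lanes never mix. Ignoring the writemask, a scalar sketch (helper name illustrative):

func packsEpi16(a, b [32]int16) (dst [64]int8) {
	sat := func(x int16) int8 {
		if x > 127 {
			return 127
		}
		if x < -128 {
			return -128
		}
		return int8(x)
	}
	for lane := 0; lane < 4; lane++ {
		for j := 0; j < 8; j++ {
			dst[lane*16+j] = sat(a[lane*8+j])   // words of 'a' first
			dst[lane*16+8+j] = sat(b[lane*8+j]) // then words of 'b'
		}
	}
	return dst
}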

func M512MaskPacksEpi32

func M512MaskPacksEpi32(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm512_mask_packs_epi32'. Requires AVX512BW.

func M512MaskPackusEpi16

func M512MaskPackusEpi16(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm512_mask_packus_epi16'. Requires AVX512BW.

func M512MaskPackusEpi32

func M512MaskPackusEpi32(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm512_mask_packus_epi32'. Requires AVX512BW.

func M512MaskPermutex2varEpi16

func M512MaskPermutex2varEpi16(a x86.M512i, k x86.Mmask32, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		off := 16*idx[i+4:i]
		dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMT2W'. Intrinsic: '_mm512_mask_permutex2var_epi16'. Requires AVX512BW.
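
Since these stubs don't execute, a plain-Go scalar model may help clarify the selector: bits [4:0] of each 'idx' element choose a word, and bit 5 chooses between 'a' and 'b'. A minimal sketch (names are illustrative, not part of this package):

func permutex2var16(a [32]uint16, k uint32, idx, b [32]uint16) [32]uint16 {
	var dst [32]uint16
	for j := 0; j < 32; j++ {
		if k&(1<<uint(j)) == 0 {
			dst[j] = a[j] // writemask off: element kept from 'a'
			continue
		}
		sel := idx[j]
		if sel&0x20 != 0 {
			dst[j] = b[sel&0x1f] // bit 5 set: pull from 'b'
		} else {
			dst[j] = a[sel&0x1f] // bit 5 clear: pull from 'a'
		}
	}
	return dst
}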

func M512MaskPermutexvarEpi16

func M512MaskPermutexvarEpi16(src x86.M512i, k x86.Mmask32, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	id := idx[i+4:i]*16
	IF k[j]
		dst[i+15:i] := a[id+15:id]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm512_mask_permutexvar_epi16'. Requires AVX512BW.

func M512MaskSet1Epi16

func M512MaskSet1Epi16(src x86.M512i, k x86.Mmask32, a int16) (dst x86.M512i)

M512MaskSet1Epi16: Broadcast 16-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_mask_set1_epi16'. Requires AVX512BW.

func M512MaskSet1Epi8

func M512MaskSet1Epi8(src x86.M512i, k x86.Mmask64, a byte) (dst x86.M512i)

M512MaskSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_mask_set1_epi8'. Requires AVX512BW.

func M512MaskShuffleEpi8

func M512MaskShuffleEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskShuffleEpi8: Shuffle 8-bit integers in 'a' within 128-bit lanes using the control in the corresponding 8-bit element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF b[i+7] == 1
			dst[i+7:i] := 0
		ELSE
			index[3:0] := b[i+3:i]
			dst[i+7:i] := a[index*8+7:index*8]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm512_mask_shuffle_epi8'. Requires AVX512BW.
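
The control logic is per 128-bit lane. A small plain-Go model of one lane, with the writemask step omitted (illustrative only):

func shuffleBytesLane(a, b [16]byte) [16]byte {
	var dst [16]byte
	for j := 0; j < 16; j++ {
		if b[j]&0x80 != 0 {
			dst[j] = 0 // high bit of the control byte zeroes the element
		} else {
			dst[j] = a[b[j]&0x0f] // low 4 bits index within the same lane
		}
	}
	return dst
}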

func M512MaskShufflehiEpi16

func M512MaskShufflehiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
tmp_dst[319:256] := a[319:256]
tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320]
tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320]
tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320]
tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320]
tmp_dst[447:384] := a[447:384]
tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448]
tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448]
tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448]
tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448]

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm512_mask_shufflehi_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
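
Per 128-bit lane, the low four words pass through unchanged and each high word is picked by a 2-bit field of 'imm8'. A plain-Go sketch of one lane, ignoring the writemask (illustrative only):

func shufflehiLane(a [8]uint16, imm8 byte) [8]uint16 {
	dst := a // words 0-3 are copied unchanged
	for j := 0; j < 4; j++ {
		sel := (imm8 >> (2 * uint(j))) & 3
		dst[4+j] = a[4+int(sel)] // each high word selects from the high half
	}
	return dst
}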

func M512MaskShuffleloEpi16

func M512MaskShuffleloEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256]
tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256]
tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256]
tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256]
tmp_dst[383:320] := a[383:320]
tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384]
tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384]
tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384]
tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384]
tmp_dst[511:448] := a[511:448]

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm512_mask_shufflelo_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskSllEpi16

func M512MaskSllEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm512_mask_sll_epi16'. Requires AVX512BW.
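
All elements share the single 64-bit shift count in the low quadword of 'count'; counts above 15 clear the element rather than wrapping. A scalar sketch (illustrative only):

func sllEpi16(src [32]uint16, k uint32, a [32]uint16, count uint64) [32]uint16 {
	var dst [32]uint16
	for j := 0; j < 32; j++ {
		switch {
		case k&(1<<uint(j)) == 0:
			dst[j] = src[j] // masked off: copy from 'src'
		case count > 15:
			dst[j] = 0 // the whole element shifts out
		default:
			dst[j] = a[j] << count
		}
	}
	return dst
}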

func M512MaskSlliEpi16

func M512MaskSlliEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm512_mask_slli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskSllvEpi16

func M512MaskSllvEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm512_mask_sllv_epi16'. Requires AVX512BW.

func M512MaskSraEpi16

func M512MaskSraEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm512_mask_sra_epi16'. Requires AVX512BW.

func M512MaskSraiEpi16

func M512MaskSraiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm512_mask_srai_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
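
Unlike the logical shifts, an out-of-range count fills the element with copies of the sign bit instead of zeros. One element in plain Go (illustrative only):

func sraiEpi16(a int16, imm8 byte) int16 {
	if imm8 > 15 {
		return a >> 15 // every bit becomes the sign bit: 0 or -1
	}
	return a >> imm8
}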

func M512MaskSravEpi16

func M512MaskSravEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm512_mask_srav_epi16'. Requires AVX512BW.

func M512MaskSrlEpi16

func M512MaskSrlEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm512_mask_srl_epi16'. Requires AVX512BW.

func M512MaskSrliEpi16

func M512MaskSrliEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm512_mask_srli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskSrlvEpi16

func M512MaskSrlvEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm512_mask_srlv_epi16'. Requires AVX512BW.

func M512MaskSubEpi16

func M512MaskSubEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] - b[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm512_mask_sub_epi16'. Requires AVX512BW.

func M512MaskSubEpi8

func M512MaskSubEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] - b[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm512_mask_sub_epi8'. Requires AVX512BW.

func M512MaskSubsEpi16

func M512MaskSubsEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm512_mask_subs_epi16'. Requires AVX512BW.

func M512MaskSubsEpi8

func M512MaskSubsEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm512_mask_subs_epi8'. Requires AVX512BW.

func M512MaskSubsEpu16

func M512MaskSubsEpu16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm512_mask_subs_epu16'. Requires AVX512BW.

func M512MaskSubsEpu8

func M512MaskSubsEpu8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm512_mask_subs_epu8'. Requires AVX512BW.
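
The unsigned saturating subtract clamps at zero instead of wrapping. One element in plain Go (illustrative only):

func subsEpu8(a, b uint8) uint8 {
	if a < b {
		return 0 // would underflow: clamp to zero
	}
	return a - b
}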

func M512MaskTestEpi16Mask

func M512MaskTestEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskTestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTMW'. Intrinsic: '_mm512_mask_test_epi16_mask'. Requires AVX512BW.
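
The result mask is an AND-is-nonzero test, further gated by 'k1'. A plain-Go sketch (illustrative only):

func testEpi16Mask(k1 uint32, a, b [32]uint16) uint32 {
	var k uint32
	for j := 0; j < 32; j++ {
		if k1&(1<<uint(j)) != 0 && a[j]&b[j] != 0 {
			k |= 1 << uint(j) // non-zero intersection under the writemask
		}
	}
	return k
}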

func M512MaskTestEpi8Mask

func M512MaskTestEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskTestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is non-zero.

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPTESTMB'. Intrinsic: '_mm512_mask_test_epi8_mask'. Requires AVX512BW.

func M512MaskTestnEpi16Mask

func M512MaskTestnEpi16Mask(k1 x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512MaskTestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 31
	i := j*16
	IF k1[j]
		k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTNMW'. Intrinsic: '_mm512_mask_testn_epi16_mask'. Requires AVX512BW.

func M512MaskTestnEpi8Mask

func M512MaskTestnEpi8Mask(k1 x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512MaskTestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k1') if the intermediate value is zero.

FOR j := 0 to 63
	i := j*8
	IF k1[j]
		k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPTESTNMB'. Intrinsic: '_mm512_mask_testn_epi8_mask'. Requires AVX512BW.

func M512MaskUnpackhiEpi16

func M512MaskUnpackhiEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm512_mask_unpackhi_epi16'. Requires AVX512BW.
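
The INTERLEAVE_HIGH_WORDS helper above reduces to alternating the top four words of each source. One 128-bit lane in plain Go (illustrative only):

func interleaveHighWords(src1, src2 [8]uint16) [8]uint16 {
	var dst [8]uint16
	for j := 0; j < 4; j++ {
		dst[2*j] = src1[4+j]   // even slots from the high half of src1
		dst[2*j+1] = src2[4+j] // odd slots from the high half of src2
	}
	return dst
}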

func M512MaskUnpackhiEpi8

func M512MaskUnpackhiEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm512_mask_unpackhi_epi8'. Requires AVX512BW.

func M512MaskUnpackloEpi16

func M512MaskUnpackloEpi16(src x86.M512i, k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm512_mask_unpacklo_epi16'. Requires AVX512BW.

func M512MaskUnpackloEpi8

func M512MaskUnpackloEpi8(src x86.M512i, k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm512_mask_unpacklo_epi8'. Requires AVX512BW.

func M512MaskzAbsEpi16

func M512MaskzAbsEpi16(k x86.Mmask32, a x86.M512i) (dst x86.M512i)

M512MaskzAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := ABS(a[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm512_maskz_abs_epi16'. Requires AVX512BW.
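
Note the zeromask difference from the '_mask_' variants: unselected elements become zero rather than being copied from a source. A plain-Go sketch (illustrative only):

func maskzAbsEpi16(k uint32, a [32]int16) [32]int16 {
	var dst [32]int16 // zeromask: elements default to zero
	for j := 0; j < 32; j++ {
		if k&(1<<uint(j)) != 0 {
			v := a[j]
			if v < 0 {
				v = -v // -(-32768) wraps back to -32768, as with VPABSW
			}
			dst[j] = v
		}
	}
	return dst
}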

func M512MaskzAbsEpi8

func M512MaskzAbsEpi8(k x86.Mmask64, a x86.M512i) (dst x86.M512i)

M512MaskzAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := ABS(a[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm512_maskz_abs_epi8'. Requires AVX512BW.

func M512MaskzAddEpi16

func M512MaskzAddEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] + b[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm512_maskz_add_epi16'. Requires AVX512BW.

func M512MaskzAddEpi8

func M512MaskzAddEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] + b[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm512_maskz_add_epi8'. Requires AVX512BW.

func M512MaskzAddsEpi16

func M512MaskzAddsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm512_maskz_adds_epi16'. Requires AVX512BW.

func M512MaskzAddsEpi8

func M512MaskzAddsEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm512_maskz_adds_epi8'. Requires AVX512BW.

func M512MaskzAddsEpu16

func M512MaskzAddsEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm512_maskz_adds_epu16'. Requires AVX512BW.

func M512MaskzAddsEpu8

func M512MaskzAddsEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm512_maskz_adds_epu8'. Requires AVX512BW.

func M512MaskzAlignrEpi8

func M512MaskzAlignrEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i, count int) (dst x86.M512i)

M512MaskzAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*128
	tmp[255:0] := ((a[i+127:i] << 128) OR b[i+127:i]) >> (count[7:0]*8)
	tmp_dst[i+127:i] := tmp[127:0]
ENDFOR

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm512_maskz_alignr_epi8'. Requires AVX512BW.
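
Per 128-bit lane this is a byte-granular funnel shift: 'b' supplies the low 16 bytes of the temporary and 'a' the high 16. One lane in plain Go, zeromask omitted (illustrative only):

func alignrLane(a, b [16]byte, count int) [16]byte {
	var tmp [32]byte
	copy(tmp[:16], b[:]) // b is the low half of the 32-byte temporary
	copy(tmp[16:], a[:]) // a is the high half
	var dst [16]byte
	for j := 0; j < 16; j++ {
		if count+j < 32 {
			dst[j] = tmp[count+j] // shifts of 32+ bytes leave zeros
		}
	}
	return dst
}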

func M512MaskzAvgEpu16

func M512MaskzAvgEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm512_maskz_avg_epu16'. Requires AVX512BW.
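
The +1 makes this a round-half-up average, computed in wider arithmetic so the sum cannot overflow. One element in plain Go (illustrative only):

func avgEpu16(a, b uint16) uint16 {
	return uint16((uint32(a) + uint32(b) + 1) >> 1)
}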

func M512MaskzAvgEpu8

func M512MaskzAvgEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm512_maskz_avg_epu8'. Requires AVX512BW.

func M512MaskzBroadcastbEpi8

func M512MaskzBroadcastbEpi8(k x86.Mmask64, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_maskz_broadcastb_epi8'. Requires AVX512BW.

func M512MaskzBroadcastwEpi16

func M512MaskzBroadcastwEpi16(k x86.Mmask32, a x86.M128i) (dst x86.M512i)

M512MaskzBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_maskz_broadcastw_epi16'. Requires AVX512BW.

func M512MaskzCvtepi16Epi8

func M512MaskzCvtepi16Epi8(k x86.Mmask32, a x86.M512i) (dst x86.M256i)

M512MaskzCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm512_maskz_cvtepi16_epi8'. Requires AVX512BW.

func M512MaskzCvtepi8Epi16

func M512MaskzCvtepi8Epi16(k x86.Mmask32, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := SignExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm512_maskz_cvtepi8_epi16'. Requires AVX512BW.

func M512MaskzCvtepu8Epi16

func M512MaskzCvtepu8Epi16(k x86.Mmask32, a x86.M256i) (dst x86.M512i)

M512MaskzCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := ZeroExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm512_maskz_cvtepu8_epi16'. Requires AVX512BW.

func M512MaskzCvtsepi16Epi8

func M512MaskzCvtsepi16Epi8(k x86.Mmask32, a x86.M512i) (dst x86.M256i)

M512MaskzCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm512_maskz_cvtsepi16_epi8'. Requires AVX512BW.
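
Signed saturation clamps out-of-range words to the int8 limits rather than truncating bits as VPMOVWB does. The scalar step in plain Go (illustrative only):

func saturateInt16ToInt8(v int16) int8 {
	switch {
	case v > 127:
		return 127 // clamp to int8 max
	case v < -128:
		return -128 // clamp to int8 min
	default:
		return int8(v)
	}
}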

func M512MaskzCvtusepi16Epi8

func M512MaskzCvtusepi16Epi8(k x86.Mmask32, a x86.M512i) (dst x86.M256i)

M512MaskzCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt16_To_UnsignedInt8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:256] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm512_maskz_cvtusepi16_epi8'. Requires AVX512BW.

func M512MaskzDbsadEpu8

func M512MaskzDbsadEpu8(k x86.Mmask32, a x86.M512i, b x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected from within 128-bit lanes according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

FOR j := 0 to 3
	i := j*128
	tmp[i+31:i] := select(b[i+127:i], imm8[1:0])
	tmp[i+63:i+32] := select(b[i+127:i], imm8[3:2])
	tmp[i+95:i+64] := select(b[i+127:i], imm8[5:4])
	tmp[i+127:i+96] := select(b[i+127:i], imm8[7:6])
ENDFOR

FOR j := 0 to 7
	i := j*64
	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm512_maskz_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
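
A plain-Go sketch of one 64-bit lane of the SAD step, assuming 'a8' holds that lane's eight bytes of 'a' and 'tmp' holds the corresponding eight bytes already produced from 'b' by the select() step above (illustrative only):

func dbsadLane(a8, tmp [8]uint8) [4]uint16 {
	abs := func(x, y uint8) uint16 {
		if x > y {
			return uint16(x - y)
		}
		return uint16(y - x)
	}
	var d [4]uint16
	for t := 0; t < 4; t++ {
		d[0] += abs(a8[t], tmp[t])     // low quadruplet vs offset 0
		d[1] += abs(a8[t], tmp[t+1])   // low quadruplet vs offset 1
		d[2] += abs(a8[4+t], tmp[2+t]) // high quadruplet vs offset 2
		d[3] += abs(a8[4+t], tmp[3+t]) // high quadruplet vs offset 3
	}
	return d
}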

func M512MaskzMaddEpi16

func M512MaskzMaddEpi16(k x86.Mmask16, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm512_maskz_madd_epi16'. Requires AVX512BW.

func M512MaskzMaddubsEpi16

func M512MaskzMaddubsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm512_maskz_maddubs_epi16'. Requires AVX512BW.
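
Each result word is an unsigned-by-signed dot product of a byte pair, saturated to int16. One element pair in plain Go (illustrative only):

func maddubsEpi16(a0, a1 uint8, b0, b1 int8) int16 {
	s := int32(a1)*int32(b1) + int32(a0)*int32(b0)
	switch {
	case s > 32767:
		return 32767
	case s < -32768:
		return -32768
	default:
		return int16(s)
	}
}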

func M512MaskzMaxEpi16

func M512MaskzMaxEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm512_maskz_max_epi16'. Requires AVX512BW.

func M512MaskzMaxEpi8

func M512MaskzMaxEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm512_maskz_max_epi8'. Requires AVX512BW.

func M512MaskzMaxEpu16

func M512MaskzMaxEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm512_maskz_max_epu16'. Requires AVX512BW.

func M512MaskzMaxEpu8

func M512MaskzMaxEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm512_maskz_max_epu8'. Requires AVX512BW.

func M512MaskzMinEpi16

func M512MaskzMinEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm512_maskz_min_epi16'. Requires AVX512BW.

func M512MaskzMinEpi8

func M512MaskzMinEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm512_maskz_min_epi8'. Requires AVX512BW.

func M512MaskzMinEpu16

func M512MaskzMinEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm512_maskz_min_epu16'. Requires AVX512BW.

func M512MaskzMinEpu8

func M512MaskzMinEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm512_maskz_min_epu8'. Requires AVX512BW.

func M512MaskzMovEpi16

func M512MaskzMovEpi16(k x86.Mmask32, a x86.M512i) (dst x86.M512i)

M512MaskzMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQU16'. Intrinsic: '_mm512_maskz_mov_epi16'. Requires AVX512BW.

func M512MaskzMovEpi8

func M512MaskzMovEpi8(k x86.Mmask64, a x86.M512i) (dst x86.M512i)

M512MaskzMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VMOVDQU8'. Intrinsic: '_mm512_maskz_mov_epi8'. Requires AVX512BW.

func M512MaskzMulhiEpi16

func M512MaskzMulhiEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm512_maskz_mulhi_epi16'. Requires AVX512BW.
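
The high half of the widened product; for this signed form the multiply must be done in int32 before taking the top 16 bits. One element in plain Go (illustrative only):

func mulhiEpi16(a, b int16) int16 {
	return int16((int32(a) * int32(b)) >> 16)
}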

func M512MaskzMulhiEpu16

func M512MaskzMulhiEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm512_maskz_mulhi_epu16'. Requires AVX512BW.

func M512MaskzMulhrsEpi16

func M512MaskzMulhrsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
		dst[i+15:i] := tmp[16:1]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm512_maskz_mulhrs_epi16'. Requires AVX512BW.
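
The rounding step adds 1 after discarding the low 14 bits of the product, then bits [16:1] are kept. One element in plain Go (illustrative only):

func mulhrsEpi16(a, b int16) int16 {
	tmp := (int32(a)*int32(b))>>14 + 1
	return int16(tmp >> 1) // bits [16:1] of the rounded product
}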

func M512MaskzMulloEpi16

func M512MaskzMulloEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm512_maskz_mullo_epi16'. Requires AVX512BW.

func M512MaskzPacksEpi16

func M512MaskzPacksEpi16(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm512_maskz_packs_epi16'. Requires AVX512BW.

func M512MaskzPacksEpi32

func M512MaskzPacksEpi32(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm512_maskz_packs_epi32'. Requires AVX512BW.

func M512MaskzPackusEpi16

func M512MaskzPackusEpi16(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
tmp_dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
tmp_dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
tmp_dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
tmp_dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
tmp_dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
tmp_dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
tmp_dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
tmp_dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
tmp_dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
tmp_dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
tmp_dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
tmp_dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
tmp_dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
tmp_dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
tmp_dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
tmp_dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
tmp_dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
tmp_dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
tmp_dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
tmp_dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
tmp_dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
tmp_dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
tmp_dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
tmp_dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
tmp_dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
tmp_dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
tmp_dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
tmp_dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
tmp_dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
tmp_dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
tmp_dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
tmp_dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
tmp_dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
tmp_dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
tmp_dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
tmp_dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
tmp_dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
tmp_dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
tmp_dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
tmp_dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
tmp_dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
tmp_dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
tmp_dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
tmp_dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
tmp_dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
tmp_dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
tmp_dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
tmp_dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm512_maskz_packus_epi16'. Requires AVX512BW.
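
The stubs above don't execute, but the pack's saturation step is easy to model in plain Go. A minimal sketch of the Saturate_Int16_To_UnsignedInt8 step plus the zeromask selection (these helper names are illustrative, not part of this package):

package main

import "fmt"

// saturateInt16ToUint8 clamps a signed 16-bit value into [0, 255],
// mirroring the Saturate_Int16_To_UnsignedInt8 step in the pseudocode.
func saturateInt16ToUint8(v int16) uint8 {
	if v < 0 {
		return 0
	}
	if v > 255 {
		return 255
	}
	return uint8(v)
}

// applyZeromask keeps element j only when bit j of k is set, as the
// final FOR loop of the pseudocode does.
func applyZeromask(tmp []uint8, k uint64) []uint8 {
	dst := make([]uint8, len(tmp))
	for j := range tmp {
		if k&(1<<uint(j)) != 0 {
			dst[j] = tmp[j]
		}
	}
	return dst
}

func main() {
	fmt.Println(saturateInt16ToUint8(-7), saturateInt16ToUint8(300)) // 0 255
	fmt.Println(applyZeromask([]uint8{1, 2, 3}, 0b101))              // [1 0 3]
}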

func M512MaskzPackusEpi32

func M512MaskzPackusEpi32(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
tmp_dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
tmp_dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
tmp_dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
tmp_dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
tmp_dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
tmp_dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
tmp_dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
tmp_dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
tmp_dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
tmp_dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
tmp_dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
tmp_dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
tmp_dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
tmp_dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
tmp_dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
tmp_dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
tmp_dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
tmp_dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
tmp_dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
tmp_dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
tmp_dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
tmp_dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
tmp_dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
tmp_dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm512_maskz_packus_epi32'. Requires AVX512BW.

func M512MaskzPermutex2varEpi16

func M512MaskzPermutex2varEpi16(k x86.Mmask32, a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		off := 16*idx[i+4:i]
		dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm512_maskz_permutex2var_epi16'. Requires AVX512BW.
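
The selector layout is worth spelling out: with 32 word elements, the low five bits of each 'idx' element pick a source element and bit 5 picks between 'a' and 'b'. A pure-Go sketch under that reading (the function name is made up for illustration):

package main

import "fmt"

// permutex2var16 models the two-source word permute on 32-element
// vectors: the low 5 bits of each index select an element, bit 5
// selects between a (clear) and b (set), and k zeroes masked-off
// elements.
func permutex2var16(k uint32, a, idx, b [32]uint16) (dst [32]uint16) {
	for j := 0; j < 32; j++ {
		if k&(1<<uint(j)) == 0 {
			continue // zeromask: element stays zero
		}
		sel := idx[j]
		if sel&0x20 != 0 {
			dst[j] = b[sel&0x1f]
		} else {
			dst[j] = a[sel&0x1f]
		}
	}
	return
}

func main() {
	var a, b, idx [32]uint16
	for i := range a {
		a[i], b[i] = uint16(i), uint16(100+i)
	}
	idx[0] = 0x20 | 3                            // bit 5 set: take b[3]
	fmt.Println(permutex2var16(1, a, idx, b)[0]) // 103
}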

func M512MaskzPermutexvarEpi16

func M512MaskzPermutexvarEpi16(k x86.Mmask32, idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512MaskzPermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	id := idx[i+4:i]*16
	IF k[j]
		dst[i+15:i] := a[id+15:id]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm512_maskz_permutexvar_epi16'. Requires AVX512BW.

func M512MaskzSet1Epi16

func M512MaskzSet1Epi16(k x86.Mmask32, a int16) (dst x86.M512i)

M512MaskzSet1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm512_maskz_set1_epi16'. Requires AVX512BW.

func M512MaskzSet1Epi8

func M512MaskzSet1Epi8(k x86.Mmask64, a byte) (dst x86.M512i)

M512MaskzSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm512_maskz_set1_epi8'. Requires AVX512BW.

func M512MaskzShuffleEpi8

func M512MaskzShuffleEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		IF b[i+7] == 1
			dst[i+7:i] := 0
		ELSE
			index[3:0] := b[i+3:i]
			dst[i+7:i] := a[index*8+7:index*8]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm512_maskz_shuffle_epi8'. Requires AVX512BW.
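
VPSHUFB operates independently within each 16-byte lane: a set top bit in the control byte zeroes the result byte, otherwise the low four bits index into the lane. A one-lane sketch in plain Go (illustrative only):

package main

import "fmt"

// shuffleBytes models the shuffle within one 16-byte lane: a set top
// bit in the control byte zeroes the destination byte, otherwise the
// low four bits index into the lane.
func shuffleBytes(a, b [16]byte) (dst [16]byte) {
	for j := 0; j < 16; j++ {
		if b[j]&0x80 != 0 {
			dst[j] = 0
		} else {
			dst[j] = a[b[j]&0x0f]
		}
	}
	return
}

func main() {
	a := [16]byte{0: 'A', 1: 'B', 2: 'C'}
	ctl := [16]byte{0: 2, 1: 1, 2: 0, 3: 0x80}
	out := shuffleBytes(a, ctl)
	fmt.Printf("%q\n", out[:4]) // "CBA\x00"
}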

func M512MaskzShufflehiEpi16

func M512MaskzShufflehiEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
tmp_dst[191:128] := a[191:128]
tmp_dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
tmp_dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
tmp_dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
tmp_dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
tmp_dst[319:256] := a[319:256]
tmp_dst[335:320] := (a >> (imm8[1:0] * 16))[335:320]
tmp_dst[351:336] := (a >> (imm8[3:2] * 16))[335:320]
tmp_dst[367:352] := (a >> (imm8[5:4] * 16))[335:320]
tmp_dst[383:368] := (a >> (imm8[7:6] * 16))[335:320]
tmp_dst[447:384] := a[447:384]
tmp_dst[463:448] := (a >> (imm8[1:0] * 16))[463:448]
tmp_dst[479:464] := (a >> (imm8[3:2] * 16))[463:448]
tmp_dst[495:480] := (a >> (imm8[5:4] * 16))[463:448]
tmp_dst[511:496] := (a >> (imm8[7:6] * 16))[463:448]

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm512_maskz_shufflehi_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskzShuffleloEpi16

func M512MaskzShuffleloEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]
tmp_dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
tmp_dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
tmp_dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
tmp_dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
tmp_dst[255:192] := a[255:192]
tmp_dst[271:256] := (a >> (imm8[1:0] * 16))[271:256]
tmp_dst[287:272] := (a >> (imm8[3:2] * 16))[271:256]
tmp_dst[303:288] := (a >> (imm8[5:4] * 16))[271:256]
tmp_dst[319:304] := (a >> (imm8[7:6] * 16))[271:256]
tmp_dst[383:320] := a[383:320]
tmp_dst[399:384] := (a >> (imm8[1:0] * 16))[399:384]
tmp_dst[415:400] := (a >> (imm8[3:2] * 16))[399:384]
tmp_dst[431:416] := (a >> (imm8[5:4] * 16))[399:384]
tmp_dst[447:432] := (a >> (imm8[7:6] * 16))[399:384]
tmp_dst[511:448] := a[511:448]

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm512_maskz_shufflelo_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskzSllEpi16

func M512MaskzSllEpi16(k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm512_maskz_sll_epi16'. Requires AVX512BW.
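
Note that a scalar count above 15 zeroes every element rather than being reduced modulo 16; the explicit guard in the pseudocode is the whole point. A plain-Go model of that behavior (names are illustrative):

package main

import "fmt"

// sllWords models the scalar-count left shift: a count above 15
// clears every word, matching the pseudocode's count[63:0] > 15 test.
func sllWords(a []uint16, count uint64) []uint16 {
	dst := make([]uint16, len(a))
	if count > 15 {
		return dst // all elements zero
	}
	for j, v := range a {
		dst[j] = v << count
	}
	return dst
}

func main() {
	fmt.Println(sllWords([]uint16{1, 0x8000}, 1))  // [2 0]
	fmt.Println(sllWords([]uint16{1, 0x8000}, 16)) // [0 0]
}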

func M512MaskzSlliEpi16

func M512MaskzSlliEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm512_maskz_slli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskzSllvEpi16

func M512MaskzSllvEpi16(k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm512_maskz_sllv_epi16'. Requires AVX512BW.

func M512MaskzSraEpi16

func M512MaskzSraEpi16(k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm512_maskz_sra_epi16'. Requires AVX512BW.
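
The arithmetic shift differs from the logical one in its out-of-range case: counts above 15 fill each element with copies of its sign bit, which is the same as shifting by exactly 15. A small Go sketch (illustrative, not package API):

package main

import "fmt"

// sraWords models the scalar-count arithmetic right shift: counts
// above 15 are equivalent to shifting by 15, leaving only sign bits.
func sraWords(a []int16, count uint64) []int16 {
	if count > 15 {
		count = 15 // every bit becomes a copy of the sign bit
	}
	dst := make([]int16, len(a))
	for j, v := range a {
		dst[j] = v >> count
	}
	return dst
}

func main() {
	fmt.Println(sraWords([]int16{-2, 4}, 1))  // [-1 2]
	fmt.Println(sraWords([]int16{-2, 4}, 99)) // [-1 0]
}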

func M512MaskzSraiEpi16

func M512MaskzSraiEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm512_maskz_srai_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskzSravEpi16

func M512MaskzSravEpi16(k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm512_maskz_srav_epi16'. Requires AVX512BW.

func M512MaskzSrlEpi16

func M512MaskzSrlEpi16(k x86.Mmask32, a x86.M512i, count x86.M128i) (dst x86.M512i)

M512MaskzSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm512_maskz_srl_epi16'. Requires AVX512BW.

func M512MaskzSrliEpi16

func M512MaskzSrliEpi16(k x86.Mmask32, a x86.M512i, imm8 byte) (dst x86.M512i)

M512MaskzSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm512_maskz_srli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512MaskzSrlvEpi16

func M512MaskzSrlvEpi16(k x86.Mmask32, a x86.M512i, count x86.M512i) (dst x86.M512i)

M512MaskzSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm512_maskz_srlv_epi16'. Requires AVX512BW.

func M512MaskzSubEpi16

func M512MaskzSubEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] - b[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm512_maskz_sub_epi16'. Requires AVX512BW.

func M512MaskzSubEpi8

func M512MaskzSubEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] - b[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm512_maskz_sub_epi8'. Requires AVX512BW.

func M512MaskzSubsEpi16

func M512MaskzSubsEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm512_maskz_subs_epi16'. Requires AVX512BW.

func M512MaskzSubsEpi8

func M512MaskzSubsEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm512_maskz_subs_epi8'. Requires AVX512BW.

func M512MaskzSubsEpu16

func M512MaskzSubsEpu16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm512_maskz_subs_epu16'. Requires AVX512BW.
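
Unsigned saturating subtraction simply clamps at zero instead of wrapping. A scalar Go model of the per-element operation (the name is illustrative):

package main

import "fmt"

// subsEpu16 models one element of the unsigned saturating subtract:
// results below zero clamp to zero instead of wrapping.
func subsEpu16(a, b uint16) uint16 {
	if a < b {
		return 0
	}
	return a - b
}

func main() {
	fmt.Println(subsEpu16(5, 9), subsEpu16(9, 5)) // 0 4
}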

func M512MaskzSubsEpu8

func M512MaskzSubsEpu8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm512_maskz_subs_epu8'. Requires AVX512BW.

func M512MaskzUnpackhiEpi16

func M512MaskzUnpackhiEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm512_maskz_unpackhi_epi16'. Requires AVX512BW.

func M512MaskzUnpackhiEpi8

func M512MaskzUnpackhiEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm512_maskz_unpackhi_epi8'. Requires AVX512BW.

func M512MaskzUnpackloEpi16

func M512MaskzUnpackloEpi16(k x86.Mmask32, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm512_maskz_unpacklo_epi16'. Requires AVX512BW.

func M512MaskzUnpackloEpi8

func M512MaskzUnpackloEpi8(k x86.Mmask64, a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaskzUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
tmp_dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
tmp_dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
tmp_dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm512_maskz_unpacklo_epi8'. Requires AVX512BW.

func M512MaxEpi16

func M512MaxEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm512_max_epi16'. Requires AVX512BW.

func M512MaxEpi8

func M512MaxEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 63
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm512_max_epi8'. Requires AVX512BW.

func M512MaxEpu16

func M512MaxEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm512_max_epu16'. Requires AVX512BW.

func M512MaxEpu8

func M512MaxEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 63
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm512_max_epu8'. Requires AVX512BW.

func M512MinEpi16

func M512MinEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm512_min_epi16'. Requires AVX512BW.

func M512MinEpi8

func M512MinEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 63
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm512_min_epi8'. Requires AVX512BW.

func M512MinEpu16

func M512MinEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm512_min_epu16'. Requires AVX512BW.

func M512MinEpu8

func M512MinEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 63
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm512_min_epu8'. Requires AVX512BW.

func M512Movepi16Mask

func M512Movepi16Mask(a x86.M512i) (dst x86.Mmask32)

M512Movepi16Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 16-bit integer in 'a'.

FOR j := 0 to 31
	i := j*16
	IF a[i+15]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:32] := 0

Instruction: 'VPMOVW2M'. Intrinsic: '_mm512_movepi16_mask'. Requires AVX512BW.

func M512Movepi8Mask

func M512Movepi8Mask(a x86.M512i) (dst x86.Mmask64)

M512Movepi8Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 8-bit integer in 'a'.

FOR j := 0 to 63
	i := j*8
	IF a[i+7]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:64] := 0

Instruction: 'VPMOVB2M'. Intrinsic: '_mm512_movepi8_mask'. Requires AVX512BW.
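
This is the classic movemask pattern: one mask bit per element, taken from the element's sign bit. A plain-Go sketch of the byte variant (illustrative only):

package main

import "fmt"

// movepi8Mask models the sign-bit-to-mask move: bit j of the result
// is the most significant bit of byte j.
func movepi8Mask(a []int8) uint64 {
	var k uint64
	for j, v := range a {
		if v < 0 { // sign bit set
			k |= 1 << uint(j)
		}
	}
	return k
}

func main() {
	fmt.Printf("%b\n", movepi8Mask([]int8{-1, 1, -128, 127})) // 101
}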

func M512MovmEpi16

func M512MovmEpi16(k x86.Mmask32) (dst x86.M512i)

M512MovmEpi16: Set each packed 16-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 31
	i := j*16
	IF k[j]
		dst[i+15:i] := 0xFFFF
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVM2W'. Intrinsic: '_mm512_movm_epi16'. Requires AVX512BW.

func M512MovmEpi8

func M512MovmEpi8(k x86.Mmask64) (dst x86.M512i)

M512MovmEpi8: Set each packed 8-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 63
	i := j*8
	IF k[j]
		dst[i+7:i] := 0xFF
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMOVM2B'. Intrinsic: '_mm512_movm_epi8'. Requires AVX512BW.

func M512MulhiEpi16

func M512MulhiEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 31
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm512_mulhi_epi16'. Requires AVX512BW.

func M512MulhiEpu16

func M512MulhiEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 31
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm512_mulhi_epu16'. Requires AVX512BW.

func M512MulhrsEpi16

func M512MulhrsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst'.

FOR j := 0 to 31
	i := j*16
	tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
	dst[i+15:i] := tmp[16:1]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm512_mulhrs_epi16'. Requires AVX512BW.
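
The rounding sequence is easiest to follow with concrete numbers: widen to 32 bits, multiply, shift right by 14, add 1, then keep bits [16:1]. In Q15 fixed point this rounds the product to the nearest representable value. A worked Go sketch (the helper name is made up):

package main

import "fmt"

// mulhrs models one element pair: a Q15 fixed-point multiply with
// rounding, exactly the tmp := ((a*b) >> 14) + 1; dst := tmp[16:1]
// sequence from the pseudocode.
func mulhrs(a, b int16) int16 {
	tmp := (int32(a)*int32(b))>>14 + 1
	return int16(tmp >> 1) // keep bits [16:1]
}

func main() {
	// 0x4000 is 0.5 in Q15; 0.5 * 0.5 rounds to 0.25 (0x2000).
	fmt.Printf("%#x\n", mulhrs(0x4000, 0x4000)) // 0x2000
}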

func M512MulloEpi16

func M512MulloEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512MulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 31
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[15:0]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm512_mullo_epi16'. Requires AVX512BW.

func M512PacksEpi16

func M512PacksEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512PacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst'.

dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])
dst[135:128] := Saturate_Int16_To_Int8 (a[143:128])
dst[143:136] := Saturate_Int16_To_Int8 (a[159:144])
dst[151:144] := Saturate_Int16_To_Int8 (a[175:160])
dst[159:152] := Saturate_Int16_To_Int8 (a[191:176])
dst[167:160] := Saturate_Int16_To_Int8 (a[207:192])
dst[175:168] := Saturate_Int16_To_Int8 (a[223:208])
dst[183:176] := Saturate_Int16_To_Int8 (a[239:224])
dst[191:184] := Saturate_Int16_To_Int8 (a[255:240])
dst[199:192] := Saturate_Int16_To_Int8 (b[143:128])
dst[207:200] := Saturate_Int16_To_Int8 (b[159:144])
dst[215:208] := Saturate_Int16_To_Int8 (b[175:160])
dst[223:216] := Saturate_Int16_To_Int8 (b[191:176])
dst[231:224] := Saturate_Int16_To_Int8 (b[207:192])
dst[239:232] := Saturate_Int16_To_Int8 (b[223:208])
dst[247:240] := Saturate_Int16_To_Int8 (b[239:224])
dst[255:248] := Saturate_Int16_To_Int8 (b[255:240])
dst[263:256] := Saturate_Int16_To_Int8 (a[271:256])
dst[271:264] := Saturate_Int16_To_Int8 (a[287:272])
dst[279:272] := Saturate_Int16_To_Int8 (a[303:288])
dst[287:280] := Saturate_Int16_To_Int8 (a[319:304])
dst[295:288] := Saturate_Int16_To_Int8 (a[335:320])
dst[303:296] := Saturate_Int16_To_Int8 (a[351:336])
dst[311:304] := Saturate_Int16_To_Int8 (a[367:352])
dst[319:312] := Saturate_Int16_To_Int8 (a[383:368])
dst[327:320] := Saturate_Int16_To_Int8 (b[271:256])
dst[335:328] := Saturate_Int16_To_Int8 (b[287:272])
dst[343:336] := Saturate_Int16_To_Int8 (b[303:288])
dst[351:344] := Saturate_Int16_To_Int8 (b[319:304])
dst[359:352] := Saturate_Int16_To_Int8 (b[335:320])
dst[367:360] := Saturate_Int16_To_Int8 (b[351:336])
dst[375:368] := Saturate_Int16_To_Int8 (b[367:352])
dst[383:376] := Saturate_Int16_To_Int8 (b[383:368])
dst[391:384] := Saturate_Int16_To_Int8 (a[399:384])
dst[399:392] := Saturate_Int16_To_Int8 (a[415:400])
dst[407:400] := Saturate_Int16_To_Int8 (a[431:416])
dst[415:408] := Saturate_Int16_To_Int8 (a[447:432])
dst[423:416] := Saturate_Int16_To_Int8 (a[463:448])
dst[431:424] := Saturate_Int16_To_Int8 (a[479:464])
dst[439:432] := Saturate_Int16_To_Int8 (a[495:480])
dst[447:440] := Saturate_Int16_To_Int8 (a[511:496])
dst[455:448] := Saturate_Int16_To_Int8 (b[399:384])
dst[463:456] := Saturate_Int16_To_Int8 (b[415:400])
dst[471:464] := Saturate_Int16_To_Int8 (b[431:416])
dst[479:472] := Saturate_Int16_To_Int8 (b[447:432])
dst[487:480] := Saturate_Int16_To_Int8 (b[463:448])
dst[495:488] := Saturate_Int16_To_Int8 (b[479:464])
dst[503:496] := Saturate_Int16_To_Int8 (b[495:480])
dst[511:504] := Saturate_Int16_To_Int8 (b[511:496])
dst[MAX:512] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm512_packs_epi16'. Requires AVX512BW.

func M512PacksEpi32

func M512PacksEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512PacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst'.

dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])
dst[143:128] := Saturate_Int32_To_Int16 (a[159:128])
dst[159:144] := Saturate_Int32_To_Int16 (a[191:160])
dst[175:160] := Saturate_Int32_To_Int16 (a[223:192])
dst[191:176] := Saturate_Int32_To_Int16 (a[255:224])
dst[207:192] := Saturate_Int32_To_Int16 (b[159:128])
dst[223:208] := Saturate_Int32_To_Int16 (b[191:160])
dst[239:224] := Saturate_Int32_To_Int16 (b[223:192])
dst[255:240] := Saturate_Int32_To_Int16 (b[255:224])
dst[271:256] := Saturate_Int32_To_Int16 (a[287:256])
dst[287:272] := Saturate_Int32_To_Int16 (a[319:288])
dst[303:288] := Saturate_Int32_To_Int16 (a[351:320])
dst[319:304] := Saturate_Int32_To_Int16 (a[383:352])
dst[335:320] := Saturate_Int32_To_Int16 (b[287:256])
dst[351:336] := Saturate_Int32_To_Int16 (b[319:288])
dst[367:352] := Saturate_Int32_To_Int16 (b[351:320])
dst[383:368] := Saturate_Int32_To_Int16 (b[383:352])
dst[399:384] := Saturate_Int32_To_Int16 (a[415:384])
dst[415:400] := Saturate_Int32_To_Int16 (a[447:416])
dst[431:416] := Saturate_Int32_To_Int16 (a[479:448])
dst[447:432] := Saturate_Int32_To_Int16 (a[511:480])
dst[463:448] := Saturate_Int32_To_Int16 (b[415:384])
dst[479:464] := Saturate_Int32_To_Int16 (b[447:416])
dst[495:480] := Saturate_Int32_To_Int16 (b[479:448])
dst[511:496] := Saturate_Int32_To_Int16 (b[511:480])
dst[MAX:512] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm512_packs_epi32'. Requires AVX512BW.

func M512PackusEpi16

func M512PackusEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512PackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst'.

dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])
dst[135:128] := Saturate_Int16_To_UnsignedInt8 (a[143:128])
dst[143:136] := Saturate_Int16_To_UnsignedInt8 (a[159:144])
dst[151:144] := Saturate_Int16_To_UnsignedInt8 (a[175:160])
dst[159:152] := Saturate_Int16_To_UnsignedInt8 (a[191:176])
dst[167:160] := Saturate_Int16_To_UnsignedInt8 (a[207:192])
dst[175:168] := Saturate_Int16_To_UnsignedInt8 (a[223:208])
dst[183:176] := Saturate_Int16_To_UnsignedInt8 (a[239:224])
dst[191:184] := Saturate_Int16_To_UnsignedInt8 (a[255:240])
dst[199:192] := Saturate_Int16_To_UnsignedInt8 (b[143:128])
dst[207:200] := Saturate_Int16_To_UnsignedInt8 (b[159:144])
dst[215:208] := Saturate_Int16_To_UnsignedInt8 (b[175:160])
dst[223:216] := Saturate_Int16_To_UnsignedInt8 (b[191:176])
dst[231:224] := Saturate_Int16_To_UnsignedInt8 (b[207:192])
dst[239:232] := Saturate_Int16_To_UnsignedInt8 (b[223:208])
dst[247:240] := Saturate_Int16_To_UnsignedInt8 (b[239:224])
dst[255:248] := Saturate_Int16_To_UnsignedInt8 (b[255:240])
dst[263:256] := Saturate_Int16_To_UnsignedInt8 (a[271:256])
dst[271:264] := Saturate_Int16_To_UnsignedInt8 (a[287:272])
dst[279:272] := Saturate_Int16_To_UnsignedInt8 (a[303:288])
dst[287:280] := Saturate_Int16_To_UnsignedInt8 (a[319:304])
dst[295:288] := Saturate_Int16_To_UnsignedInt8 (a[335:320])
dst[303:296] := Saturate_Int16_To_UnsignedInt8 (a[351:336])
dst[311:304] := Saturate_Int16_To_UnsignedInt8 (a[367:352])
dst[319:312] := Saturate_Int16_To_UnsignedInt8 (a[383:368])
dst[327:320] := Saturate_Int16_To_UnsignedInt8 (b[271:256])
dst[335:328] := Saturate_Int16_To_UnsignedInt8 (b[287:272])
dst[343:336] := Saturate_Int16_To_UnsignedInt8 (b[303:288])
dst[351:344] := Saturate_Int16_To_UnsignedInt8 (b[319:304])
dst[359:352] := Saturate_Int16_To_UnsignedInt8 (b[335:320])
dst[367:360] := Saturate_Int16_To_UnsignedInt8 (b[351:336])
dst[375:368] := Saturate_Int16_To_UnsignedInt8 (b[367:352])
dst[383:376] := Saturate_Int16_To_UnsignedInt8 (b[383:368])
dst[391:384] := Saturate_Int16_To_UnsignedInt8 (a[399:384])
dst[399:392] := Saturate_Int16_To_UnsignedInt8 (a[415:400])
dst[407:400] := Saturate_Int16_To_UnsignedInt8 (a[431:416])
dst[415:408] := Saturate_Int16_To_UnsignedInt8 (a[447:432])
dst[423:416] := Saturate_Int16_To_UnsignedInt8 (a[463:448])
dst[431:424] := Saturate_Int16_To_UnsignedInt8 (a[479:464])
dst[439:432] := Saturate_Int16_To_UnsignedInt8 (a[495:480])
dst[447:440] := Saturate_Int16_To_UnsignedInt8 (a[511:496])
dst[455:448] := Saturate_Int16_To_UnsignedInt8 (b[399:384])
dst[463:456] := Saturate_Int16_To_UnsignedInt8 (b[415:400])
dst[471:464] := Saturate_Int16_To_UnsignedInt8 (b[431:416])
dst[479:472] := Saturate_Int16_To_UnsignedInt8 (b[447:432])
dst[487:480] := Saturate_Int16_To_UnsignedInt8 (b[463:448])
dst[495:488] := Saturate_Int16_To_UnsignedInt8 (b[479:464])
dst[503:496] := Saturate_Int16_To_UnsignedInt8 (b[495:480])
dst[511:504] := Saturate_Int16_To_UnsignedInt8 (b[511:496])
dst[MAX:512] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm512_packus_epi16'. Requires AVX512BW.

func M512PackusEpi32

func M512PackusEpi32(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512PackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst'.

dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])
dst[143:128] := Saturate_Int32_To_UnsignedInt16 (a[159:128])
dst[159:144] := Saturate_Int32_To_UnsignedInt16 (a[191:160])
dst[175:160] := Saturate_Int32_To_UnsignedInt16 (a[223:192])
dst[191:176] := Saturate_Int32_To_UnsignedInt16 (a[255:224])
dst[207:192] := Saturate_Int32_To_UnsignedInt16 (b[159:128])
dst[223:208] := Saturate_Int32_To_UnsignedInt16 (b[191:160])
dst[239:224] := Saturate_Int32_To_UnsignedInt16 (b[223:192])
dst[255:240] := Saturate_Int32_To_UnsignedInt16 (b[255:224])
dst[271:256] := Saturate_Int32_To_UnsignedInt16 (a[287:256])
dst[287:272] := Saturate_Int32_To_UnsignedInt16 (a[319:288])
dst[303:288] := Saturate_Int32_To_UnsignedInt16 (a[351:320])
dst[319:304] := Saturate_Int32_To_UnsignedInt16 (a[383:352])
dst[335:320] := Saturate_Int32_To_UnsignedInt16 (b[287:256])
dst[351:336] := Saturate_Int32_To_UnsignedInt16 (b[319:288])
dst[367:352] := Saturate_Int32_To_UnsignedInt16 (b[351:320])
dst[383:368] := Saturate_Int32_To_UnsignedInt16 (b[383:352])
dst[399:384] := Saturate_Int32_To_UnsignedInt16 (a[415:384])
dst[415:400] := Saturate_Int32_To_UnsignedInt16 (a[447:416])
dst[431:416] := Saturate_Int32_To_UnsignedInt16 (a[479:448])
dst[447:432] := Saturate_Int32_To_UnsignedInt16 (a[511:480])
dst[463:448] := Saturate_Int32_To_UnsignedInt16 (b[415:384])
dst[479:464] := Saturate_Int32_To_UnsignedInt16 (b[447:416])
dst[495:480] := Saturate_Int32_To_UnsignedInt16 (b[479:448])
dst[511:496] := Saturate_Int32_To_UnsignedInt16 (b[511:480])
dst[MAX:512] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm512_packus_epi32'. Requires AVX512BW.

func M512Permutex2varEpi16

func M512Permutex2varEpi16(a x86.M512i, idx x86.M512i, b x86.M512i) (dst x86.M512i)

M512Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' across lanes using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	off := 16*idx[i+4:i]
	dst[i+15:i] := idx[i+5] ? b[off+15:off] : a[off+15:off]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm512_permutex2var_epi16'. Requires AVX512BW.

func M512PermutexvarEpi16

func M512PermutexvarEpi16(idx x86.M512i, a x86.M512i) (dst x86.M512i)

M512PermutexvarEpi16: Shuffle 16-bit integers in 'a' across lanes using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	id := idx[i+4:i]*16
	dst[i+15:i] := a[id+15:id]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm512_permutexvar_epi16'. Requires AVX512BW.

func M512SadEpu8

func M512SadEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SadEpu8: Compute the absolute differences of packed unsigned 8-bit integers in 'a' and 'b', then horizontally sum each consecutive 8 differences to produce four unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low 16 bits of 64-bit elements in 'dst'.

FOR j := 0 to 63
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR
FOR j := 0 to 7
	i := j*64
	dst[i+15:i] := tmp[i+7:i] + tmp[i+15:i+8] + tmp[i+23:i+16] + tmp[i+31:i+24] + tmp[i+39:i+32] + tmp[i+47:i+40] + tmp[i+55:i+48] + tmp[i+63:i+56]
	dst[i+63:i+16] := 0
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSADBW'. Intrinsic: '_mm512_sad_epu8'. Requires AVX512BW.
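
Each 64-bit group therefore holds one small sum: eight byte-wise absolute differences, totalling at most 8*255 = 2040, which fits comfortably in 16 bits. A one-group Go sketch (illustrative only):

package main

import "fmt"

// sadEpu8 models one 64-bit group: eight byte-wise absolute
// differences summed into a single 16-bit value.
func sadEpu8(a, b [8]uint8) uint16 {
	var sum uint16
	for j := range a {
		if a[j] >= b[j] {
			sum += uint16(a[j] - b[j])
		} else {
			sum += uint16(b[j] - a[j])
		}
	}
	return sum
}

func main() {
	a := [8]uint8{10, 0, 0, 0, 0, 0, 0, 0}
	b := [8]uint8{0, 10, 0, 0, 0, 0, 0, 0}
	fmt.Println(sadEpu8(a, b)) // 20
}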

func M512ShuffleEpi8

func M512ShuffleEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512ShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	IF b[i+7] == 1
		dst[i+7:i] := 0
	ELSE
		index[3:0] := b[i+3:i]
		dst[i+7:i] := a[index*8+7:index*8]
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm512_shuffle_epi8'. Requires AVX512BW.

func M512ShufflehiEpi16

func M512ShufflehiEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)

M512ShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 128-bit lanes of 'dst', with the low 64 bits of 128-bit lanes being copied from 'a' to 'dst'.

dst[63:0] := a[63:0]
dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]
dst[191:128] := a[191:128]
dst[207:192] := (a >> (imm8[1:0] * 16))[207:192]
dst[223:208] := (a >> (imm8[3:2] * 16))[207:192]
dst[239:224] := (a >> (imm8[5:4] * 16))[207:192]
dst[255:240] := (a >> (imm8[7:6] * 16))[207:192]
dst[319:256] := a[319:256]
dst[335:320] := (a >> (imm8[1:0] * 16))[335:320]
dst[351:336] := (a >> (imm8[3:2] * 16))[335:320]
dst[367:352] := (a >> (imm8[5:4] * 16))[335:320]
dst[383:368] := (a >> (imm8[7:6] * 16))[335:320]
dst[447:384] := a[447:384]
dst[463:448] := (a >> (imm8[1:0] * 16))[463:448]
dst[479:464] := (a >> (imm8[3:2] * 16))[463:448]
dst[495:480] := (a >> (imm8[5:4] * 16))[463:448]
dst[511:496] := (a >> (imm8[7:6] * 16))[463:448]
dst[MAX:512] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm512_shufflehi_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512ShuffleloEpi16

func M512ShuffleloEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)

M512ShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 128-bit lanes of 'dst', with the high 64 bits of 128-bit lanes being copied from 'a' to 'dst'.

dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
dst[127:64] := a[127:64]
dst[143:128] := (a >> (imm8[1:0] * 16))[143:128]
dst[159:144] := (a >> (imm8[3:2] * 16))[143:128]
dst[175:160] := (a >> (imm8[5:4] * 16))[143:128]
dst[191:176] := (a >> (imm8[7:6] * 16))[143:128]
dst[255:192] := a[255:192]
dst[271:256] := (a >> (imm8[1:0] * 16))[271:256]
dst[287:272] := (a >> (imm8[3:2] * 16))[271:256]
dst[303:288] := (a >> (imm8[5:4] * 16))[271:256]
dst[319:304] := (a >> (imm8[7:6] * 16))[271:256]
dst[383:320] := a[383:320]
dst[399:384] := (a >> (imm8[1:0] * 16))[399:384]
dst[415:400] := (a >> (imm8[3:2] * 16))[399:384]
dst[431:416] := (a >> (imm8[5:4] * 16))[399:384]
dst[447:432] := (a >> (imm8[7:6] * 16))[399:384]
dst[511:448] := a[511:448]
dst[MAX:512] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm512_shufflelo_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512SllEpi16

func M512SllEpi16(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF count[63:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm512_sll_epi16'. Requires AVX512BW.

func M512SlliEpi16

func M512SlliEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF imm8[7:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm512_slli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512SllvEpi16

func M512SllvEpi16(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm512_sllv_epi16'. Requires AVX512BW.

func M512SraEpi16

func M512SraEpi16(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF count[63:0] > 15
		dst[i+15:i] := SignBit
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm512_sra_epi16'. Requires AVX512BW.

func M512SraiEpi16

func M512SraiEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF imm8[7:0] > 15
		dst[i+15:i] := SignBit
	ELSE
		dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm512_srai_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512SravEpi16

func M512SravEpi16(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm512_srav_epi16'. Requires AVX512BW.

func M512SrlEpi16

func M512SrlEpi16(a x86.M512i, count x86.M128i) (dst x86.M512i)

M512SrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF count[63:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm512_srl_epi16'. Requires AVX512BW.

func M512SrliEpi16

func M512SrliEpi16(a x86.M512i, imm8 byte) (dst x86.M512i)

M512SrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	IF imm8[7:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
	FI
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm512_srli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func M512SrlvEpi16

func M512SrlvEpi16(a x86.M512i, count x86.M512i) (dst x86.M512i)

M512SrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm512_srlv_epi16'. Requires AVX512BW.

func M512SubEpi16

func M512SubEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := a[i+15:i] - b[i+15:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm512_sub_epi16'. Requires AVX512BW.

func M512SubEpi8

func M512SubEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := a[i+7:i] - b[i+7:i]
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm512_sub_epi8'. Requires AVX512BW.

func M512SubsEpi16

func M512SubsEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm512_subs_epi16'. Requires AVX512BW.

func M512SubsEpi8

func M512SubsEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm512_subs_epi8'. Requires AVX512BW.

func M512SubsEpu16

func M512SubsEpu16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 31
	i := j*16
	dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm512_subs_epu16'. Requires AVX512BW.

func M512SubsEpu8

func M512SubsEpu8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512SubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst'.

FOR j := 0 to 63
	i := j*8
	dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
ENDFOR
dst[MAX:512] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm512_subs_epu8'. Requires AVX512BW.

func M512TestEpi16Mask

func M512TestEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512TestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 31
	i := j*16
	k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTMW'. Intrinsic: '_mm512_test_epi16_mask'. Requires AVX512BW.
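
The test-mask family is a fused AND-plus-compare: no vector result is materialized, only one mask bit per element. A plain-Go model of the word variant (the name is illustrative):

package main

import "fmt"

// testEpi16Mask models the AND-and-test: mask bit j is set when the
// bitwise AND of word pair j is non-zero.
func testEpi16Mask(a, b []uint16) uint32 {
	var k uint32
	for j := range a {
		if a[j]&b[j] != 0 {
			k |= 1 << uint(j)
		}
	}
	return k
}

func main() {
	fmt.Printf("%b\n", testEpi16Mask([]uint16{1, 2, 4}, []uint16{1, 1, 4})) // 101
}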

func M512TestEpi8Mask

func M512TestEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512TestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 63
	i := j*8
	k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPTESTMB'. Intrinsic: '_mm512_test_epi8_mask'. Requires AVX512BW.

func M512TestnEpi16Mask

func M512TestnEpi16Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask32)

M512TestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 31
	i := j*16
	k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:32] := 0

Instruction: 'VPTESTNMW'. Intrinsic: '_mm512_testn_epi16_mask'. Requires AVX512BW.

func M512TestnEpi8Mask

func M512TestnEpi8Mask(a x86.M512i, b x86.M512i) (dst x86.Mmask64)

M512TestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 63
	i := j*8
	k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:64] := 0

Instruction: 'VPTESTNMB'. Intrinsic: '_mm512_testn_epi8_mask'. Requires AVX512BW.

func M512UnpackhiEpi16

func M512UnpackhiEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_WORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_WORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_WORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm512_unpackhi_epi16'. Requires AVX512BW.
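
Per 128-bit lane, the upper four words of each source are alternated. A plain-Go sketch of one lane, treating it as eight little-endian words (name illustrative):

// interleaveHighWords models INTERLEAVE_HIGH_WORDS for a single
// 128-bit lane: the upper four words (indices 4..7) of each source
// alternate into the destination as src1[4], src2[4], src1[5], ...
func interleaveHighWords(src1, src2 [8]uint16) (dst [8]uint16) {
	for j := 0; j < 4; j++ {
		dst[2*j] = src1[4+j]
		dst[2*j+1] = src2[4+j]
	}
	return
}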

func M512UnpackhiEpi8

func M512UnpackhiEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_HIGH_BYTES(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_HIGH_BYTES(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_HIGH_BYTES(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm512_unpackhi_epi8'. Requires AVX512BW.

func M512UnpackloEpi16

func M512UnpackloEpi16(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackloEpi16: Unpack and interleave 16-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_WORDS(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_WORDS(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_WORDS(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm512_unpacklo_epi16'. Requires AVX512BW.

func M512UnpackloEpi8

func M512UnpackloEpi8(a x86.M512i, b x86.M512i) (dst x86.M512i)

M512UnpackloEpi8: Unpack and interleave 8-bit integers from the low half of each 128-bit lane in 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])
dst[255:128] := INTERLEAVE_BYTES(a[255:128], b[255:128])
dst[383:256] := INTERLEAVE_BYTES(a[383:256], b[383:256])
dst[511:384] := INTERLEAVE_BYTES(a[511:384], b[511:384])
dst[MAX:512] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm512_unpacklo_epi8'. Requires AVX512BW.

func Mask2Permutex2varEpi16

func Mask2Permutex2varEpi16(a x86.M128i, idx x86.M128i, k x86.Mmask8, b x86.M128i) (dst x86.M128i)

Mask2Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'idx' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		off := 16*idx[i+2:i]
		dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := idx[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2W'. Intrinsic: '_mm_mask2_permutex2var_epi16'. Requires AVX512BW.
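
Bits [2:0] of each index select an element and bit 3 selects between 'a' and 'b'. A one-element sketch in plain Go (name illustrative, not part of this package):

// permutex2varWord models one iteration of the loop above:
// idx bits [2:0] pick the element, idx bit 3 picks 'b' over 'a'.
func permutex2varWord(a, b [8]uint16, idx uint16) uint16 {
	off := idx & 7
	if idx&8 != 0 {
		return b[off]
	}
	return a[off]
}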

func MaskAbsEpi16

func MaskAbsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := ABS(a[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm_mask_abs_epi16'. Requires AVX512BW.

func MaskAbsEpi8

func MaskAbsEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i) (dst x86.M128i)

MaskAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := ABS(a[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm_mask_abs_epi8'. Requires AVX512BW.

func MaskAddEpi16

func MaskAddEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] + b[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm_mask_add_epi16'. Requires AVX512BW.
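
The writemask pattern here is shared by all of the Mask* variants below: enabled lanes take the computed value, disabled lanes keep the corresponding element of 'src'. A plain-Go sketch (name illustrative, not part of this package):

// maskAddEpi16 models the writemask pattern: bit j of k enables
// lane j; disabled lanes pass the src element through unchanged.
func maskAddEpi16(src [8]int16, k uint8, a, b [8]int16) (dst [8]int16) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = a[j] + b[j] // wrapping add, as VPADDW
		} else {
			dst[j] = src[j]
		}
	}
	return
}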

func MaskAddEpi8

func MaskAddEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] + b[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm_mask_add_epi8'. Requires AVX512BW.

func MaskAddsEpi16

func MaskAddsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm_mask_adds_epi16'. Requires AVX512BW.

func MaskAddsEpi8

func MaskAddsEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm_mask_adds_epi8'. Requires AVX512BW.

func MaskAddsEpu16

func MaskAddsEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm_mask_adds_epu16'. Requires AVX512BW.

func MaskAddsEpu8

func MaskAddsEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm_mask_adds_epu8'. Requires AVX512BW.

func MaskAlignrEpi8

func MaskAlignrEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i, count int) (dst x86.M128i)

MaskAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8)

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm_mask_alignr_epi8'. Requires AVX512BW.
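
A plain-Go sketch of the concatenate-and-shift step, assuming a non-negative 'count' (name illustrative; offsets past the 32-byte buffer shift in zeros):

// alignrBytes models ((a << 128) OR b) >> (count*8): 'b' forms the
// low 16 bytes, 'a' the high 16, and the result is the 16 bytes
// starting at byte offset 'count'.
func alignrBytes(a, b [16]byte, count int) (dst [16]byte) {
	var tmp [32]byte
	copy(tmp[:16], b[:])
	copy(tmp[16:], a[:])
	for j := 0; j < 16; j++ {
		if count+j < 32 {
			dst[j] = tmp[count+j]
		}
	}
	return
}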

func MaskAvgEpu16

func MaskAvgEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm_mask_avg_epu16'. Requires AVX512BW.
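
The '+ 1' before the halving makes the average round half up. A one-lane sketch in plain Go (name illustrative):

// avgEpu16 models one lane: widen so the sum cannot overflow.
func avgEpu16(a, b uint16) uint16 {
	return uint16((uint32(a) + uint32(b) + 1) >> 1)
}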

func MaskAvgEpu8

func MaskAvgEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm_mask_avg_epu8'. Requires AVX512BW.

func MaskBlendEpi16

func MaskBlendEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskBlendEpi16: Blend packed 16-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := b[i+15:i]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBLENDMW'. Intrinsic: '_mm_mask_blend_epi16'. Requires AVX512BW.

func MaskBlendEpi8

func MaskBlendEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskBlendEpi8: Blend packed 8-bit integers from 'a' and 'b' using control mask 'k', and store the results in 'dst'.

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := b[i+7:i]
	ELSE
		dst[i+7:i] := a[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBLENDMB'. Intrinsic: '_mm_mask_blend_epi8'. Requires AVX512BW.

func MaskBroadcastbEpi8

func MaskBroadcastbEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i) (dst x86.M128i)

MaskBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_mask_broadcastb_epi8'. Requires AVX512BW.

func MaskBroadcastwEpi16

func MaskBroadcastwEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_mask_broadcastw_epi16'. Requires AVX512BW.

func MaskCmpEpi16Mask

func MaskCmpEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmp_epi16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
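
The CASE dispatch on 'imm8' can be modelled with an ordinary switch. A one-lane sketch in plain Go, assuming values of 'imm8' above 7 behave like their low three bits (the pseudocode leaves them unspecified; name illustrative):

// cmpInt16 models the comparison predicate selected by imm8.
func cmpInt16(a, b int16, imm8 byte) bool {
	switch imm8 & 7 {
	case 0: // _MM_CMPINT_EQ
		return a == b
	case 1: // _MM_CMPINT_LT
		return a < b
	case 2: // _MM_CMPINT_LE
		return a <= b
	case 3: // _MM_CMPINT_FALSE
		return false
	case 4: // _MM_CMPINT_NEQ
		return a != b
	case 5: // _MM_CMPINT_NLT
		return a >= b
	case 6: // _MM_CMPINT_NLE
		return a > b
	default: // 7: _MM_CMPINT_TRUE
		return true
	}
}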

func MaskCmpEpi8Mask

func MaskCmpEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)

MaskCmpEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmp_epi8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskCmpEpu16Mask

func MaskCmpEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask8)

MaskCmpEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] OP b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmp_epu16_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskCmpEpu8Mask

func MaskCmpEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.Mmask16)

MaskCmpEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' based on the comparison operand specified by 'imm8', and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

CASE (imm8[7:0]) OF
0: OP := _MM_CMPINT_EQ
1: OP := _MM_CMPINT_LT
2: OP := _MM_CMPINT_LE
3: OP := _MM_CMPINT_FALSE
4: OP := _MM_CMPINT_NEQ
5: OP := _MM_CMPINT_NLT
6: OP := _MM_CMPINT_NLE
7: OP := _MM_CMPINT_TRUE
ESAC
FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] OP b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmp_epu8_mask'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskCmpeqEpi16Mask

func MaskCmpeqEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpeq_epi16_mask'. Requires AVX512BW.

func MaskCmpeqEpi8Mask

func MaskCmpeqEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpeqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpeq_epi8_mask'. Requires AVX512BW.

func MaskCmpeqEpu16Mask

func MaskCmpeqEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpeqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] == b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpeq_epu16_mask'. Requires AVX512BW.

func MaskCmpeqEpu8Mask

func MaskCmpeqEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpeqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for equality, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] == b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpeq_epu8_mask'. Requires AVX512BW.

func MaskCmpgeEpi16Mask

func MaskCmpgeEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpge_epi16_mask'. Requires AVX512BW.

func MaskCmpgeEpi8Mask

func MaskCmpgeEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpgeEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpge_epi8_mask'. Requires AVX512BW.

func MaskCmpgeEpu16Mask

func MaskCmpgeEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgeEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] >= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpge_epu16_mask'. Requires AVX512BW.

func MaskCmpgeEpu8Mask

func MaskCmpgeEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpgeEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] >= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpge_epu8_mask'. Requires AVX512BW.

func MaskCmpgtEpi16Mask

func MaskCmpgtEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpgt_epi16_mask'. Requires AVX512BW.

func MaskCmpgtEpi8Mask

func MaskCmpgtEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpgtEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpgt_epi8_mask'. Requires AVX512BW.

func MaskCmpgtEpu16Mask

func MaskCmpgtEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpgtEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] > b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpgt_epu16_mask'. Requires AVX512BW.

func MaskCmpgtEpu8Mask

func MaskCmpgtEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpgtEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for greater-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] > b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpgt_epu8_mask'. Requires AVX512BW.

func MaskCmpleEpi16Mask

func MaskCmpleEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmple_epi16_mask'. Requires AVX512BW.

func MaskCmpleEpi8Mask

func MaskCmpleEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpleEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmple_epi8_mask'. Requires AVX512BW.

func MaskCmpleEpu16Mask

func MaskCmpleEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpleEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] <= b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmple_epu16_mask'. Requires AVX512BW.

func MaskCmpleEpu8Mask

func MaskCmpleEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpleEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than-or-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] <= b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmple_epu8_mask'. Requires AVX512BW.

func MaskCmpltEpi16Mask

func MaskCmpltEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmplt_epi16_mask'. Requires AVX512BW.

func MaskCmpltEpi8Mask

func MaskCmpltEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpltEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmplt_epi8_mask'. Requires AVX512BW.

func MaskCmpltEpu16Mask

func MaskCmpltEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpltEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] < b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmplt_epu16_mask'. Requires AVX512BW.

func MaskCmpltEpu8Mask

func MaskCmpltEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpltEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for less-than, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] < b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmplt_epu8_mask'. Requires AVX512BW.

func MaskCmpneqEpi16Mask

func MaskCmpneqEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpi16Mask: Compare packed 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPW'. Intrinsic: '_mm_mask_cmpneq_epi16_mask'. Requires AVX512BW.

func MaskCmpneqEpi8Mask

func MaskCmpneqEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpneqEpi8Mask: Compare packed 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPB'. Intrinsic: '_mm_mask_cmpneq_epi8_mask'. Requires AVX512BW.

func MaskCmpneqEpu16Mask

func MaskCmpneqEpu16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskCmpneqEpu16Mask: Compare packed unsigned 16-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ( a[i+15:i] != b[i+15:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPCMPUW'. Intrinsic: '_mm_mask_cmpneq_epu16_mask'. Requires AVX512BW.

func MaskCmpneqEpu8Mask

func MaskCmpneqEpu8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskCmpneqEpu8Mask: Compare packed unsigned 8-bit integers in 'a' and 'b' for not-equal, and store the results in mask vector 'k' using zeromask 'k1' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ( a[i+7:i] != b[i+7:i] ) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPCMPUB'. Intrinsic: '_mm_mask_cmpneq_epu8_mask'. Requires AVX512BW.

func MaskCvtepi16Epi8

func MaskCvtepi16Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm_mask_cvtepi16_epi8'. Requires AVX512BW.

func MaskCvtepi8Epi16

func MaskCvtepi8Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := SignExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm_mask_cvtepi8_epi16'. Requires AVX512BW.

func MaskCvtepu8Epi16

func MaskCvtepu8Epi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := ZeroExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := src[l+15:l]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm_mask_cvtepu8_epi16'. Requires AVX512BW.

func MaskCvtsepi16Epi8

func MaskCvtsepi16Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm_mask_cvtsepi16_epi8'. Requires AVX512BW.
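
A one-element sketch of Saturate_Int16_To_Int8 in plain Go (name illustrative, not part of this package):

// satInt16ToInt8 clamps a 16-bit value to the signed 8-bit range.
func satInt16ToInt8(x int16) int8 {
	if x > 127 {
		return 127
	}
	if x < -128 {
		return -128
	}
	return int8(x)
}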

func MaskCvtusepi16Epi8

func MaskCvtusepi16Epi8(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt16_To_UnsignedInt8(a[i+15:i])
	ELSE
		dst[l+7:l] := src[l+7:l]
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm_mask_cvtusepi16_epi8'. Requires AVX512BW.

func MaskDbsadEpu8

func MaskDbsadEpu8(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

MaskDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])

FOR j := 0 to 1
	i := j*64
	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm_mask_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskMaddEpi16

func MaskMaddEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
	ELSE
		dst[i+31:i] := src[i+31:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm_mask_madd_epi16'. Requires AVX512BW.
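
Each 32-bit result is the sum of two adjacent 16x16-bit products, computed at 32-bit width as VPMADDWD does. A one-lane sketch in plain Go (name illustrative):

// maddPair models one 32-bit result lane of the loop above.
func maddPair(aLo, aHi, bLo, bHi int16) int32 {
	return int32(aHi)*int32(bHi) + int32(aLo)*int32(bLo)
}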

func MaskMaddubsEpi16

func MaskMaddubsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm_mask_maddubs_epi16'. Requires AVX512BW.

func MaskMaxEpi16

func MaskMaxEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm_mask_max_epi16'. Requires AVX512BW.

func MaskMaxEpi8

func MaskMaxEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm_mask_max_epi8'. Requires AVX512BW.

func MaskMaxEpu16

func MaskMaxEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm_mask_max_epu16'. Requires AVX512BW.

func MaskMaxEpu8

func MaskMaxEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm_mask_max_epu8'. Requires AVX512BW.

func MaskMinEpi16

func MaskMinEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm_mask_min_epi16'. Requires AVX512BW.

func MaskMinEpi8

func MaskMinEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm_mask_min_epi8'. Requires AVX512BW.

func MaskMinEpu16

func MaskMinEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm_mask_min_epu16'. Requires AVX512BW.

func MaskMinEpu8

func MaskMinEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm_mask_min_epu8'. Requires AVX512BW.

func MaskMovEpi16

func MaskMovEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQU16'. Intrinsic: '_mm_mask_mov_epi16'. Requires AVX512BW.

func MaskMovEpi8

func MaskMovEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i) (dst x86.M128i)

MaskMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQU8'. Intrinsic: '_mm_mask_mov_epi8'. Requires AVX512BW.

func MaskMulhiEpi16

func MaskMulhiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm_mask_mulhi_epi16'. Requires AVX512BW.

func MaskMulhiEpu16

func MaskMulhiEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm_mask_mulhi_epu16'. Requires AVX512BW.

func MaskMulhrsEpi16

func MaskMulhrsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
		dst[i+15:i] := tmp[16:1]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm_mask_mulhrs_epi16'. Requires AVX512BW.
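
A one-lane sketch of the round-and-extract in plain Go (name illustrative, not part of this package):

// mulhrsEpi16 keeps the 18 high bits of the 32-bit product,
// adds 1 to round, then returns bits [16:1].
func mulhrsEpi16(a, b int16) int16 {
	tmp := ((int32(a) * int32(b)) >> 14) + 1
	return int16(tmp >> 1)
}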

func MaskMulloEpi16

func MaskMulloEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm_mask_mullo_epi16'. Requires AVX512BW.

func MaskPacksEpi16

func MaskPacksEpi16(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm_mask_packs_epi16'. Requires AVX512BW.

func MaskPacksEpi32

func MaskPacksEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm_mask_packs_epi32'. Requires AVX512BW.

func MaskPackusEpi16

func MaskPackusEpi16(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm_mask_packus_epi16'. Requires AVX512BW.

func MaskPackusEpi32

func MaskPackusEpi32(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm_mask_packus_epi32'. Requires AVX512BW.

func MaskPermutex2varEpi16

func MaskPermutex2varEpi16(a x86.M128i, k x86.Mmask8, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'a' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		off := 16*idx[i+2:i]
		dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := a[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMT2W'. Intrinsic: '_mm_mask_permutex2var_epi16'. Requires AVX512BW.

func MaskPermutexvarEpi16

func MaskPermutexvarEpi16(src x86.M128i, k x86.Mmask8, idx x86.M128i, a x86.M128i) (dst x86.M128i)

MaskPermutexvarEpi16: Shuffle 16-bit integers in 'a' using the corresponding index in 'idx', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	id := idx[i+2:i]*16
	IF k[j]
		dst[i+15:i] := a[id+15:id]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm_mask_permutexvar_epi16'. Requires AVX512BW.

func MaskSet1Epi16

func MaskSet1Epi16(src x86.M128i, k x86.Mmask8, a int16) (dst x86.M128i)

MaskSet1Epi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_mask_set1_epi16'. Requires AVX512BW.

func MaskSet1Epi8

func MaskSet1Epi8(src x86.M128i, k x86.Mmask16, a byte) (dst x86.M128i)

MaskSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_mask_set1_epi8'. Requires AVX512BW.

func MaskShuffleEpi8

func MaskShuffleEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF b[i+7] == 1
			dst[i+7:i] := 0
		ELSE
			index[3:0] := b[i+3:i]
			dst[i+7:i] := a[index*8+7:index*8]
		FI
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm_mask_shuffle_epi8'. Requires AVX512BW.
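
A one-lane sketch of the shuffle control in plain Go (name illustrative, not part of this package):

// shuffleByte models one lane of VPSHUFB: a set high bit zeroes
// the result, otherwise the low four control bits index into 'a'.
func shuffleByte(a [16]byte, control byte) byte {
	if control&0x80 != 0 {
		return 0
	}
	return a[control&0x0F]
}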

func MaskShufflehiEpi16

func MaskShufflehiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 'dst', with the low 64 bits being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm_mask_shufflehi_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskShuffleloEpi16

func MaskShuffleloEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 'dst', with the high 64 bits being copied from 'a' to 'dst', using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm_mask_shufflelo_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskSllEpi16

func MaskSllEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm_mask_sll_epi16'. Requires AVX512BW.

func MaskSlliEpi16

func MaskSlliEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm_mask_slli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
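
A one-lane sketch in plain Go; counts above 15 clear the element (name illustrative, not part of this package):

// slliEpi16 shifts left, shifting in zeros from the right.
func slliEpi16(x uint16, imm8 byte) uint16 {
	if imm8 > 15 {
		return 0
	}
	return x << imm8
}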

func MaskSllvEpi16

func MaskSllvEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm_mask_sllv_epi16'. Requires AVX512BW.

func MaskSraEpi16

func MaskSraEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm_mask_sra_epi16'. Requires AVX512BW.

func MaskSraiEpi16

func MaskSraiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm_mask_srai_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
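
A one-lane sketch in plain Go; '>>' on a signed Go value is an arithmetic shift, and counts above 15 saturate so every result bit is the sign bit (name illustrative):

// sraiEpi16 shifts right, shifting in copies of the sign bit.
func sraiEpi16(x int16, imm8 byte) int16 {
	if imm8 > 15 {
		imm8 = 15
	}
	return x >> imm8
}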

func MaskSravEpi16

func MaskSravEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm_mask_srav_epi16'. Requires AVX512BW.

func MaskSrlEpi16

func MaskSrlEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm_mask_srl_epi16'. Requires AVX512BW.

func MaskSrliEpi16

func MaskSrliEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm_mask_srli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskSrlvEpi16

func MaskSrlvEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm_mask_srlv_epi16'. Requires AVX512BW.

func MaskSubEpi16

func MaskSubEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] - b[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm_mask_sub_epi16'. Requires AVX512BW.

func MaskSubEpi8

func MaskSubEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] - b[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm_mask_sub_epi8'. Requires AVX512BW.

func MaskSubsEpi16

func MaskSubsEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm_mask_subs_epi16'. Requires AVX512BW.

func MaskSubsEpi8

func MaskSubsEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm_mask_subs_epi8'. Requires AVX512BW.

func MaskSubsEpu16

func MaskSubsEpu16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm_mask_subs_epu16'. Requires AVX512BW.
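
Unsigned saturation clamps the difference at zero, as this scalar sketch (assumed [8]uint16 lanes and uint8 mask) illustrates:

func maskSubsEpu16Model(src, a, b [8]uint16, k uint8) (dst [8]uint16) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			if a[j] > b[j] {
				dst[j] = a[j] - b[j]
			} else {
				dst[j] = 0 // unsigned saturation: results below zero clamp to 0
			}
		} else {
			dst[j] = src[j]
		}
	}
	return
}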

func MaskSubsEpu8

func MaskSubsEpu8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm_mask_subs_epu8'. Requires AVX512BW.

func MaskTestEpi16Mask

func MaskTestEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k') if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMW'. Intrinsic: '_mm_mask_test_epi16_mask'. Requires AVX512BW.
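
In scalar form the masked test reduces to one AND per lane; a sketch under the same assumed types:

func maskTestEpi16MaskModel(k1 uint8, a, b [8]uint16) (k uint8) {
	for j := 0; j < 8; j++ {
		// Only lanes allowed by the writemask k1 can set a result bit.
		if k1&(1<<uint(j)) != 0 && a[j]&b[j] != 0 {
			k |= 1 << uint(j)
		}
	}
	return
}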

func MaskTestEpi8Mask

func MaskTestEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskTestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k') if the intermediate value is non-zero.

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTMB'. Intrinsic: '_mm_mask_test_epi8_mask'. Requires AVX512BW.

func MaskTestnEpi16Mask

func MaskTestnEpi16Mask(k1 x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.Mmask8)

MaskTestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k') if the intermediate value is zero.

FOR j := 0 to 7
	i := j*16
	IF k1[j]
		k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMW'. Intrinsic: '_mm_mask_testn_epi16_mask'. Requires AVX512BW.

func MaskTestnEpi8Mask

func MaskTestnEpi8Mask(k1 x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.Mmask16)

MaskTestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' (subject to writemask 'k') if the intermediate value is zero.

FOR j := 0 to 15
	i := j*8
	IF k1[j]
		k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMB'. Intrinsic: '_mm_mask_testn_epi8_mask'. Requires AVX512BW.

func MaskUnpackhiEpi16

func MaskUnpackhiEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm_mask_unpackhi_epi16'. Requires AVX512BW.

func MaskUnpackhiEpi8

func MaskUnpackhiEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm_mask_unpackhi_epi8'. Requires AVX512BW.

func MaskUnpackloEpi16

func MaskUnpackloEpi16(src x86.M128i, k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := src[i+15:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm_mask_unpacklo_epi16'. Requires AVX512BW.

func MaskUnpackloEpi8

func MaskUnpackloEpi8(src x86.M128i, k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using writemask 'k' (elements are copied from 'src' when the corresponding mask bit is not set).

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := src[i+7:i]
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm_mask_unpacklo_epi8'. Requires AVX512BW.

func MaskzAbsEpi16

func MaskzAbsEpi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzAbsEpi16: Compute the absolute value of packed 16-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := ABS(a[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSW'. Intrinsic: '_mm_maskz_abs_epi16'. Requires AVX512BW.

func MaskzAbsEpi8

func MaskzAbsEpi8(k x86.Mmask16, a x86.M128i) (dst x86.M128i)

MaskzAbsEpi8: Compute the absolute value of packed 8-bit integers in 'a', and store the unsigned results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := ABS(a[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPABSB'. Intrinsic: '_mm_maskz_abs_epi8'. Requires AVX512BW.

func MaskzAddEpi16

func MaskzAddEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddEpi16: Add packed 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] + b[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDW'. Intrinsic: '_mm_maskz_add_epi16'. Requires AVX512BW.

func MaskzAddEpi8

func MaskzAddEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddEpi8: Add packed 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] + b[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDB'. Intrinsic: '_mm_maskz_add_epi8'. Requires AVX512BW.

func MaskzAddsEpi16

func MaskzAddsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddsEpi16: Add packed 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDSW'. Intrinsic: '_mm_maskz_adds_epi16'. Requires AVX512BW.

func MaskzAddsEpi8

func MaskzAddsEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddsEpi8: Add packed 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDSB'. Intrinsic: '_mm_maskz_adds_epi8'. Requires AVX512BW.

func MaskzAddsEpu16

func MaskzAddsEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddsEpu16: Add packed unsigned 16-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16( a[i+15:i] + b[i+15:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDUSW'. Intrinsic: '_mm_maskz_adds_epu16'. Requires AVX512BW.

func MaskzAddsEpu8

func MaskzAddsEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAddsEpu8: Add packed unsigned 8-bit integers in 'a' and 'b' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8( a[i+7:i] + b[i+7:i] )
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPADDUSB'. Intrinsic: '_mm_maskz_adds_epu8'. Requires AVX512BW.
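
A scalar sketch of the zero-masked saturating add, widening first so the sum cannot wrap (assumed [16]uint8 lanes and uint16 mask):

func maskzAddsEpu8Model(k uint16, a, b [16]uint8) (dst [16]uint8) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) != 0 {
			s := uint16(a[j]) + uint16(b[j]) // widen so the sum cannot wrap
			if s > 0xFF {
				s = 0xFF // unsigned saturation caps at 255
			}
			dst[j] = uint8(s)
		} // zeromask: unselected elements remain zero
	}
	return
}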

func MaskzAlignrEpi8

func MaskzAlignrEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i, count int) (dst x86.M128i)

MaskzAlignrEpi8: Concatenate pairs of 16-byte blocks in 'a' and 'b' into a 32-byte temporary result, shift the result right by 'count' bytes, and store the low 16 bytes in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[255:0] := ((a[127:0] << 128) OR b[127:0]) >> (count[7:0]*8)

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPALIGNR'. Intrinsic: '_mm_maskz_alignr_epi8'. Requires AVX512BW.
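
The 32-byte concatenate-and-shift is easy to mirror with a temporary buffer; a sketch, again with assumed types:

func maskzAlignrEpi8Model(k uint16, a, b [16]uint8, count int) (dst [16]uint8) {
	count &= 0xFF // only the low 8 bits of the shift count are used
	// Concatenate b (low 16 bytes) and a (high 16 bytes), then read 16
	// bytes starting 'count' bytes in; bytes past the pair are zero.
	var tmp [32]uint8
	copy(tmp[:16], b[:])
	copy(tmp[16:], a[:])
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) != 0 && count+j < 32 {
			dst[j] = tmp[count+j]
		}
	}
	return
}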

func MaskzAvgEpu16

func MaskzAvgEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAvgEpu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPAVGW'. Intrinsic: '_mm_maskz_avg_epu16'. Requires AVX512BW.
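
The '+ 1' makes this a rounding average; a scalar sketch that widens so the rounding bit cannot overflow:

func maskzAvgEpu16Model(k uint8, a, b [8]uint16) (dst [8]uint16) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			// Halve with rounding up, in a wider type to avoid overflow.
			dst[j] = uint16((uint32(a[j]) + uint32(b[j]) + 1) >> 1)
		}
	}
	return
}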

func MaskzAvgEpu8

func MaskzAvgEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzAvgEpu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPAVGB'. Intrinsic: '_mm_maskz_avg_epu8'. Requires AVX512BW.

func MaskzBroadcastbEpi8

func MaskzBroadcastbEpi8(k x86.Mmask16, a x86.M128i) (dst x86.M128i)

MaskzBroadcastbEpi8: Broadcast the low packed 8-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_maskz_broadcastb_epi8'. Requires AVX512BW.

func MaskzBroadcastwEpi16

func MaskzBroadcastwEpi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzBroadcastwEpi16: Broadcast the low packed 16-bit integer from 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_maskz_broadcastw_epi16'. Requires AVX512BW.

func MaskzCvtepi16Epi8

func MaskzCvtepi16Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with truncation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Truncate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVWB'. Intrinsic: '_mm_maskz_cvtepi16_epi8'. Requires AVX512BW.

func MaskzCvtepi8Epi16

func MaskzCvtepi8Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepi8Epi16: Sign extend packed 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := SignExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVSXBW'. Intrinsic: '_mm_maskz_cvtepi8_epi16'. Requires AVX512BW.

func MaskzCvtepu8Epi16

func MaskzCvtepu8Epi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtepu8Epi16: Zero extend packed unsigned 8-bit integers in 'a' to packed 16-bit integers, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*8
	l := j*16
	IF k[j]
		dst[l+15:l] := ZeroExtend(a[i+7:i])
	ELSE
		dst[l+15:l] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVZXBW'. Intrinsic: '_mm_maskz_cvtepu8_epi16'. Requires AVX512BW.

func MaskzCvtsepi16Epi8

func MaskzCvtsepi16Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtsepi16Epi8: Convert packed 16-bit integers in 'a' to packed 8-bit integers with signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_Int16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVSWB'. Intrinsic: '_mm_maskz_cvtsepi16_epi8'. Requires AVX512BW.
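
A scalar sketch of the signed saturating narrowing, assuming int16 input and int8 output lanes:

func maskzCvtsepi16Epi8Model(k uint8, a [8]int16) (dst [8]int8) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			v := a[j]
			if v > 127 {
				v = 127 // saturate to the int8 maximum
			} else if v < -128 {
				v = -128 // saturate to the int8 minimum
			}
			dst[j] = int8(v)
		}
	}
	return
}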

func MaskzCvtusepi16Epi8

func MaskzCvtusepi16Epi8(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzCvtusepi16Epi8: Convert packed unsigned 16-bit integers in 'a' to packed unsigned 8-bit integers with unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := 16*j
	l := 8*j
	IF k[j]
		dst[l+7:l] := Saturate_UnsignedInt16_To_Int8(a[i+15:i])
	ELSE
		dst[l+7:l] := 0
	FI
ENDFOR
dst[MAX:64] := 0

Instruction: 'VPMOVUSWB'. Intrinsic: '_mm_maskz_cvtusepi16_epi8'. Requires AVX512BW.

func MaskzDbsadEpu8

func MaskzDbsadEpu8(k x86.Mmask8, a x86.M128i, b x86.M128i, imm8 byte) (dst x86.M128i)

MaskzDbsadEpu8: Compute the sum of absolute differences (SADs) of quadruplets of unsigned 8-bit integers in 'a' compared to those in 'b', and store the 16-bit results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

Four SADs are performed on four 8-bit quadruplets for each 64-bit lane. The first two SADs use the lower 8-bit quadruplet of the lane from 'a', and the last two SADs use the upper 8-bit quadruplet of the lane from 'a'. Quadruplets from 'b' are selected according to the control in 'imm8', and each SAD in each 64-bit lane uses the selected quadruplet at 8-bit offsets.

tmp[31:0] := select(b[127:0], imm8[1:0])
tmp[63:32] := select(b[127:0], imm8[3:2])
tmp[95:64] := select(b[127:0], imm8[5:4])
tmp[127:96] := select(b[127:0], imm8[7:6])

FOR j := 0 to 1
	i := j*64
	tmp_dst[i+15:i] := ABS(a[i+7:i] - tmp[i+7:i]) + ABS(a[i+15:i+8] - tmp[i+15:i+8])
				 + ABS(a[i+23:i+16] - tmp[i+23:i+16]) + ABS(a[i+31:i+24] - tmp[i+31:i+24])

	tmp_dst[i+31:i+16] := ABS(a[i+7:i] - tmp[i+15:i+8]) + ABS(a[i+15:i+8] - tmp[i+23:i+16])
				 + ABS(a[i+23:i+16] - tmp[i+31:i+24]) + ABS(a[i+31:i+24] - tmp[i+39:i+32])

	tmp_dst[i+47:i+32] := ABS(a[i+39:i+32] - tmp[i+23:i+16]) + ABS(a[i+47:i+40] - tmp[i+31:i+24])
				 + ABS(a[i+55:i+48] - tmp[i+39:i+32]) + ABS(a[i+63:i+56] - tmp[i+47:i+40])

	tmp_dst[i+63:i+48] := ABS(a[i+39:i+32] - tmp[i+31:i+24]) + ABS(a[i+47:i+40] - tmp[i+39:i+32])
				 + ABS(a[i+55:i+48] - tmp[i+47:i+40]) + ABS(a[i+63:i+56] - tmp[i+55:i+48])
ENDFOR

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VDBPSADBW'. Intrinsic: '_mm_maskz_dbsad_epu8'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)
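
The quadruplet selection and sliding one-byte offsets are easier to follow in scalar form; a sketch under the same assumed types:

func maskzDbsadEpu8Model(k uint8, a, b [16]uint8, imm8 uint8) (dst [8]uint16) {
	// Select four 4-byte quadruplets from 'b' using the 2-bit fields of imm8.
	var tmp [16]uint8
	for g := 0; g < 4; g++ {
		sel := int(imm8>>(2*uint(g))) & 3
		copy(tmp[4*g:4*g+4], b[4*sel:4*sel+4])
	}
	abs := func(x, y uint8) uint16 {
		if x > y {
			return uint16(x - y)
		}
		return uint16(y - x)
	}
	for lane := 0; lane < 2; lane++ {
		o := 8 * lane // byte offset of this 64-bit lane
		for s := 0; s < 4; s++ {
			aOff := o + 4*(s/2) // SADs 0-1 use the low quadruplet of 'a', SADs 2-3 the high
			tOff := o + s       // the selected quadruplet slides one byte per SAD
			var sad uint16
			for t := 0; t < 4; t++ {
				sad += abs(a[aOff+t], tmp[tOff+t])
			}
			if j := 4*lane + s; k&(1<<uint(j)) != 0 {
				dst[j] = sad
			}
		}
	}
	return
}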

func MaskzMaddEpi16

func MaskzMaddEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaddEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 3
	i := j*32
	IF k[j]
		dst[i+31:i] := a[i+31:i+16]*b[i+31:i+16] + a[i+15:i]*b[i+15:i]
	ELSE
		dst[i+31:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMADDWD'. Intrinsic: '_mm_maskz_madd_epi16'. Requires AVX512BW.
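
A scalar sketch of the widening multiply and horizontal add of adjacent pairs (assumed types as before):

func maskzMaddEpi16Model(k uint8, a, b [8]int16) (dst [4]int32) {
	for j := 0; j < 4; j++ {
		if k&(1<<uint(j)) != 0 {
			// Widen to 32 bits, multiply, then add the adjacent products.
			dst[j] = int32(a[2*j])*int32(b[2*j]) + int32(a[2*j+1])*int32(b[2*j+1])
		}
	}
	return
}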

func MaskzMaddubsEpi16

func MaskzMaddubsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaddubsEpi16: Multiply packed unsigned 8-bit integers in 'a' by packed signed 8-bit integers in 'b', producing intermediate signed 16-bit integers. Horizontally add adjacent pairs of intermediate signed 16-bit integers, and pack the saturated results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + a[i+7:i]*b[i+7:i] )
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMADDUBSW'. Intrinsic: '_mm_maskz_maddubs_epi16'. Requires AVX512BW.

func MaskzMaxEpi16

func MaskzMaxEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSW'. Intrinsic: '_mm_maskz_max_epi16'. Requires AVX512BW.

func MaskzMaxEpi8

func MaskzMaxEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXSB'. Intrinsic: '_mm_maskz_max_epi8'. Requires AVX512BW.

func MaskzMaxEpu16

func MaskzMaxEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] > b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUW'. Intrinsic: '_mm_maskz_max_epu16'. Requires AVX512BW.

func MaskzMaxEpu8

func MaskzMaxEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMaxEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] > b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMAXUB'. Intrinsic: '_mm_maskz_max_epu8'. Requires AVX512BW.

func MaskzMinEpi16

func MaskzMinEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSW'. Intrinsic: '_mm_maskz_min_epi16'. Requires AVX512BW.

func MaskzMinEpi8

func MaskzMinEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpi8: Compare packed 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINSB'. Intrinsic: '_mm_maskz_min_epi8'. Requires AVX512BW.

func MaskzMinEpu16

func MaskzMinEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpu16: Compare packed unsigned 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF a[i+15:i] < b[i+15:i]
			dst[i+15:i] := a[i+15:i]
		ELSE
			dst[i+15:i] := b[i+15:i]
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUW'. Intrinsic: '_mm_maskz_min_epu16'. Requires AVX512BW.

func MaskzMinEpu8

func MaskzMinEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMinEpu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF a[i+7:i] < b[i+7:i]
			dst[i+7:i] := a[i+7:i]
		ELSE
			dst[i+7:i] := b[i+7:i]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMINUB'. Intrinsic: '_mm_maskz_min_epu8'. Requires AVX512BW.

func MaskzMovEpi16

func MaskzMovEpi16(k x86.Mmask8, a x86.M128i) (dst x86.M128i)

MaskzMovEpi16: Move packed 16-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQU16'. Intrinsic: '_mm_maskz_mov_epi16'. Requires AVX512BW.

func MaskzMovEpi8

func MaskzMovEpi8(k x86.Mmask16, a x86.M128i) (dst x86.M128i)

MaskzMovEpi8: Move packed 8-bit integers from 'a' into 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VMOVDQU8'. Intrinsic: '_mm_maskz_mov_epi8'. Requires AVX512BW.

func MaskzMulhiEpi16

func MaskzMulhiEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulhiEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULHW'. Intrinsic: '_mm_maskz_mulhi_epi16'. Requires AVX512BW.

func MaskzMulhiEpu16

func MaskzMulhiEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulhiEpu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[31:16]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULHUW'. Intrinsic: '_mm_maskz_mulhi_epu16'. Requires AVX512BW.

func MaskzMulhrsEpi16

func MaskzMulhrsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulhrsEpi16: Multiply packed 16-bit integers in 'a' and 'b', producing intermediate signed 32-bit integers. Truncate each intermediate integer to the 18 most significant bits, round by adding 1, and store bits [16:1] to 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := ((a[i+15:i] * b[i+15:i]) >> 14) + 1
		dst[i+15:i] := tmp[16:1]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULHRSW'. Intrinsic: '_mm_maskz_mulhrs_epi16'. Requires AVX512BW.
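
The '>> 14, add 1, take bits [16:1]' sequence amounts to a fixed-point multiply with round-to-nearest; a scalar sketch:

func maskzMulhrsEpi16Model(k uint8, a, b [8]int16) (dst [8]int16) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			tmp := (int32(a[j])*int32(b[j]))>>14 + 1 // keep 18 bits, add the rounding bit
			dst[j] = int16(tmp >> 1)                 // bits [16:1] of the rounded product
		}
	}
	return
}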

func MaskzMulloEpi16

func MaskzMulloEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzMulloEpi16: Multiply the packed 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		tmp[31:0] := a[i+15:i] * b[i+15:i]
		dst[i+15:i] := tmp[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMULLW'. Intrinsic: '_mm_maskz_mullo_epi16'. Requires AVX512BW.

func MaskzPacksEpi16

func MaskzPacksEpi16(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPacksEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_Int8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_Int8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_Int8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_Int8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_Int8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_Int8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_Int8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_Int8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_Int8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_Int8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_Int8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_Int8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_Int8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_Int8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_Int8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_Int8 (b[127:112])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKSSWB'. Intrinsic: '_mm_maskz_packs_epi16'. Requires AVX512BW.

func MaskzPacksEpi32

func MaskzPacksEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPacksEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using signed saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_Int16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_Int16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_Int16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_Int16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_Int16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_Int16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_Int16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_Int16 (b[127:96])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKSSDW'. Intrinsic: '_mm_maskz_packs_epi32'. Requires AVX512BW.

func MaskzPackusEpi16

func MaskzPackusEpi16(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPackusEpi16: Convert packed 16-bit integers from 'a' and 'b' to packed 8-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[7:0] := Saturate_Int16_To_UnsignedInt8 (a[15:0])
tmp_dst[15:8] := Saturate_Int16_To_UnsignedInt8 (a[31:16])
tmp_dst[23:16] := Saturate_Int16_To_UnsignedInt8 (a[47:32])
tmp_dst[31:24] := Saturate_Int16_To_UnsignedInt8 (a[63:48])
tmp_dst[39:32] := Saturate_Int16_To_UnsignedInt8 (a[79:64])
tmp_dst[47:40] := Saturate_Int16_To_UnsignedInt8 (a[95:80])
tmp_dst[55:48] := Saturate_Int16_To_UnsignedInt8 (a[111:96])
tmp_dst[63:56] := Saturate_Int16_To_UnsignedInt8 (a[127:112])
tmp_dst[71:64] := Saturate_Int16_To_UnsignedInt8 (b[15:0])
tmp_dst[79:72] := Saturate_Int16_To_UnsignedInt8 (b[31:16])
tmp_dst[87:80] := Saturate_Int16_To_UnsignedInt8 (b[47:32])
tmp_dst[95:88] := Saturate_Int16_To_UnsignedInt8 (b[63:48])
tmp_dst[103:96] := Saturate_Int16_To_UnsignedInt8 (b[79:64])
tmp_dst[111:104] := Saturate_Int16_To_UnsignedInt8 (b[95:80])
tmp_dst[119:112] := Saturate_Int16_To_UnsignedInt8 (b[111:96])
tmp_dst[127:120] := Saturate_Int16_To_UnsignedInt8 (b[127:112])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKUSWB'. Intrinsic: '_mm_maskz_packus_epi16'. Requires AVX512BW.

func MaskzPackusEpi32

func MaskzPackusEpi32(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPackusEpi32: Convert packed 32-bit integers from 'a' and 'b' to packed 16-bit integers using unsigned saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := Saturate_Int32_To_UnsignedInt16 (a[31:0])
tmp_dst[31:16] := Saturate_Int32_To_UnsignedInt16 (a[63:32])
tmp_dst[47:32] := Saturate_Int32_To_UnsignedInt16 (a[95:64])
tmp_dst[63:48] := Saturate_Int32_To_UnsignedInt16 (a[127:96])
tmp_dst[79:64] := Saturate_Int32_To_UnsignedInt16 (b[31:0])
tmp_dst[95:80] := Saturate_Int32_To_UnsignedInt16 (b[63:32])
tmp_dst[111:96] := Saturate_Int32_To_UnsignedInt16 (b[95:64])
tmp_dst[127:112] := Saturate_Int32_To_UnsignedInt16 (b[127:96])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPACKUSDW'. Intrinsic: '_mm_maskz_packus_epi32'. Requires AVX512BW.

func MaskzPermutex2varEpi16

func MaskzPermutex2varEpi16(k x86.Mmask8, a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzPermutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		off := 16*idx[i+2:i]
		dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm_maskz_permutex2var_epi16'. Requires AVX512BW.
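
In scalar terms, bit 3 of each index selects the source and the low 3 bits select the element; a sketch with assumed types:

func maskzPermutex2varEpi16Model(k uint8, a, idx, b [8]uint16) (dst [8]uint16) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			off := idx[j] & 0x7 // low 3 bits pick the element
			if idx[j]&0x8 != 0 {
				dst[j] = b[off] // bit 3 set: take from the second source
			} else {
				dst[j] = a[off]
			}
		}
	}
	return
}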

func MaskzPermutexvarEpi16

func MaskzPermutexvarEpi16(k x86.Mmask8, idx x86.M128i, a x86.M128i) (dst x86.M128i)

MaskzPermutexvarEpi16: Shuffle 16-bit integers in 'a' using the corresponding index in 'idx', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	id := idx[i+2:i]*16
	IF k[j]
		dst[i+15:i] := a[id+15:id]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm_maskz_permutexvar_epi16'. Requires AVX512BW.

func MaskzSet1Epi16

func MaskzSet1Epi16(k x86.Mmask8, a int16) (dst x86.M128i)

MaskzSet1Epi16: Broadcast 16-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[15:0]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTW'. Intrinsic: '_mm_maskz_set1_epi16'. Requires AVX512BW.

func MaskzSet1Epi8

func MaskzSet1Epi8(k x86.Mmask16, a byte) (dst x86.M128i)

MaskzSet1Epi8: Broadcast 8-bit integer 'a' to all elements of 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[7:0]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPBROADCASTB'. Intrinsic: '_mm_maskz_set1_epi8'. Requires AVX512BW.

func MaskzShuffleEpi8

func MaskzShuffleEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzShuffleEpi8: Shuffle packed 8-bit integers in 'a' according to shuffle control mask in the corresponding 8-bit element of 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		IF b[i+7] == 1
			dst[i+7:i] := 0
		ELSE
			index[3:0] := b[i+3:i]
			dst[i+7:i] := a[index*8+7:index*8]
		FI
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFB'. Intrinsic: '_mm_maskz_shuffle_epi8'. Requires AVX512BW.
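
A scalar sketch of the byte shuffle: the control byte's high bit forces a zero, its low 4 bits index into 'a' (assumed types):

func maskzShuffleEpi8Model(k uint16, a, b [16]uint8) (dst [16]uint8) {
	for j := 0; j < 16; j++ {
		if k&(1<<uint(j)) != 0 {
			if b[j]&0x80 != 0 {
				dst[j] = 0 // high control bit set: zero the element
			} else {
				dst[j] = a[b[j]&0x0F] // low 4 bits index into 'a'
			}
		}
	}
	return
}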

func MaskzShufflehiEpi16

func MaskzShufflehiEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzShufflehiEpi16: Shuffle 16-bit integers in the high 64 bits of 'a' using the control in 'imm8'. Store the results in the high 64 bits of 'dst', with the low 64 bits being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[63:0] := a[63:0]
tmp_dst[79:64] := (a >> (imm8[1:0] * 16))[79:64]
tmp_dst[95:80] := (a >> (imm8[3:2] * 16))[79:64]
tmp_dst[111:96] := (a >> (imm8[5:4] * 16))[79:64]
tmp_dst[127:112] := (a >> (imm8[7:6] * 16))[79:64]

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFHW'. Intrinsic: '_mm_maskz_shufflehi_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskzShuffleloEpi16

func MaskzShuffleloEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzShuffleloEpi16: Shuffle 16-bit integers in the low 64 bits of 'a' using the control in 'imm8'. Store the results in the low 64 bits of 'dst', with the high 64 bits being copied from 'a' to 'dst', using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

tmp_dst[15:0] := (a >> (imm8[1:0] * 16))[15:0]
tmp_dst[31:16] := (a >> (imm8[3:2] * 16))[15:0]
tmp_dst[47:32] := (a >> (imm8[5:4] * 16))[15:0]
tmp_dst[63:48] := (a >> (imm8[7:6] * 16))[15:0]
tmp_dst[127:64] := a[127:64]

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSHUFLW'. Intrinsic: '_mm_maskz_shufflelo_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskzSllEpi16

func MaskzSllEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllEpi16: Shift packed 16-bit integers in 'a' left by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm_maskz_sll_epi16'. Requires AVX512BW.

func MaskzSlliEpi16

func MaskzSlliEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSlliEpi16: Shift packed 16-bit integers in 'a' left by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLW'. Intrinsic: '_mm_maskz_slli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskzSllvEpi16

func MaskzSllvEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm_maskz_sllv_epi16'. Requires AVX512BW.

func MaskzSraEpi16

func MaskzSraEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSraEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm_maskz_sra_epi16'. Requires AVX512BW.

func MaskzSraiEpi16

func MaskzSraiEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSraiEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := SignBit
		ELSE
			dst[i+15:i] := SignExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAW'. Intrinsic: '_mm_maskz_srai_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskzSravEpi16

func MaskzSravEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm_maskz_srav_epi16'. Requires AVX512BW.

func MaskzSrlEpi16

func MaskzSrlEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlEpi16: Shift packed 16-bit integers in 'a' right by 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF count[63:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[63:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm_maskz_srl_epi16'. Requires AVX512BW.

func MaskzSrliEpi16

func MaskzSrliEpi16(k x86.Mmask8, a x86.M128i, imm8 byte) (dst x86.M128i)

MaskzSrliEpi16: Shift packed 16-bit integers in 'a' right by 'imm8' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		IF imm8[7:0] > 15
			dst[i+15:i] := 0
		ELSE
			dst[i+15:i] := ZeroExtend(a[i+15:i] >> imm8[7:0])
		FI
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLW'. Intrinsic: '_mm_maskz_srli_epi16'. Requires AVX512BW.

FIXME: Requires compiler support (has immediate)

func MaskzSrlvEpi16

func MaskzSrlvEpi16(k x86.Mmask8, a x86.M128i, count x86.M128i) (dst x86.M128i)

MaskzSrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm_maskz_srlv_epi16'. Requires AVX512BW.

func MaskzSubEpi16

func MaskzSubEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := a[i+15:i] - b[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBW'. Intrinsic: '_mm_maskz_sub_epi16'. Requires AVX512BW.

func MaskzSubEpi8

func MaskzSubEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := a[i+7:i] - b[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBB'. Intrinsic: '_mm_maskz_sub_epi8'. Requires AVX512BW.

func MaskzSubsEpi16

func MaskzSubsEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubsEpi16: Subtract packed 16-bit integers in 'b' from packed 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_Int16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBSW'. Intrinsic: '_mm_maskz_subs_epi16'. Requires AVX512BW.

func MaskzSubsEpi8

func MaskzSubsEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubsEpi8: Subtract packed 8-bit integers in 'b' from packed 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_Int8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBSB'. Intrinsic: '_mm_maskz_subs_epi8'. Requires AVX512BW.

func MaskzSubsEpu16

func MaskzSubsEpu16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubsEpu16: Subtract packed unsigned 16-bit integers in 'b' from packed unsigned 16-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := Saturate_To_UnsignedInt16(a[i+15:i] - b[i+15:i])
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBUSW'. Intrinsic: '_mm_maskz_subs_epu16'. Requires AVX512BW.

func MaskzSubsEpu8

func MaskzSubsEpu8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzSubsEpu8: Subtract packed unsigned 8-bit integers in 'b' from packed unsigned 8-bit integers in 'a' using saturation, and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := Saturate_To_UnsignedInt8(a[i+7:i] - b[i+7:i])
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSUBUSB'. Intrinsic: '_mm_maskz_subs_epu8'. Requires AVX512BW.

func MaskzUnpackhiEpi16

func MaskzUnpackhiEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackhiEpi16: Unpack and interleave 16-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[79:64]
	dst[31:16] := src2[79:64]
	dst[47:32] := src1[95:80]
	dst[63:48] := src2[95:80]
	dst[79:64] := src1[111:96]
	dst[95:80] := src2[111:96]
	dst[111:96] := src1[127:112]
	dst[127:112] := src2[127:112]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_WORDS(a[127:0], b[127:0])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHWD'. Intrinsic: '_mm_maskz_unpackhi_epi16'. Requires AVX512BW.

func MaskzUnpackhiEpi8

func MaskzUnpackhiEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackhiEpi8: Unpack and interleave 8-bit integers from the high half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_HIGH_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[71:64]
	dst[15:8] := src2[71:64]
	dst[23:16] := src1[79:72]
	dst[31:24] := src2[79:72]
	dst[39:32] := src1[87:80]
	dst[47:40] := src2[87:80]
	dst[55:48] := src1[95:88]
	dst[63:56] := src2[95:88]
	dst[71:64] := src1[103:96]
	dst[79:72] := src2[103:96]
	dst[87:80] := src1[111:104]
	dst[95:88] := src2[111:104]
	dst[103:96] := src1[119:112]
	dst[111:104] := src2[119:112]
	dst[119:112] := src1[127:120]
	dst[127:120] := src2[127:120]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_HIGH_BYTES(a[127:0], b[127:0])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKHBW'. Intrinsic: '_mm_maskz_unpackhi_epi8'. Requires AVX512BW.

func MaskzUnpackloEpi16

func MaskzUnpackloEpi16(k x86.Mmask8, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackloEpi16: Unpack and interleave 16-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_WORDS(src1[127:0], src2[127:0]){
	dst[15:0] := src1[15:0]
	dst[31:16] := src2[15:0]
	dst[47:32] := src1[31:16]
	dst[63:48] := src2[31:16]
	dst[79:64] := src1[47:32]
	dst[95:80] := src2[47:32]
	dst[111:96] := src1[63:48]
	dst[127:112] := src2[63:48]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_WORDS(a[127:0], b[127:0])

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := tmp_dst[i+15:i]
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLWD'. Intrinsic: '_mm_maskz_unpacklo_epi16'. Requires AVX512BW.

func MaskzUnpackloEpi8

func MaskzUnpackloEpi8(k x86.Mmask16, a x86.M128i, b x86.M128i) (dst x86.M128i)

MaskzUnpackloEpi8: Unpack and interleave 8-bit integers from the low half of 'a' and 'b', and store the results in 'dst' using zeromask 'k' (elements are zeroed out when the corresponding mask bit is not set).

INTERLEAVE_BYTES(src1[127:0], src2[127:0]){
	dst[7:0] := src1[7:0]
	dst[15:8] := src2[7:0]
	dst[23:16] := src1[15:8]
	dst[31:24] := src2[15:8]
	dst[39:32] := src1[23:16]
	dst[47:40] := src2[23:16]
	dst[55:48] := src1[31:24]
	dst[63:56] := src2[31:24]
	dst[71:64] := src1[39:32]
	dst[79:72] := src2[39:32]
	dst[87:80] := src1[47:40]
	dst[95:88] := src2[47:40]
	dst[103:96] := src1[55:48]
	dst[111:104] := src2[55:48]
	dst[119:112] := src1[63:56]
	dst[127:120] := src2[63:56]
	RETURN dst[127:0]
}

tmp_dst[127:0] := INTERLEAVE_BYTES(a[127:0], b[127:0])

FOR j := 0 to 15
	i := j*8
	IF k[j]
		dst[i+7:i] := tmp_dst[i+7:i]
	ELSE
		dst[i+7:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPUNPCKLBW'. Intrinsic: '_mm_maskz_unpacklo_epi8'. Requires AVX512BW.

func Movepi16Mask

func Movepi16Mask(a x86.M128i) (dst x86.Mmask8)

Movepi16Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 16-bit integer in 'a'.

FOR j := 0 to 7
	i := j*16
	IF a[i+15]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:8] := 0

Instruction: 'VPMOVW2M'. Intrinsic: '_mm_movepi16_mask'. Requires AVX512BW.
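
A scalar sketch: the most significant bit of a signed 16-bit lane is simply its sign, so the mask collects the signs:

func movepi16MaskModel(a [8]int16) (k uint8) {
	for j := 0; j < 8; j++ {
		if a[j] < 0 { // most significant bit set
			k |= 1 << uint(j)
		}
	}
	return
}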

func Movepi8Mask

func Movepi8Mask(a x86.M128i) (dst x86.Mmask16)

Movepi8Mask: Set each bit of mask register 'k' based on the most significant bit of the corresponding packed 8-bit integer in 'a'.

FOR j := 0 to 15
	i := j*8
	IF a[i+7]
		k[j] := 1
	ELSE
		k[j] := 0
	FI
ENDFOR
k[MAX:16] := 0

Instruction: 'VPMOVB2M'. Intrinsic: '_mm_movepi8_mask'. Requires AVX512BW.

func MovmEpi16

func MovmEpi16(k x86.Mmask8) (dst x86.M128i)

MovmEpi16: Set each packed 16-bit integer in 'dst' to all ones or all zeros based on the value of the corresponding bit in 'k'.

FOR j := 0 to 7
	i := j*16
	IF k[j]
		dst[i+15:i] := 0xFFFF
	ELSE
		dst[i+15:i] := 0
	FI
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPMOVM2W'. Intrinsic: '_mm_movm_epi16'. Requires AVX512BW.
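
And the inverse direction, expanding each mask bit into an all-ones or all-zeros lane; a sketch:

func movmEpi16Model(k uint8) (dst [8]uint16) {
	for j := 0; j < 8; j++ {
		if k&(1<<uint(j)) != 0 {
			dst[j] = 0xFFFF // all ones where the mask bit is set
		}
	}
	return
}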

func Permutex2varEpi16

func Permutex2varEpi16(a x86.M128i, idx x86.M128i, b x86.M128i) (dst x86.M128i)

Permutex2varEpi16: Shuffle 16-bit integers in 'a' and 'b' using the corresponding selector and index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*16
	off := 16*idx[i+2:i]
	dst[i+15:i] := idx[i+3] ? b[off+15:off] : a[off+15:off]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMI2W, VPERMT2W'. Intrinsic: '_mm_permutex2var_epi16'. Requires AVX512BW.

func PermutexvarEpi16

func PermutexvarEpi16(idx x86.M128i, a x86.M128i) (dst x86.M128i)

PermutexvarEpi16: Shuffle 16-bit integers in 'a' using the corresponding index in 'idx', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*16
	id := idx[i+2:i]*16
	dst[i+15:i] := a[id+15:id]
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPERMW'. Intrinsic: '_mm_permutexvar_epi16'. Requires AVX512BW.

func SllvEpi16

func SllvEpi16(a x86.M128i, count x86.M128i) (dst x86.M128i)

SllvEpi16: Shift packed 16-bit integers in 'a' left by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := ZeroExtend(a[i+15:i] << count[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSLLVW'. Intrinsic: '_mm_sllv_epi16'. Requires AVX512BW.

func SravEpi16

func SravEpi16(a x86.M128i, count x86.M128i) (dst x86.M128i)

SravEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in sign bits, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := SignExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRAVW'. Intrinsic: '_mm_srav_epi16'. Requires AVX512BW.

func SrlvEpi16

func SrlvEpi16(a x86.M128i, count x86.M128i) (dst x86.M128i)

SrlvEpi16: Shift packed 16-bit integers in 'a' right by the amount specified by the corresponding element in 'count' while shifting in zeros, and store the results in 'dst'.

FOR j := 0 to 7
	i := j*16
	dst[i+15:i] := ZeroExtend(a[i+15:i] >> count[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: 'VPSRLVW'. Intrinsic: '_mm_srlv_epi16'. Requires AVX512BW.

func TestEpi16Mask

func TestEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestEpi16Mask: Compute the bitwise AND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 7
	i := j*16
	k[j] := ((a[i+15:i] AND b[i+15:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTMW'. Intrinsic: '_mm_test_epi16_mask'. Requires AVX512BW.

func TestEpi8Mask

func TestEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

TestEpi8Mask: Compute the bitwise AND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is non-zero.

FOR j := 0 to 15
	i := j*8
	k[j] := ((a[i+7:i] AND b[i+7:i]) != 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTMB'. Intrinsic: '_mm_test_epi8_mask'. Requires AVX512BW.

func TestnEpi16Mask

func TestnEpi16Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask8)

TestnEpi16Mask: Compute the bitwise NAND of packed 16-bit integers in 'a' and 'b', producing intermediate 16-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 7
	i := j*16
	k[j] := ((a[i+15:i] AND b[i+15:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:8] := 0

Instruction: 'VPTESTNMW'. Intrinsic: '_mm_testn_epi16_mask'. Requires AVX512BW.

func TestnEpi8Mask

func TestnEpi8Mask(a x86.M128i, b x86.M128i) (dst x86.Mmask16)

TestnEpi8Mask: Compute the bitwise NAND of packed 8-bit integers in 'a' and 'b', producing intermediate 8-bit values, and set the corresponding bit in result mask 'k' if the intermediate value is zero.

FOR j := 0 to 15
	i := j*8
	k[j] := ((a[i+7:i] AND b[i+7:i]) == 0) ? 1 : 0
ENDFOR
k[MAX:16] := 0

Instruction: 'VPTESTNMB'. Intrinsic: '_mm_testn_epi8_mask'. Requires AVX512BW.

Types

This section is empty.
