sse

package
v0.0.0-...-3878f85 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 23, 2017 License: MIT Imports: 1 Imported by: 0

Documentation

Overview

THESE PACKAGES ARE FOR DEMONSTRATION PURPOSES ONLY!

THEY DO NOT CONTAIN WORKING INTRINSICS!

See https://github.com/klauspost/intrinsics

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AcosPd

func AcosPd(a x86.M128d) (dst x86.M128d)

AcosPd: Compute the inverse cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ACOS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acos_pd'. Requires SSE.

func AcosPs

func AcosPs(a x86.M128) (dst x86.M128)

AcosPs: Compute the inverse cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ACOS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acos_ps'. Requires SSE.

func AcoshPd

func AcoshPd(a x86.M128d) (dst x86.M128d)

AcoshPd: Compute the inverse hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ACOSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acosh_pd'. Requires SSE.

func AcoshPs

func AcoshPs(a x86.M128) (dst x86.M128)

AcoshPs: Compute the inverse hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ACOSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_acosh_ps'. Requires SSE.

func AddPs

func AddPs(a x86.M128, b x86.M128) (dst x86.M128)

AddPs: Add packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] + b[i+31:i]
ENDFOR

Instruction: 'ADDPS'. Intrinsic: '_mm_add_ps'. Requires SSE.

func AddSs

func AddSs(a x86.M128, b x86.M128) (dst x86.M128)

AddSs: Add the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] + b[31:0]
dst[127:32] := a[127:32]

Instruction: 'ADDSS'. Intrinsic: '_mm_add_ss'. Requires SSE.

func AndPs

func AndPs(a x86.M128, b x86.M128) (dst x86.M128)

AndPs: Compute the bitwise AND of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i] AND b[i+31:i])
ENDFOR

Instruction: 'ANDPS'. Intrinsic: '_mm_and_ps'. Requires SSE.

func AndnotPs

func AndnotPs(a x86.M128, b x86.M128) (dst x86.M128)

AndnotPs: Compute the bitwise AND NOT of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ((NOT a[i+31:i]) AND b[i+31:i])
ENDFOR

Instruction: 'ANDNPS'. Intrinsic: '_mm_andnot_ps'. Requires SSE.

func AsinPd

func AsinPd(a x86.M128d) (dst x86.M128d)

AsinPd: Compute the inverse sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ASIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asin_pd'. Requires SSE.

func AsinPs

func AsinPs(a x86.M128) (dst x86.M128)

AsinPs: Compute the inverse sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ASIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asin_ps'. Requires SSE.

func AsinhPd

func AsinhPd(a x86.M128d) (dst x86.M128d)

AsinhPd: Compute the inverse hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ASINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asinh_pd'. Requires SSE.

func AsinhPs

func AsinhPs(a x86.M128) (dst x86.M128)

AsinhPs: Compute the inverse hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ASINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_asinh_ps'. Requires SSE.

func Atan2Pd

func Atan2Pd(a x86.M128d, b x86.M128d) (dst x86.M128d)

Atan2Pd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan2_pd'. Requires SSE.

func Atan2Ps

func Atan2Ps(a x86.M128, b x86.M128) (dst x86.M128)

Atan2Ps: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' divided by packed elements in 'b', and store the results in 'dst' expressed in radians.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan2_ps'. Requires SSE.

func AtanPd

func AtanPd(a x86.M128d) (dst x86.M128d)

AtanPd: Compute the inverse tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan_pd'. Requires SSE.

func AtanPs

func AtanPs(a x86.M128) (dst x86.M128)

AtanPs: Compute the inverse tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atan_ps'. Requires SSE.

func AtanhPd

func AtanhPd(a x86.M128d) (dst x86.M128d)

AtanhPd: Compute the inverse hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ATANH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atanh_pd'. Requires SSE.

func AtanhPs

func AtanhPs(a x86.M128) (dst x86.M128)

AtanhPs: Compute the inverse hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ATANH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_atanh_ps'. Requires SSE.

func AvgPu16

func AvgPu16(a x86.M64, b x86.M64) (dst x86.M64)

AvgPu16: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGW'. Intrinsic: '_mm_avg_pu16'. Requires SSE.

func AvgPu8

func AvgPu8(a x86.M64, b x86.M64) (dst x86.M64)

AvgPu8: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGB'. Intrinsic: '_mm_avg_pu8'. Requires SSE.

func CbrtPd

func CbrtPd(a x86.M128d) (dst x86.M128d)

CbrtPd: Compute the cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cbrt_pd'. Requires SSE.

func CbrtPs

func CbrtPs(a x86.M128) (dst x86.M128)

CbrtPs: Compute the cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cbrt_ps'. Requires SSE.

func CdfnormPd

func CdfnormPd(a x86.M128d) (dst x86.M128d)

CdfnormPd: Compute the cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorm_pd'. Requires SSE.

func CdfnormPs

func CdfnormPs(a x86.M128) (dst x86.M128)

CdfnormPs: Compute the cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorm_ps'. Requires SSE.

func CdfnorminvPd

func CdfnorminvPd(a x86.M128d) (dst x86.M128d)

CdfnorminvPd: Compute the inverse cumulative distribution function of packed double-precision (64-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InverseCDFNormal(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorminv_pd'. Requires SSE.

func CdfnorminvPs

func CdfnorminvPs(a x86.M128) (dst x86.M128)

CdfnorminvPs: Compute the inverse cumulative distribution function of packed single-precision (32-bit) floating-point elements in 'a' using the normal distribution, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InverseCDFNormal(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cdfnorminv_ps'. Requires SSE.

func CexpPs

func CexpPs(a x86.M128) (dst x86.M128)

CexpPs: Compute the exponential value of 'e' raised to the power of packed complex single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cexp_ps'. Requires SSE.

func ClogPs

func ClogPs(a x86.M128) (dst x86.M128)

ClogPs: Compute the natural logarithm of packed complex single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_clog_ps'. Requires SSE.

func CmpeqPs

func CmpeqPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpeqPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for equality, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpeq_ps'. Requires SSE.

func CmpeqSs

func CmpeqSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpeqSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for equality, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] == b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpeq_ss'. Requires SSE.

func CmpgePs

func CmpgePs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpge_ps'. Requires SSE.

func CmpgeSs

func CmpgeSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgeSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] >= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpge_ss'. Requires SSE.

func CmpgtPs

func CmpgtPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgtPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpgt_ps'. Requires SSE.

func CmpgtSs

func CmpgtSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpgtSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for greater-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] > b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpgt_ss'. Requires SSE.

func CmplePs

func CmplePs(a x86.M128, b x86.M128) (dst x86.M128)

CmplePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmple_ps'. Requires SSE.

func CmpleSs

func CmpleSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpleSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] <= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmple_ss'. Requires SSE.

func CmpltPs

func CmpltPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpltPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmplt_ps'. Requires SSE.

func CmpltSs

func CmpltSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpltSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for less-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] < b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmplt_ss'. Requires SSE.

func CmpneqPs

func CmpneqPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpneqPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] != b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpneq_ps'. Requires SSE.

func CmpneqSs

func CmpneqSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpneqSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] != b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpneq_ss'. Requires SSE.

func CmpngePs

func CmpngePs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] >= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpnge_ps'. Requires SSE.

func CmpngeSs

func CmpngeSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngeSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] >= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpnge_ss'. Requires SSE.

func CmpngtPs

func CmpngtPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngtPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] > b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpngt_ps'. Requires SSE.

func CmpngtSs

func CmpngtSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpngtSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-greater-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] > b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpngt_ss'. Requires SSE.

func CmpnlePs

func CmpnlePs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnlePs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] <= b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpnle_ps'. Requires SSE.

func CmpnleSs

func CmpnleSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnleSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than-or-equal, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] <= b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpnle_ss'. Requires SSE.

func CmpnltPs

func CmpnltPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnltPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := !( a[i+31:i] < b[i+31:i] ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpnlt_ps'. Requires SSE.

func CmpnltSs

func CmpnltSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpnltSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' for not-less-than, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := !( a[31:0] < b[31:0] ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpnlt_ss'. Requires SSE.

func CmpordPs

func CmpordPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpordPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] != NaN AND b[i+31:i] != NaN ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpord_ps'. Requires SSE.

func CmpordSs

func CmpordSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpordSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' to see if neither is NaN, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] != NaN AND b[31:0] != NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpord_ss'. Requires SSE.

func CmpunordPs

func CmpunordPs(a x86.M128, b x86.M128) (dst x86.M128)

CmpunordPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b' to see if either is NaN, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ( a[i+31:i] == NaN OR b[i+31:i] == NaN ) ? 0xffffffff : 0
ENDFOR

Instruction: 'CMPPS'. Intrinsic: '_mm_cmpunord_ps'. Requires SSE.

func CmpunordSs

func CmpunordSs(a x86.M128, b x86.M128) (dst x86.M128)

CmpunordSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b' to see if either is NaN, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := ( a[31:0] == NaN OR b[31:0] == NaN ) ? 0xffffffff : 0
dst[127:32] := a[127:32]

Instruction: 'CMPSS'. Intrinsic: '_mm_cmpunord_ss'. Requires SSE.

func ComieqSs

func ComieqSs(a x86.M128, b x86.M128) int

ComieqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for equality, and return the boolean result (0 or 1).

RETURN ( a[31:0] == b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comieq_ss'. Requires SSE.

func ComigeSs

func ComigeSs(a x86.M128, b x86.M128) int

ComigeSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than-or-equal, and return the boolean result (0 or 1).

RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comige_ss'. Requires SSE.

func ComigtSs

func ComigtSs(a x86.M128, b x86.M128) int

ComigtSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than, and return the boolean result (0 or 1).

RETURN ( a[31:0] > b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comigt_ss'. Requires SSE.

func ComileSs

func ComileSs(a x86.M128, b x86.M128) int

ComileSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than-or-equal, and return the boolean result (0 or 1).

RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comile_ss'. Requires SSE.

func ComiltSs

func ComiltSs(a x86.M128, b x86.M128) int

ComiltSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than, and return the boolean result (0 or 1).

RETURN ( a[31:0] < b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comilt_ss'. Requires SSE.

func ComineqSs

func ComineqSs(a x86.M128, b x86.M128) int

ComineqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for not-equal, and return the boolean result (0 or 1).

RETURN ( a[31:0] != b[31:0] ) ? 1 : 0

Instruction: 'COMISS'. Intrinsic: '_mm_comineq_ss'. Requires SSE.

func CosPd

func CosPd(a x86.M128d) (dst x86.M128d)

CosPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cos_pd'. Requires SSE.

func CosPs

func CosPs(a x86.M128) (dst x86.M128)

CosPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cos_ps'. Requires SSE.

func CosdPd

func CosdPd(a x86.M128d) (dst x86.M128d)

CosdPd: Compute the cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COSD(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosd_pd'. Requires SSE.

func CosdPs

func CosdPs(a x86.M128) (dst x86.M128)

CosdPs: Compute the cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COSD(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosd_ps'. Requires SSE.

func CoshPd

func CoshPd(a x86.M128d) (dst x86.M128d)

CoshPd: Compute the hyperbolic cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := COSH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosh_pd'. Requires SSE.

func CoshPs

func CoshPs(a x86.M128) (dst x86.M128)

CoshPs: Compute the hyperbolic cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := COSH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_cosh_ps'. Requires SSE.

func CsqrtPs

func CsqrtPs(a x86.M128) (dst x86.M128)

CsqrtPs: Compute the square root of packed complex single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_csqrt_ps'. Requires SSE.

func CvtPi2ps

func CvtPi2ps(a x86.M128, b x86.M64) (dst x86.M128)

CvtPi2ps: Convert packed 32-bit integers in 'b' to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of 'dst', and copy the upper 2 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'CVTPI2PS'. Intrinsic: '_mm_cvt_pi2ps'. Requires SSE.

func CvtPs2pi

func CvtPs2pi(a x86.M128) (dst x86.M64)

CvtPs2pi: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR

Instruction: 'CVTPS2PI'. Intrinsic: '_mm_cvt_ps2pi'. Requires SSE.

func CvtSi2ss

func CvtSi2ss(a x86.M128, b int) (dst x86.M128)

CvtSi2ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]

Instruction: 'CVTSI2SS'. Intrinsic: '_mm_cvt_si2ss'. Requires SSE.

func CvtSs2si

func CvtSs2si(a x86.M128) int

CvtSs2si: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'CVTSS2SI'. Intrinsic: '_mm_cvt_ss2si'. Requires SSE.

func Cvtpi16Ps

func Cvtpi16Ps(a x86.M64) (dst x86.M128)

Cvtpi16Ps: Convert packed 16-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	m := j*32
	dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpi16_ps'. Requires SSE.

func Cvtpi32Ps

func Cvtpi32Ps(a x86.M128, b x86.M64) (dst x86.M128)

Cvtpi32Ps: Convert packed 32-bit integers in 'b' to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of 'dst', and copy the upper 2 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[63:32] := Convert_Int32_To_FP32(b[63:32])
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'CVTPI2PS'. Intrinsic: '_mm_cvtpi32_ps'. Requires SSE.

func Cvtpi32x2Ps

func Cvtpi32x2Ps(a x86.M64, b x86.M64) (dst x86.M128)

Cvtpi32x2Ps: Convert packed 32-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, store the results in the lower 2 elements of 'dst', then convert the packed 32-bit integers in 'b' to single-precision (32-bit) floating-point elements, and store the results in the upper 2 elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(a[31:0])
dst[63:32] := Convert_Int32_To_FP32(a[63:32])
dst[95:64] := Convert_Int32_To_FP32(b[31:0])
dst[127:96] := Convert_Int32_To_FP32(b[63:32])

Instruction: '...'. Intrinsic: '_mm_cvtpi32x2_ps'. Requires SSE.

func Cvtpi8Ps

func Cvtpi8Ps(a x86.M64) (dst x86.M128)

Cvtpi8Ps: Convert the lower packed 8-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*8
	m := j*32
	dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpi8_ps'. Requires SSE.

func CvtpsPi16

func CvtpsPi16(a x86.M128) (dst x86.M64)

CvtpsPi16: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 16-bit integers, and store the results in 'dst'.

FOR j := 0 to 3
	i := 16*j
	k := 32*j
	dst[i+15:i] := Convert_FP32_To_Int16(a[k+31:k])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtps_pi16'. Requires SSE.

func CvtpsPi32

func CvtpsPi32(a x86.M128) (dst x86.M64)

CvtpsPi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
ENDFOR

Instruction: 'CVTPS2PI'. Intrinsic: '_mm_cvtps_pi32'. Requires SSE.

func CvtpsPi8

func CvtpsPi8(a x86.M128) (dst x86.M64)

CvtpsPi8: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 8-bit integers, and store the results in the lower 4 elements of 'dst'.

FOR j := 0 to 3
	i := 8*j
	k := 32*j
	dst[i+7:i] := Convert_FP32_To_Int8(a[k+31:k])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtps_pi8'. Requires SSE.

func Cvtpu16Ps

func Cvtpu16Ps(a x86.M64) (dst x86.M128)

Cvtpu16Ps: Convert packed unsigned 16-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	m := j*32
	dst[m+31:m] := Convert_UnsignedInt16_To_FP32(a[i+15:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpu16_ps'. Requires SSE.

func Cvtpu8Ps

func Cvtpu8Ps(a x86.M64) (dst x86.M128)

Cvtpu8Ps: Convert the lower packed unsigned 8-bit integers in 'a' to packed single-precision (32-bit) floating-point elements, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*8
	m := j*32
	dst[m+31:m] := Convert_UnsignedInt8_To_FP32(a[i+7:i])
ENDFOR

Instruction: '...'. Intrinsic: '_mm_cvtpu8_ps'. Requires SSE.

func Cvtsi32Ss

func Cvtsi32Ss(a x86.M128, b int) (dst x86.M128)

Cvtsi32Ss: Convert the 32-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int32_To_FP32(b[31:0])
dst[127:32] := a[127:32]

Instruction: 'CVTSI2SS'. Intrinsic: '_mm_cvtsi32_ss'. Requires SSE.

func Cvtsi64Ss

func Cvtsi64Ss(a x86.M128, b int64) (dst x86.M128)

Cvtsi64Ss: Convert the 64-bit integer 'b' to a single-precision (32-bit) floating-point element, store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := Convert_Int64_To_FP32(b[63:0])
dst[127:32] := a[127:32]
dst[MAX:128] := 0

Instruction: 'CVTSI2SS'. Intrinsic: '_mm_cvtsi64_ss'. Requires SSE.

func CvtssF32

func CvtssF32(a x86.M128) float32

CvtssF32: Copy the lower single-precision (32-bit) floating-point element of 'a' to 'dst'.

dst[31:0] := a[31:0]

Instruction: 'MOVSS'. Intrinsic: '_mm_cvtss_f32'. Requires SSE.

func CvtssSi32

func CvtssSi32(a x86.M128) int

CvtssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32(a[31:0])

Instruction: 'CVTSS2SI'. Intrinsic: '_mm_cvtss_si32'. Requires SSE.

func CvtssSi64

func CvtssSi64(a x86.M128) int64

CvtssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64(a[31:0])

Instruction: 'CVTSS2SI'. Intrinsic: '_mm_cvtss_si64'. Requires SSE.

func CvttPs2pi

func CvttPs2pi(a x86.M128) (dst x86.M64)

CvttPs2pi: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR

Instruction: 'CVTTPS2PI'. Intrinsic: '_mm_cvtt_ps2pi'. Requires SSE.

func CvttSs2si

func CvttSs2si(a x86.M128) int

CvttSs2si: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'CVTTSS2SI'. Intrinsic: '_mm_cvtt_ss2si'. Requires SSE.

func CvttpsPi32

func CvttpsPi32(a x86.M128) (dst x86.M64)

CvttpsPi32: Convert packed single-precision (32-bit) floating-point elements in 'a' to packed 32-bit integers with truncation, and store the results in 'dst'.

FOR j := 0 to 1
	i := 32*j
	dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
ENDFOR

Instruction: 'CVTTPS2PI'. Intrinsic: '_mm_cvttps_pi32'. Requires SSE.

func CvttssSi32

func CvttssSi32(a x86.M128) int

CvttssSi32: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 32-bit integer with truncation, and store the result in 'dst'.

dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])

Instruction: 'CVTTSS2SI'. Intrinsic: '_mm_cvttss_si32'. Requires SSE.

func CvttssSi64

func CvttssSi64(a x86.M128) int64

CvttssSi64: Convert the lower single-precision (32-bit) floating-point element in 'a' to a 64-bit integer with truncation, and store the result in 'dst'.

dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])

Instruction: 'CVTTSS2SI'. Intrinsic: '_mm_cvttss_si64'. Requires SSE.

func DivEpi16

func DivEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi16'. Requires SSE.

func DivEpi32

func DivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi32'. Requires SSE.

func DivEpi64

func DivEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi64'. Requires SSE.

func DivEpi8

func DivEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epi8'. Requires SSE.

func DivEpu16

func DivEpu16(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := TRUNCATE(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu16'. Requires SSE.

func DivEpu32

func DivEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu32'. Requires SSE.

func DivEpu64

func DivEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := TRUNCATE(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu64'. Requires SSE.

func DivEpu8

func DivEpu8(a x86.M128i, b x86.M128i) (dst x86.M128i)

DivEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := TRUNCATE(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_div_epu8'. Requires SSE.

func DivPs

func DivPs(a x86.M128, b x86.M128) (dst x86.M128)

DivPs: Divide packed single-precision (32-bit) floating-point elements in 'a' by packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := a[i+31:i] / b[i+31:i]
ENDFOR

Instruction: 'DIVPS'. Intrinsic: '_mm_div_ps'. Requires SSE.

func DivSs

func DivSs(a x86.M128, b x86.M128) (dst x86.M128)

DivSs: Divide the lower single-precision (32-bit) floating-point element in 'a' by the lower single-precision (32-bit) floating-point element in 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] / b[31:0]
dst[127:32] := a[127:32]

Instruction: 'DIVSS'. Intrinsic: '_mm_div_ss'. Requires SSE.

func ErfPd

func ErfPd(a x86.M128d) (dst x86.M128d)

ErfPd: Compute the error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erf_pd'. Requires SSE.

func ErfPs

func ErfPs(a x86.M128) (dst x86.M128)

ErfPs: Compute the error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erf_ps'. Requires SSE.

func ErfcPd

func ErfcPd(a x86.M128d) (dst x86.M128d)

ErfcPd: Compute the complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 1.0 - ERF(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfc_pd'. Requires SSE.

func ErfcPs

func ErfcPs(a x86.M128) (dst x86.M128)

ErfcPs: Compute the complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 1.0 - ERF(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfc_ps'. Requires SSE.

func ErfcinvPd

func ErfcinvPd(a x86.M128d) (dst x86.M128d)

ErfcinvPd: Compute the inverse complementary error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ERFCINV(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfcinv_pd'. Requires SSE.

func ErfcinvPs

func ErfcinvPs(a x86.M128) (dst x86.M128)

ErfcinvPs: Compute the inverse complementary error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ERFCINV(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfcinv_ps'. Requires SSE.

func ErfinvPd

func ErfinvPd(a x86.M128d) (dst x86.M128d)

ErfinvPd: Compute the inverse error function of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ERFINV(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfinv_pd'. Requires SSE.

func ErfinvPs

func ErfinvPs(a x86.M128) (dst x86.M128)

ErfinvPs: Compute the inverse error function of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ERFINV(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_erfinv_ps'. Requires SSE.

func Exp10Pd

func Exp10Pd(a x86.M128d) (dst x86.M128d)

Exp10Pd: Compute the exponential value of 10 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 10^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp10_pd'. Requires SSE.

func Exp10Ps

func Exp10Ps(a x86.M128) (dst x86.M128)

Exp10Ps: Compute the exponential value of 10 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 10^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp10_ps'. Requires SSE.

func Exp2Pd

func Exp2Pd(a x86.M128d) (dst x86.M128d)

Exp2Pd: Compute the exponential value of 2 raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := 2^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp2_pd'. Requires SSE.

func Exp2Ps

func Exp2Ps(a x86.M128) (dst x86.M128)

Exp2Ps: Compute the exponential value of 2 raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := 2^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp2_ps'. Requires SSE.

func ExpPd

func ExpPd(a x86.M128d) (dst x86.M128d)

ExpPd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := e^(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp_pd'. Requires SSE.

func ExpPs

func ExpPs(a x86.M128) (dst x86.M128)

ExpPs: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_exp_ps'. Requires SSE.

func Expm1Pd

func Expm1Pd(a x86.M128d) (dst x86.M128d)

Expm1Pd: Compute the exponential value of 'e' raised to the power of packed double-precision (64-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := e^(a[i+63:i]) - 1.0
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_expm1_pd'. Requires SSE.

func Expm1Ps

func Expm1Ps(a x86.M128) (dst x86.M128)

Expm1Ps: Compute the exponential value of 'e' raised to the power of packed single-precision (32-bit) floating-point elements in 'a', subtract one from each element, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := e^(a[i+31:i]) - 1.0
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_expm1_ps'. Requires SSE.

func ExtractPi16

func ExtractPi16(a x86.M64, imm8 byte) int

ExtractPi16: Extract a 16-bit integer from 'a', selected with 'imm8', and store the result in the lower element of 'dst'.

dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0]
dst[31:16] := 0

Instruction: 'PEXTRW'. Intrinsic: '_mm_extract_pi16'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func Getcsr

func Getcsr() uint32

Getcsr: Get the unsigned 32-bit value of the MXCSR control and status register.

dst[31:0] := MXCSR

Instruction: 'STMXCSR'. Intrinsic: '_mm_getcsr'. Requires SSE.

func HypotPd

func HypotPd(a x86.M128d, b x86.M128d) (dst x86.M128d)

HypotPd: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed double-precision (64-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i]^2 + b[i+63:i]^2)
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_hypot_pd'. Requires SSE.

func HypotPs

func HypotPs(a x86.M128, b x86.M128) (dst x86.M128)

HypotPs: Compute the length of the hypotenuse of a right triangle, with the lengths of the other two sides of the triangle stored as packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i]^2 + b[i+31:i]^2)
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_hypot_ps'. Requires SSE.

func IdivEpi32

func IdivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

IdivEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_idiv_epi32'. Requires SSE.

func IdivremEpi32

func IdivremEpi32(mem_addr *x86.M128i, a x86.M128i, b x86.M128i) (dst x86.M128i)

IdivremEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', store the truncated results in 'dst', and store the remainders as packed 32-bit integers into memory at 'mem_addr'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_idivrem_epi32'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func InsertPi16

func InsertPi16(a x86.M64, i int, imm8 byte) (dst x86.M64)

InsertPi16: Copy 'a' to 'dst', and insert the 16-bit integer 'i' into 'dst' at the location specified by 'imm8'.

dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]

Instruction: 'PINSRW'. Intrinsic: '_mm_insert_pi16'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func InvcbrtPd

func InvcbrtPd(a x86.M128d) (dst x86.M128d)

InvcbrtPd: Compute the inverse cube root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InvCubeRoot(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invcbrt_pd'. Requires SSE.

func InvcbrtPs

func InvcbrtPs(a x86.M128) (dst x86.M128)

InvcbrtPs: Compute the inverse cube root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InvCubeRoot(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invcbrt_ps'. Requires SSE.

func InvsqrtPd

func InvsqrtPd(a x86.M128d) (dst x86.M128d)

InvsqrtPd: Compute the inverse square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := InvSQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invsqrt_pd'. Requires SSE.

func InvsqrtPs

func InvsqrtPs(a x86.M128) (dst x86.M128)

InvsqrtPs: Compute the inverse square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := InvSQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_invsqrt_ps'. Requires SSE.

func IremEpi32

func IremEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

IremEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_irem_epi32'. Requires SSE.

func LoadhPi

func LoadhPi(a x86.M128, mem_addr *x86.M64Const) (dst x86.M128)

LoadhPi: Load 2 single-precision (32-bit) floating-point elements from memory into the upper 2 elements of 'dst', and copy the lower 2 elements from 'a' to 'dst'. 'mem_addr' does not need to be aligned on any particular boundary.

dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := MEM[mem_addr+31:mem_addr]
dst[127:96] := MEM[mem_addr+63:mem_addr+32]

Instruction: 'MOVHPS'. Intrinsic: '_mm_loadh_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func LoadlPi

func LoadlPi(a x86.M128, mem_addr *x86.M64Const) (dst x86.M128)

LoadlPi: Load 2 single-precision (32-bit) floating-point elements from memory into the lower 2 elements of 'dst', and copy the upper 2 elements from 'a' to 'dst'. 'mem_addr' does not need to be aligned on any particular boundary.

dst[31:0] := MEM[mem_addr+31:mem_addr]
dst[63:32] := MEM[mem_addr+63:mem_addr+32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'MOVLPS'. Intrinsic: '_mm_loadl_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func Log10Pd

func Log10Pd(a x86.M128d) (dst x86.M128d)

Log10Pd: Compute the base-10 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := log10(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log10_pd'. Requires SSE.

func Log10Ps

func Log10Ps(a x86.M128) (dst x86.M128)

Log10Ps: Compute the base-10 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := log10(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log10_ps'. Requires SSE.

func Log1pPd

func Log1pPd(a x86.M128d) (dst x86.M128d)

Log1pPd: Compute the natural logarithm of one plus packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ln(1.0 + a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log1p_pd'. Requires SSE.

func Log1pPs

func Log1pPs(a x86.M128) (dst x86.M128)

Log1pPs: Compute the natural logarithm of one plus packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(1.0 + a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log1p_ps'. Requires SSE.

func Log2Pd

func Log2Pd(a x86.M128d) (dst x86.M128d)

Log2Pd: Compute the base-2 logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := log2(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log2_pd'. Requires SSE.

func Log2Ps

func Log2Ps(a x86.M128) (dst x86.M128)

Log2Ps: Compute the base-2 logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := log2(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log2_ps'. Requires SSE.

func LogPd

func LogPd(a x86.M128d) (dst x86.M128d)

LogPd: Compute the natural logarithm of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ln(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log_pd'. Requires SSE.

func LogPs

func LogPs(a x86.M128) (dst x86.M128)

LogPs: Compute the natural logarithm of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ln(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_log_ps'. Requires SSE.

func LogbPd

func LogbPd(a x86.M128d) (dst x86.M128d)

LogbPd: Convert the exponent of each packed double-precision (64-bit) floating-point element in 'a' to a double-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ConvertExpFP64(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_logb_pd'. Requires SSE.

func LogbPs

func LogbPs(a x86.M128) (dst x86.M128)

LogbPs: Convert the exponent of each packed single-precision (32-bit) floating-point element in 'a' to a single-precision floating-point number representing the integer exponent, and store the results in 'dst'. This intrinsic essentially calculates 'floor(log2(x))' for each element.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ConvertExpFP32(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_logb_ps'. Requires SSE.

func MMGETEXCEPTIONMASK

func MMGETEXCEPTIONMASK() uint32

MMGETEXCEPTIONMASK: Macro: Get the exception mask bits from the MXCSR control and status register. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT

dst[31:0] := MXCSR & _MM_MASK_MASK

Instruction: ''. Intrinsic: '_MM_GET_EXCEPTION_MASK'. Requires SSE.

func MMGETEXCEPTIONSTATE

func MMGETEXCEPTIONSTATE() uint32

MMGETEXCEPTIONSTATE: Macro: Get the exception state bits from the MXCSR control and status register. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT

dst[31:0] := MXCSR & _MM_EXCEPT_MASK

Instruction: ''. Intrinsic: '_MM_GET_EXCEPTION_STATE'. Requires SSE.

func MMGETFLUSHZEROMODE

func MMGETFLUSHZEROMODE() uint32

MMGETFLUSHZEROMODE: Macro: Get the flush zero bits from the MXCSR control and status register. The flush zero mode is one of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF

dst[31:0] := MXCSR & _MM_FLUSH_MASK

Instruction: ''. Intrinsic: '_MM_GET_FLUSH_ZERO_MODE'. Requires SSE.

func MMGETROUNDINGMODE

func MMGETROUNDINGMODE() uint32

MMGETROUNDINGMODE: Macro: Get the rounding mode bits from the MXCSR control and status register. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO

dst[31:0] := MXCSR & _MM_ROUND_MASK

Instruction: ''. Intrinsic: '_MM_GET_ROUNDING_MODE'. Requires SSE.

func MMSETEXCEPTIONMASK

func MMSETEXCEPTIONMASK(a uint32)

MMSETEXCEPTIONMASK: Macro: Set the exception mask bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The exception mask may contain any of the following flags: _MM_MASK_INVALID, _MM_MASK_DIV_ZERO, _MM_MASK_DENORM, _MM_MASK_OVERFLOW, _MM_MASK_UNDERFLOW, _MM_MASK_INEXACT

MXCSR := (MXCSR AND ~_MM_MASK_MASK) OR a[31:0]

Instruction: ''. Intrinsic: '_MM_SET_EXCEPTION_MASK'. Requires SSE.

func MMSETEXCEPTIONSTATE

func MMSETEXCEPTIONSTATE(a uint32)

MMSETEXCEPTIONSTATE: Macro: Set the exception state bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The exception state may contain any of the following flags: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, _MM_EXCEPT_INEXACT

MXCSR := (MXCSR AND ~_MM_EXCEPT_MASK) OR a[31:0]

Instruction: ''. Intrinsic: '_MM_SET_EXCEPTION_STATE'. Requires SSE.

func MMSETFLUSHZEROMODE

func MMSETFLUSHZEROMODE(a uint32)

MMSETFLUSHZEROMODE: Macro: Set the flush zero bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The flush zero mode is one of the following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF

MXCSR := (MXCSR AND ~_MM_FLUSH_MASK) OR a[31:0]

Instruction: ''. Intrinsic: '_MM_SET_FLUSH_ZERO_MODE'. Requires SSE.

func MMSETROUNDINGMODE

func MMSETROUNDINGMODE(a uint32)

MMSETROUNDINGMODE: Macro: Set the rounding mode bits of the MXCSR control and status register to the value in unsigned 32-bit integer 'a'. The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO

MXCSR := (MXCSR AND ~_MM_ROUND_MASK) OR a[31:0]

Instruction: ''. Intrinsic: '_MM_SET_ROUNDING_MODE'. Requires SSE.

func MMTRANSPOSE4PS

func MMTRANSPOSE4PS(row0 x86.M128, row1 x86.M128, row2 x86.M128, row3 x86.M128)

MMTRANSPOSE4PS: Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision (32-bit) floating-point elements in 'row0', 'row1', 'row2', and 'row3', and store the transposed matrix in these vectors ('row0' now contains column 0, etc.).

__m128 tmp3, tmp2, tmp1, tmp0;
tmp0 = _mm_unpacklo_ps(row0, row1);
tmp2 = _mm_unpacklo_ps(row2, row3);
tmp1 = _mm_unpackhi_ps(row0, row1);
tmp3 = _mm_unpackhi_ps(row2, row3);
row0 = _mm_movelh_ps(tmp0, tmp2);
row1 = _mm_movehl_ps(tmp2, tmp0);
row2 = _mm_movelh_ps(tmp1, tmp3);
row3 = _mm_movehl_ps(tmp3, tmp1);

Instruction: '...'. Intrinsic: '_MM_TRANSPOSE4_PS'. Requires SSE.

func MaskmoveSi64

func MaskmoveSi64(a x86.M64, mask x86.M64, mem_addr *byte)

MaskmoveSi64: Conditionally store 8-bit integer elements from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element) and a non-temporal memory hint.

FOR j := 0 to 7
	i := j*8
	IF mask[i+7]
		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
	FI
ENDFOR

Instruction: 'MASKMOVQ'. Intrinsic: '_mm_maskmove_si64'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func Maskmovq

func Maskmovq(a x86.M64, mask x86.M64, mem_addr *byte)

Maskmovq: Conditionally store 8-bit integer elements from 'a' into memory using 'mask' (elements are not stored when the highest bit is not set in the corresponding element).

FOR j := 0 to 7
	i := j*8
	IF mask[i+7]
		MEM[mem_addr+i+7:mem_addr+i] := a[i+7:i]
	FI
ENDFOR

Instruction: 'MASKMOVQ'. Intrinsic: '_m_maskmovq'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func MaxPi16

func MaxPi16(a x86.M64, b x86.M64) (dst x86.M64)

MaxPi16: Compare packed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMAXSW'. Intrinsic: '_mm_max_pi16'. Requires SSE.

func MaxPs

func MaxPs(a x86.M128, b x86.M128) (dst x86.M128)

MaxPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MAX(a[i+31:i], b[i+31:i])
ENDFOR

Instruction: 'MAXPS'. Intrinsic: '_mm_max_ps'. Requires SSE.

func MaxPu8

func MaxPu8(a x86.M64, b x86.M64) (dst x86.M64)

MaxPu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMAXUB'. Intrinsic: '_mm_max_pu8'. Requires SSE.

func MaxSs

func MaxSs(a x86.M128, b x86.M128) (dst x86.M128)

MaxSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the maximum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := MAX(a[31:0], b[31:0])
dst[127:32] := a[127:32]

Instruction: 'MAXSS'. Intrinsic: '_mm_max_ss'. Requires SSE.

func MinPi16

func MinPi16(a x86.M64, b x86.M64) (dst x86.M64)

MinPi16: Compare packed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMINSW'. Intrinsic: '_mm_min_pi16'. Requires SSE.

func MinPs

func MinPs(a x86.M128, b x86.M128) (dst x86.M128)

MinPs: Compare packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := MIN(a[i+31:i], b[i+31:i])
ENDFOR

Instruction: 'MINPS'. Intrinsic: '_mm_min_ps'. Requires SSE.

func MinPu8

func MinPu8(a x86.M64, b x86.M64) (dst x86.M64)

MinPu8: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMINUB'. Intrinsic: '_mm_min_pu8'. Requires SSE.

func MinSs

func MinSs(a x86.M128, b x86.M128) (dst x86.M128)

MinSs: Compare the lower single-precision (32-bit) floating-point elements in 'a' and 'b', store the minimum value in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := MIN(a[31:0], b[31:0])
dst[127:32] := a[127:32]

Instruction: 'MINSS'. Intrinsic: '_mm_min_ss'. Requires SSE.

func MoveSs

func MoveSs(a x86.M128, b x86.M128) (dst x86.M128)

MoveSs: Move the lower single-precision (32-bit) floating-point element from 'b' to the lower element of 'dst', and copy the upper 3 elements from 'a' to the upper elements of 'dst'.

dst[31:0] := b[31:0]
dst[63:32] := a[63:32]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'MOVSS'. Intrinsic: '_mm_move_ss'. Requires SSE.

func MovehlPs

func MovehlPs(a x86.M128, b x86.M128) (dst x86.M128)

MovehlPs: Move the upper 2 single-precision (32-bit) floating-point elements from 'b' to the lower 2 elements of 'dst', and copy the upper 2 elements from 'a' to the upper 2 elements of 'dst'.

dst[31:0] := b[95:64]
dst[63:32] := b[127:96]
dst[95:64] := a[95:64]
dst[127:96] := a[127:96]

Instruction: 'MOVHLPS'. Intrinsic: '_mm_movehl_ps'. Requires SSE.

func MovelhPs

func MovelhPs(a x86.M128, b x86.M128) (dst x86.M128)

MovelhPs: Move the lower 2 single-precision (32-bit) floating-point elements from 'b' to the upper 2 elements of 'dst', and copy the lower 2 elements from 'a' to the lower 2 elements of 'dst'.

dst[31:0] := a[31:0]
dst[63:32] := a[63:32]
dst[95:64] := b[31:0]
dst[127:96] := b[63:32]

Instruction: 'MOVLHPS'. Intrinsic: '_mm_movelh_ps'. Requires SSE.

func MovemaskPi8

func MovemaskPi8(a x86.M64) int

MovemaskPi8: Create mask from the most significant bit of each 8-bit element in 'a', and store the result in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0

Instruction: 'PMOVMSKB'. Intrinsic: '_mm_movemask_pi8'. Requires SSE.

func MovemaskPs

func MovemaskPs(a x86.M128) int

MovemaskPs: Set each bit of mask 'dst' based on the most significant bit of the corresponding packed single-precision (32-bit) floating-point element in 'a'.

FOR j := 0 to 3
	i := j*32
	IF a[i+31]
		dst[j] := 1
	ELSE
		dst[j] := 0
	FI
ENDFOR
dst[MAX:4] := 0

Instruction: 'MOVMSKPS'. Intrinsic: '_mm_movemask_ps'. Requires SSE.

func MulPs

func MulPs(a x86.M128, b x86.M128) (dst x86.M128)

MulPs: Multiply packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] * b[i+31:i]
ENDFOR

Instruction: 'MULPS'. Intrinsic: '_mm_mul_ps'. Requires SSE.

func MulSs

func MulSs(a x86.M128, b x86.M128) (dst x86.M128)

MulSs: Multiply the lower single-precision (32-bit) floating-point element in 'a' and 'b', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] * b[31:0]
dst[127:32] := a[127:32]

Instruction: 'MULSS'. Intrinsic: '_mm_mul_ss'. Requires SSE.

func MulhiPu16

func MulhiPu16(a x86.M64, b x86.M64) (dst x86.M64)

MulhiPu16: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR

Instruction: 'PMULHUW'. Intrinsic: '_mm_mulhi_pu16'. Requires SSE.

func OrPs

func OrPs(a x86.M128, b x86.M128) (dst x86.M128)

OrPs: Compute the bitwise OR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] BITWISE OR b[i+31:i]
ENDFOR

Instruction: 'ORPS'. Intrinsic: '_mm_or_ps'. Requires SSE.

func Pavgb

func Pavgb(a x86.M64, b x86.M64) (dst x86.M64)

Pavgb: Average packed unsigned 8-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGB'. Intrinsic: '_m_pavgb'. Requires SSE.

func Pavgw

func Pavgw(a x86.M64, b x86.M64) (dst x86.M64)

Pavgw: Average packed unsigned 16-bit integers in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*16
	dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
ENDFOR

Instruction: 'PAVGW'. Intrinsic: '_m_pavgw'. Requires SSE.

func Pextrw

func Pextrw(a x86.M64, imm8 byte) int

Pextrw: Extract a 16-bit integer from 'a', selected with 'imm8', and store the result in the lower element of 'dst'.

dst[15:0] := (a[63:0] >> (imm8[1:0] * 16))[15:0]
dst[31:16] := 0

Instruction: 'PEXTRW'. Intrinsic: '_m_pextrw'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func Pinsrw

func Pinsrw(a x86.M64, i int, imm8 byte) (dst x86.M64)

Pinsrw: Copy 'a' to 'dst', and insert the 16-bit integer 'i' into 'dst' at the location specified by 'imm8'.

dst[63:0] := a[63:0]
sel := imm8[1:0]*16
dst[sel+15:sel] := i[15:0]

Instruction: 'PINSRW'. Intrinsic: '_m_pinsrw'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func Pmaxsw

func Pmaxsw(a x86.M64, b x86.M64) (dst x86.M64)

Pmaxsw: Compare packed signed 16-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] > b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMAXSW'. Intrinsic: '_m_pmaxsw'. Requires SSE.

func Pmaxub

func Pmaxub(a x86.M64, b x86.M64) (dst x86.M64)

Pmaxub: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed maximum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] > b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMAXUB'. Intrinsic: '_m_pmaxub'. Requires SSE.

func Pminsw

func Pminsw(a x86.M64, b x86.M64) (dst x86.M64)

Pminsw: Compare packed signed 16-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 3
	i := j*16
	IF a[i+15:i] < b[i+15:i]
		dst[i+15:i] := a[i+15:i]
	ELSE
		dst[i+15:i] := b[i+15:i]
	FI
ENDFOR

Instruction: 'PMINSW'. Intrinsic: '_m_pminsw'. Requires SSE.

func Pminub

func Pminub(a x86.M64, b x86.M64) (dst x86.M64)

Pminub: Compare packed unsigned 8-bit integers in 'a' and 'b', and store packed minimum values in 'dst'.

FOR j := 0 to 7
	i := j*8
	IF a[i+7:i] < b[i+7:i]
		dst[i+7:i] := a[i+7:i]
	ELSE
		dst[i+7:i] := b[i+7:i]
	FI
ENDFOR

Instruction: 'PMINUB'. Intrinsic: '_m_pminub'. Requires SSE.

func Pmovmskb

func Pmovmskb(a x86.M64) int

Pmovmskb: Create mask from the most significant bit of each 8-bit element in 'a', and store the result in 'dst'.

FOR j := 0 to 7
	i := j*8
	dst[j] := a[i+7]
ENDFOR
dst[MAX:8] := 0

Instruction: 'PMOVMSKB'. Intrinsic: '_m_pmovmskb'. Requires SSE.

func Pmulhuw

func Pmulhuw(a x86.M64, b x86.M64) (dst x86.M64)

Pmulhuw: Multiply the packed unsigned 16-bit integers in 'a' and 'b', producing intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in 'dst'.

FOR j := 0 to 3
	i := j*16
	tmp[31:0] := a[i+15:i] * b[i+15:i]
	dst[i+15:i] := tmp[31:16]
ENDFOR

Instruction: 'PMULHUW'. Intrinsic: '_m_pmulhuw'. Requires SSE.

func PowPd

func PowPd(a x86.M128d, b x86.M128d) (dst x86.M128d)

PowPd: Compute the exponential value of packed double-precision (64-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := (a[i+63:i])^(b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_pow_pd'. Requires SSE.

func PowPs

func PowPs(a x86.M128, b x86.M128) (dst x86.M128)

PowPs: Compute the exponential value of packed single-precision (32-bit) floating-point elements in 'a' raised to the power of the corresponding packed elements in 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := (a[i+31:i])^(b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_pow_ps'. Requires SSE.

func Prefetch

func Prefetch(p *byte, i int)

Prefetch: Fetch the line of data from memory that contains address 'p' to a location in the cache hierarchy specified by the locality hint 'i'.

Instruction: 'PREFETCHNTA, PREFETCHT0, PREFETCHT1, PREFETCHT2'. Intrinsic: '_mm_prefetch'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func Psadbw

func Psadbw(a x86.M64, b x86.M64) (dst x86.M64)

Psadbw: Compute the absolute differences of packed unsigned 8-bit integers in 'a' and 'b', then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store it in the low 16 bits of 'dst'. The upper 48 bits of 'dst' are set to zero.

FOR j := 0 to 7
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR

dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0

Instruction: 'PSADBW'. Intrinsic: '_m_psadbw'. Requires SSE.

func Pshufw

func Pshufw(a x86.M64, imm8 byte) (dst x86.M64)

Pshufw: Shuffle 16-bit integers in 'a' using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[15:0] := src[15:0]
	1:	tmp[15:0] := src[31:16]
	2:	tmp[15:0] := src[47:32]
	3:	tmp[15:0] := src[63:48]
	ESAC
	RETURN tmp[15:0]
}

dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])

Instruction: 'PSHUFW'. Intrinsic: '_m_pshufw'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func RcpPs

func RcpPs(a x86.M128) (dst x86.M128)

RcpPs: Compute the approximate reciprocal of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0/a[i+31:i])
ENDFOR

Instruction: 'RCPPS'. Intrinsic: '_mm_rcp_ps'. Requires SSE.

func RcpSs

func RcpSs(a x86.M128) (dst x86.M128)

RcpSs: Compute the approximate reciprocal of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

dst[31:0] := APPROXIMATE(1.0/a[31:0])
dst[127:32] := a[127:32]

Instruction: 'RCPSS'. Intrinsic: '_mm_rcp_ss'. Requires SSE.

func RemEpi16

func RemEpi16(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi16: Divide packed 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 16-bit integers in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi16'. Requires SSE.

func RemEpi32

func RemEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi32: Divide packed 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi32'. Requires SSE.

func RemEpi64

func RemEpi64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi64: Divide packed 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 64-bit integers in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi64'. Requires SSE.

func RemEpi8

func RemEpi8(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpi8: Divide packed 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed 8-bit integers in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epi8'. Requires SSE.

func RemEpu16

func RemEpu16(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu16: Divide packed unsigned 16-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 16-bit integers in 'dst'.

FOR j := 0 to 7
	i := 16*j
	dst[i+15:i] := REMAINDER(a[i+15:i] / b[i+15:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu16'. Requires SSE.

func RemEpu32

func RemEpu32(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu32'. Requires SSE.

func RemEpu64

func RemEpu64(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu64: Divide packed unsigned 64-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 64-bit integers in 'dst'.

FOR j := 0 to 1
	i := 64*j
	dst[i+63:i] := REMAINDER(a[i+63:i] / b[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu64'. Requires SSE.

func RemEpu8

func RemEpu8(a x86.M128i, b x86.M128i) (dst x86.M128i)

RemEpu8: Divide packed unsigned 8-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 8-bit integers in 'dst'.

FOR j := 0 to 15
	i := 8*j
	dst[i+7:i] := REMAINDER(a[i+7:i] / b[i+7:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_rem_epu8'. Requires SSE.

func RsqrtPs

func RsqrtPs(a x86.M128) (dst x86.M128)

RsqrtPs: Compute the approximate reciprocal square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := APPROXIMATE(1.0 / SQRT(a[i+31:i]))
ENDFOR

Instruction: 'RSQRTPS'. Intrinsic: '_mm_rsqrt_ps'. Requires SSE.

func RsqrtSs

func RsqrtSs(a x86.M128) (dst x86.M128)

RsqrtSs: Compute the approximate reciprocal square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'. The maximum relative error for this approximation is less than 1.5*2^-12.

dst[31:0] := APPROXIMATE(1.0 / SQRT(a[31:0]))
dst[127:32] := a[127:32]

Instruction: 'RSQRTSS'. Intrinsic: '_mm_rsqrt_ss'. Requires SSE.

func SadPu8

func SadPu8(a x86.M64, b x86.M64) (dst x86.M64)

SadPu8: Compute the absolute differences of packed unsigned 8-bit integers in 'a' and 'b', then horizontally sum the 8 differences to produce a single unsigned 16-bit integer, and store it in the low 16 bits of 'dst'. The upper 48 bits of 'dst' are set to zero.

FOR j := 0 to 7
	i := j*8
	tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
ENDFOR

dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + tmp[47:40] + tmp[55:48] + tmp[63:56]
dst[63:16] := 0

Instruction: 'PSADBW'. Intrinsic: '_mm_sad_pu8'. Requires SSE.

func Set1Ps

func Set1Ps(a float32) (dst x86.M128)

Set1Ps: Broadcast single-precision (32-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR

Instruction: '...'. Intrinsic: '_mm_set1_ps'. Requires SSE.

func SetPs

func SetPs(e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M128)

SetPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values.

dst[31:0] := e0
dst[63:32] := e1
dst[95:64] := e2
dst[127:96] := e3

Instruction: '...'. Intrinsic: '_mm_set_ps'. Requires SSE.

func SetPs1

func SetPs1(a float32) (dst x86.M128)

SetPs1: Broadcast single-precision (32-bit) floating-point value 'a' to all elements of 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[31:0]
ENDFOR

Instruction: '...'. Intrinsic: '_mm_set_ps1'. Requires SSE.

func SetSs

func SetSs(a float32) (dst x86.M128)

SetSs: Copy single-precision (32-bit) floating-point element 'a' to the lower element of 'dst', and zero the upper 3 elements.

dst[31:0] := a[31:0]
dst[127:32] := 0

Instruction: '...'. Intrinsic: '_mm_set_ss'. Requires SSE.

func Setcsr

func Setcsr(a uint32)

Setcsr: Set the MXCSR control and status register with the value in unsigned 32-bit integer 'a'.

MXCSR := a[31:0]

Instruction: 'LDMXCSR'. Intrinsic: '_mm_setcsr'. Requires SSE.

func SetrPs

func SetrPs(e3 float32, e2 float32, e1 float32, e0 float32) (dst x86.M128)

SetrPs: Set packed single-precision (32-bit) floating-point elements in 'dst' with the supplied values in reverse order.

dst[31:0] := e3
dst[63:32] := e2
dst[95:64] := e1
dst[127:96] := e0

Instruction: '...'. Intrinsic: '_mm_setr_ps'. Requires SSE.

func SetzeroPs

func SetzeroPs() (dst x86.M128)

SetzeroPs: Return vector of type __m128 with all elements set to zero.

dst[MAX:0] := 0

Instruction: 'XORPS'. Intrinsic: '_mm_setzero_ps'. Requires SSE.

func Sfence

func Sfence()

Sfence: Perform a serializing operation on all store-to-memory instructions that were issued prior to this instruction. Guarantees that every store instruction that precedes, in program order, is globally visible before any store instruction which follows the fence in program order.

Instruction: 'SFENCE'. Intrinsic: '_mm_sfence'. Requires SSE.

func ShufflePi16

func ShufflePi16(a x86.M64, imm8 byte) (dst x86.M64)

ShufflePi16: Shuffle 16-bit integers in 'a' using the control in 'imm8', and store the results in 'dst'.

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[15:0] := src[15:0]
	1:	tmp[15:0] := src[31:16]
	2:	tmp[15:0] := src[47:32]
	3:	tmp[15:0] := src[63:48]
	ESAC
	RETURN tmp[15:0]
}

dst[15:0] := SELECT4(a[63:0], imm8[1:0])
dst[31:16] := SELECT4(a[63:0], imm8[3:2])
dst[47:32] := SELECT4(a[63:0], imm8[5:4])
dst[63:48] := SELECT4(a[63:0], imm8[7:6])

Instruction: 'PSHUFW'. Intrinsic: '_mm_shuffle_pi16'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func ShufflePs

func ShufflePs(a x86.M128, b x86.M128, imm8 byte) (dst x86.M128)

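ShufflePs: Shuffle single-precision (32-bit) floating-point elements from 'a' and 'b' using the control in 'imm8', and store the results in 'dst'. The lower 2 elements of 'dst' are selected from 'a', and the upper 2 elements are selected from 'b'.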

SELECT4(src, control){
	CASE(control[1:0])
	0:	tmp[31:0] := src[31:0]
	1:	tmp[31:0] := src[63:32]
	2:	tmp[31:0] := src[95:64]
	3:	tmp[31:0] := src[127:96]
	ESAC
	RETURN tmp[31:0]
}

dst[31:0] := SELECT4(a[127:0], imm8[1:0])
dst[63:32] := SELECT4(a[127:0], imm8[3:2])
dst[95:64] := SELECT4(b[127:0], imm8[5:4])
dst[127:96] := SELECT4(b[127:0], imm8[7:6])

Instruction: 'SHUFPS'. Intrinsic: '_mm_shuffle_ps'. Requires SSE.

FIXME: Requires compiler support (has immediate)

func SinPd

func SinPd(a x86.M128d) (dst x86.M128d)

SinPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sin_pd'. Requires SSE.

func SinPs

func SinPs(a x86.M128) (dst x86.M128)

SinPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sin_ps'. Requires SSE.

func SincosPd

func SincosPd(mem_addr *x86.M128d, a x86.M128d) (dst x86.M128d)

SincosPd: Compute the sine and cosine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, store the sine in 'dst', and store the cosine into memory at 'mem_addr'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIN(a[i+63:i])
	MEM[mem_addr+i+63:mem_addr+i] := COS(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sincos_pd'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func SincosPs

func SincosPs(mem_addr *x86.M128, a x86.M128) (dst x86.M128)

SincosPs: Compute the sine and cosine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, store the sine in 'dst', and store the cosine into memory at 'mem_addr'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIN(a[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := COS(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sincos_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func SindPd

func SindPd(a x86.M128d) (dst x86.M128d)

SindPd: Compute the sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SIND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sind_pd'. Requires SSE.

func SindPs

func SindPs(a x86.M128) (dst x86.M128)

SindPs: Compute the sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SIND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sind_ps'. Requires SSE.

func SinhPd

func SinhPd(a x86.M128d) (dst x86.M128d)

SinhPd: Compute the hyperbolic sine of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SINH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sinh_pd'. Requires SSE.

func SinhPs

func SinhPs(a x86.M128) (dst x86.M128)

SinhPs: Compute the hyperbolic sine of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SINH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_sinh_ps'. Requires SSE.

func SqrtPs

func SqrtPs(a x86.M128) (dst x86.M128)

SqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR

Instruction: 'SQRTPS'. Intrinsic: '_mm_sqrt_ps'. Requires SSE.

func SqrtSs

func SqrtSs(a x86.M128) (dst x86.M128)

SqrtSs: Compute the square root of the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := SQRT(a[31:0])
dst[127:32] := a[127:32]

Instruction: 'SQRTSS'. Intrinsic: '_mm_sqrt_ss'. Requires SSE.

func Store1Ps

func Store1Ps(mem_addr *float32, a x86.M128)

Store1Ps: Store the lower single-precision (32-bit) floating-point element from 'a' into 4 contiguous elements in memory. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]

Instruction: '...'. Intrinsic: '_mm_store1_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorePs

func StorePs(mem_addr *float32, a x86.M128)

StorePs: Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a' into memory.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+127:mem_addr] := a[127:0]

Instruction: 'MOVAPS'. Intrinsic: '_mm_store_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorePs1

func StorePs1(mem_addr *float32, a x86.M128)

StorePs1: Store the lower single-precision (32-bit) floating-point element from 'a' into 4 contiguous elements in memory. 'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[31:0]
MEM[mem_addr+95:mem_addr+64] := a[31:0]
MEM[mem_addr+127:mem_addr+96] := a[31:0]

Instruction: '...'. Intrinsic: '_mm_store_ps1'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StoreSs

func StoreSs(mem_addr *float32, a x86.M128)

StoreSs: Store the lower single-precision (32-bit) floating-point element from 'a' into memory. 'mem_addr' does not need to be aligned on any particular boundary.

MEM[mem_addr+31:mem_addr] := a[31:0]

Instruction: 'MOVSS'. Intrinsic: '_mm_store_ss'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorehPi

func StorehPi(mem_addr *x86.M64, a x86.M128)

StorehPi: Store the upper 2 single-precision (32-bit) floating-point elements from 'a' into memory.

MEM[mem_addr+31:mem_addr] := a[95:64]
MEM[mem_addr+63:mem_addr+32] := a[127:96]

Instruction: 'MOVHPS'. Intrinsic: '_mm_storeh_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorelPi

func StorelPi(mem_addr *x86.M64, a x86.M128)

StorelPi: Store the lower 2 single-precision (32-bit) floating-point elements from 'a' into memory.

MEM[mem_addr+31:mem_addr] := a[31:0]
MEM[mem_addr+63:mem_addr+32] := a[63:32]

Instruction: 'MOVLPS'. Intrinsic: '_mm_storel_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StorerPs

func StorerPs(mem_addr *float32, a x86.M128)

StorerPs: Store 4 single-precision (32-bit) floating-point elements from 'a' into memory in reverse order.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+31:mem_addr] := a[127:96]
MEM[mem_addr+63:mem_addr+32] := a[95:64]
MEM[mem_addr+95:mem_addr+64] := a[63:32]
MEM[mem_addr+127:mem_addr+96] := a[31:0]

Instruction: '...'. Intrinsic: '_mm_storer_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StoreuPs

func StoreuPs(mem_addr *float32, a x86.M128)

StoreuPs: Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a' into memory.

'mem_addr' does not need to be aligned on any particular boundary.

	MEM[mem_addr+127:mem_addr] := a[127:0]

Instruction: 'MOVUPS'. Intrinsic: '_mm_storeu_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StreamPi

func StreamPi(mem_addr *x86.M64, a x86.M64)

StreamPi: Store 64-bits of integer data from 'a' into memory using a non-temporal memory hint.

MEM[mem_addr+63:mem_addr] := a[63:0]

Instruction: 'MOVNTQ'. Intrinsic: '_mm_stream_pi'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func StreamPs

func StreamPs(mem_addr *float32, a x86.M128)

StreamPs: Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point elements) from 'a' into memory using a non-temporal memory hint.

'mem_addr' must be aligned on a 16-byte boundary or a general-protection exception may be generated.

MEM[mem_addr+127:mem_addr] := a[127:0]

Instruction: 'MOVNTPS'. Intrinsic: '_mm_stream_ps'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func SubPs

func SubPs(a x86.M128, b x86.M128) (dst x86.M128)

SubPs: Subtract packed single-precision (32-bit) floating-point elements in 'b' from packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] - b[i+31:i]
ENDFOR

Instruction: 'SUBPS'. Intrinsic: '_mm_sub_ps'. Requires SSE.

func SubSs

func SubSs(a x86.M128, b x86.M128) (dst x86.M128)

SubSs: Subtract the lower single-precision (32-bit) floating-point element in 'b' from the lower single-precision (32-bit) floating-point element in 'a', store the result in the lower element of 'dst', and copy the upper 3 packed elements from 'a' to the upper elements of 'dst'.

dst[31:0] := a[31:0] - b[31:0]
dst[127:32] := a[127:32]

Instruction: 'SUBSS'. Intrinsic: '_mm_sub_ss'. Requires SSE.

func SvmlCeilPd

func SvmlCeilPd(a x86.M128d) (dst x86.M128d)

SvmlCeilPd: Round the packed double-precision (64-bit) floating-point elements in 'a' up to an integer value, and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := CEIL(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_ceil_pd'. Requires SSE.

func SvmlCeilPs

func SvmlCeilPs(a x86.M128) (dst x86.M128)

SvmlCeilPs: Round the packed single-precision (32-bit) floating-point elements in 'a' up to an integer value, and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := CEIL(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_ceil_ps'. Requires SSE.

func SvmlFloorPd

func SvmlFloorPd(a x86.M128d) (dst x86.M128d)

SvmlFloorPd: Round the packed double-precision (64-bit) floating-point elements in 'a' down to an integer value, and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := FLOOR(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_floor_pd'. Requires SSE.

func SvmlFloorPs

func SvmlFloorPs(a x86.M128) (dst x86.M128)

SvmlFloorPs: Round the packed single-precision (32-bit) floating-point elements in 'a' down to an integer value, and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := FLOOR(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_floor_ps'. Requires SSE.

func SvmlRoundPd

func SvmlRoundPd(a x86.M128d) (dst x86.M128d)

SvmlRoundPd: Round the packed double-precision (64-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := ROUND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_round_pd'. Requires SSE.

func SvmlRoundPs

func SvmlRoundPs(a x86.M128) (dst x86.M128)

SvmlRoundPs: Round the packed single-precision (32-bit) floating-point elements in 'a' to the nearest integer value, and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := ROUND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_round_ps'. Requires SSE.

func SvmlSqrtPd

func SvmlSqrtPd(a x86.M128d) (dst x86.M128d)

SvmlSqrtPd: Compute the square root of packed double-precision (64-bit) floating-point elements in 'a', and store the results in 'dst'. Note that this intrinsic is less efficient than '_mm_sqrt_pd'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := SQRT(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_sqrt_pd'. Requires SSE.

func SvmlSqrtPs

func SvmlSqrtPs(a x86.M128) (dst x86.M128)

SvmlSqrtPs: Compute the square root of packed single-precision (32-bit) floating-point elements in 'a', and store the results in 'dst'. Note that this intrinsic is less efficient than '_mm_sqrt_ps'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := SQRT(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_svml_sqrt_ps'. Requires SSE.

func TanPd

func TanPd(a x86.M128d) (dst x86.M128d)

TanPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TAN(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tan_pd'. Requires SSE.

func TanPs

func TanPs(a x86.M128) (dst x86.M128)

TanPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TAN(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tan_ps'. Requires SSE.

func TandPd

func TandPd(a x86.M128d) (dst x86.M128d)

TandPd: Compute the tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TAND(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tand_pd'. Requires SSE.

func TandPs

func TandPs(a x86.M128) (dst x86.M128)

TandPs: Compute the tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in degrees, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TAND(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tand_ps'. Requires SSE.

func TanhPd

func TanhPd(a x86.M128d) (dst x86.M128d)

TanhPd: Compute the hyperbolic tangent of packed double-precision (64-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TANH(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tanh_pd'. Requires SSE.

func TanhPs

func TanhPs(a x86.M128) (dst x86.M128)

TanhPs: Compute the hyperbolic tangent of packed single-precision (32-bit) floating-point elements in 'a' expressed in radians, and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TANH(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_tanh_ps'. Requires SSE.

func TruncPd

func TruncPd(a x86.M128d) (dst x86.M128d)

TruncPd: Truncate the packed double-precision (64-bit) floating-point elements in 'a', and store the results as packed double-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundpd'/'vroundpd' instruction.

FOR j := 0 to 1
	i := j*64
	dst[i+63:i] := TRUNCATE(a[i+63:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_trunc_pd'. Requires SSE.

func TruncPs

func TruncPs(a x86.M128) (dst x86.M128)

TruncPs: Truncate the packed single-precision (32-bit) floating-point elements in 'a', and store the results as packed single-precision floating-point elements in 'dst'. This intrinsic may generate the 'roundps'/'vroundps' instruction.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := TRUNCATE(a[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_trunc_ps'. Requires SSE.

func UcomieqSs

func UcomieqSs(a x86.M128, b x86.M128) int

UcomieqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for equality, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] == b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomieq_ss'. Requires SSE.

func UcomigeSs

func UcomigeSs(a x86.M128, b x86.M128) int

UcomigeSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] >= b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomige_ss'. Requires SSE.

func UcomigtSs

func UcomigtSs(a x86.M128, b x86.M128) int

UcomigtSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for greater-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] > b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomigt_ss'. Requires SSE.

func UcomileSs

func UcomileSs(a x86.M128, b x86.M128) int

UcomileSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than-or-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] <= b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomile_ss'. Requires SSE.

func UcomiltSs

func UcomiltSs(a x86.M128, b x86.M128) int

UcomiltSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for less-than, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] < b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomilt_ss'. Requires SSE.

func UcomineqSs

func UcomineqSs(a x86.M128, b x86.M128) int

UcomineqSs: Compare the lower single-precision (32-bit) floating-point element in 'a' and 'b' for not-equal, and return the boolean result (0 or 1). This instruction will not signal an exception for QNaNs.

RETURN ( a[31:0] != b[31:0] ) ? 1 : 0

Instruction: 'UCOMISS'. Intrinsic: '_mm_ucomineq_ss'. Requires SSE.

func UdivEpi32

func UdivEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

UdivEpi32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the truncated results in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_udiv_epi32'. Requires SSE.

func UdivremEpi32

func UdivremEpi32(mem_addr *x86.M128i, a x86.M128i, b x86.M128i) (dst x86.M128i)

UdivremEpi32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', store the truncated results in 'dst', and store the remainders as packed unsigned 32-bit integers into memory at 'mem_addr'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := TRUNCATE(a[i+31:i] / b[i+31:i])
	MEM[mem_addr+i+31:mem_addr+i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_udivrem_epi32'. Requires SSE.

FIXME: Will likely need to be reworked (has pointer parameter).

func UnpackhiPs

func UnpackhiPs(a x86.M128, b x86.M128) (dst x86.M128)

UnpackhiPs: Unpack and interleave single-precision (32-bit) floating-point elements from the high half of 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_HIGH_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[95:64]
	dst[63:32] := src2[95:64]
	dst[95:64] := src1[127:96]
	dst[127:96] := src2[127:96]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_HIGH_DWORDS(a[127:0], b[127:0])

Instruction: 'UNPCKHPS'. Intrinsic: '_mm_unpackhi_ps'. Requires SSE.

func UnpackloPs

func UnpackloPs(a x86.M128, b x86.M128) (dst x86.M128)

UnpackloPs: Unpack and interleave single-precision (32-bit) floating-point elements from the low half of 'a' and 'b', and store the results in 'dst'.

INTERLEAVE_DWORDS(src1[127:0], src2[127:0]){
	dst[31:0] := src1[31:0]
	dst[63:32] := src2[31:0]
	dst[95:64] := src1[63:32]
	dst[127:96] := src2[63:32]
	RETURN dst[127:0]
}

dst[127:0] := INTERLEAVE_DWORDS(a[127:0], b[127:0])

Instruction: 'UNPCKLPS'. Intrinsic: '_mm_unpacklo_ps'. Requires SSE.

func UremEpi32

func UremEpi32(a x86.M128i, b x86.M128i) (dst x86.M128i)

UremEpi32: Divide packed unsigned 32-bit integers in 'a' by packed elements in 'b', and store the remainders as packed unsigned 32-bit integers in 'dst'.

FOR j := 0 to 3
	i := 32*j
	dst[i+31:i] := REMAINDER(a[i+31:i] / b[i+31:i])
ENDFOR
dst[MAX:128] := 0

Instruction: '...'. Intrinsic: '_mm_urem_epi32'. Requires SSE.

func XorPs

func XorPs(a x86.M128, b x86.M128) (dst x86.M128)

XorPs: Compute the bitwise XOR of packed single-precision (32-bit) floating-point elements in 'a' and 'b', and store the results in 'dst'.

FOR j := 0 to 3
	i := j*32
	dst[i+31:i] := a[i+31:i] XOR b[i+31:i]
ENDFOR

Instruction: 'XORPS'. Intrinsic: '_mm_xor_ps'. Requires SSE.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL