# Basic unary negation / inversion ops.
unop("ineg", tint, "-src0")
unop("fneg", tfloat, "-src0")
unop("inot", tint, "~src0") # invert every bit of the integer
# NOTE: the floating-point logical-not opcode (fnot) has been removed;
# express it with a comparison against zero instead.
unop("fsign", tfloat, ("bit_size == 64 ? " +
                       "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : " +
                       "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))"))
# Generate the numeric conversion opcodes (e.g. f2f16_rtz, i2f32, f2b32) for
# every destination type / bit-size combination.
# NOTE(review): src_t is bound by an enclosing loop over source types that is
# outside this chunk -- confirm against the full file.
dst_types = [tint, tuint, tfloat, tbool]
for dst_t in dst_types:
    for dst_bit_size in type_sizes(dst_t):
        if dst_bit_size == 16 and dst_t == tfloat and src_t == tfloat:
            # Float -> 16-bit-float conversions get explicit rounding-mode
            # variants (_rtne, _rtz) in addition to the default mode.
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
                if rnd_mode == '_rtne':
                    # Round-to-nearest-even: round through an actual half
                    # float so precision is lost at 16 bits.
                    conv_expr = """
                    if (bit_size > 16) {
                       dst = _mesa_half_to_float(_mesa_float_to_float16_rtne(src0));
                    } else {
                       dst = src0;
                    }
                    """
                elif rnd_mode == '_rtz':
                    # Round-toward-zero.
                    conv_expr = """
                    if (bit_size > 16) {
                       dst = _mesa_half_to_float(_mesa_float_to_float16_rtz(src0));
                    } else {
                       dst = src0;
                    }
                    """
                else:
                    # Default mode: plain C implicit conversion.
                    conv_expr = "src0"

                unop_numeric_convert("{0}2{1}{2}{3}".format(src_t[0], dst_t[0],
                                                            dst_bit_size,
                                                            rnd_mode),
                                     dst_t + str(dst_bit_size), src_t,
                                     conv_expr)
        elif dst_bit_size == 32 and dst_t == tfloat and src_t == tfloat:
            # Double -> float honors the shader's round-toward-zero
            # execution mode (float controls).
            conv_expr = """
            if (bit_size > 32 && nir_is_rounding_mode_rtz(execution_mode, 32)) {
               dst = _mesa_double_to_float_rtz(src0);
            } else {
               dst = src0;
            }
            """
            unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                     dst_bit_size),
                                 dst_t + str(dst_bit_size), src_t, conv_expr)
        else:
            # Conversions to bool test against zero; everything else is a
            # plain C conversion.
            conv_expr = "src0 != 0" if dst_t == tbool else "src0"
            unop_numeric_convert("{0}2{1}{2}".format(src_t[0], dst_t[0],
                                                     dst_bit_size),
                                 dst_t + str(dst_bit_size), src_t, conv_expr)
# Unary floating-point rounding operations.
unop_horiz("unpack_32_2x16", 2, tuint16, 1, tuint32,
           "dst.x = src0.x; dst.y = src0.x >> 16;")

# Unpack two half-floats, flushing denorms to zero.
# FIX: the high half must be extracted with a right shift -- the original
# "(uint16_t)(src0.x << 16)" is always 0 after truncation to 16 bits; use
# ">> 16" to match unpack_half_2x16_split_y below.
unop_horiz("unpack_half_2x16_flush_to_zero", 2, tfloat32, 1, tuint32, """
dst.x = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x & 0xffff));
dst.y = unpack_half_1x16_flush_to_zero((uint16_t)(src0.x >> 16));
""")

# Lowered floating point unpacking operations.
unop_convert("unpack_half_2x16_split_x", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", tfloat32, tuint32,
             "unpack_half_1x16((uint16_t)(src0 >> 16))")

# Denorm-flushing variants of the split unpacks.
unop_convert("unpack_half_2x16_split_x_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y_flush_to_zero", tfloat32, tuint32,
             "unpack_half_1x16_flush_to_zero((uint16_t)(src0 >> 16))")

unop_convert("unpack_32_2x16_split_x", tuint16, tuint32, "src0")
unop_convert("unpack_32_2x16_split_y", tuint16, tuint32, "src0 >> 16")
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
# Horizontal sum of all components of the source vector.
unop_reduce("fsum", 1, tfloat, tfloat, "{src}", "{src0} + {src1}", "{src}")
def binop_convert(name, out_type, in_type, alg_props, const_expr):
opcode(name, 0, out_type, [0, 0], [in_type, in_type],
[4, 4], [src_type, src_type], False, _2src_commutative,
final(reduce_(reduce_(src0, src1), reduce_(src2, src3))))
# Floating-point add.  Honors the round-toward-zero execution mode from the
# shader's float controls; 32-bit (and smaller) adds are computed in double
# precision and then rounded toward zero.
binop("fadd", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_add_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 + (double)src1);
} else {
   dst = src0 + src1;
}
""")
# Integer add (wrapping two's-complement); commutative and associative.
binop("iadd", tint, _2src_commutative + associative, "src0 + src1")
binop("iadd_sat", tint, _2src_commutative, """
src1 > 0 ?
""")
# Unsigned saturating subtract: clamps at zero instead of wrapping.
binop("usub_sat", tuint, "", "src0 < src1 ? 0 : src0 - src1")

# Floating-point subtract.  Honors the round-toward-zero execution mode, in
# the same way as fadd above.
binop("fsub", tfloat, "", """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_sub_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 - (double)src1);
} else {
   dst = src0 - src1;
}
""")
# Integer subtract (wrapping two's-complement).
binop("isub", tint, "", "src0 - src1")

# Floating-point multiply.  Honors the round-toward-zero execution mode, in
# the same way as fadd above.
binop("fmul", tfloat, _2src_commutative + associative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_mul_rtz(src0, src1);
   else
      dst = _mesa_double_to_float_rtz((double)src0 * (double)src1);
} else {
   dst = src0 * src1;
}
""")
# low 32-bits of signed/unsigned integer multiply (the low bits are identical
# for signed and unsigned operands)
binop("imul", tint, _2src_commutative + associative, "src0 * src1")
# Unsigned (logical) right shift; the shift count is masked to the bit
# width, matching common hardware behavior.
opcode("ushr", 0, tuint, [0, 0], [tuint, tuint32], False, "",
       "src0 >> (src1 & (sizeof(src0) * 8 - 1))")

# Unsigned rotate left/right; the rotate count is likewise masked to the
# bit width, and the complementary shift uses (-count & mask) so a count of
# zero is well-defined (no shift by the full width).
opcode("urol", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 << (src1 & rotate_mask)) |
         (src0 >> (-src1 & rotate_mask));
""")
opcode("uror", 0, tuint, [0, 0], [tuint, tuint32], False, "", """
   uint32_t rotate_mask = sizeof(src0) * 8 - 1;
   dst = (src0 >> (src1 & rotate_mask)) |
         (src0 << (-src1 & rotate_mask));
""")
# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# Bitwise xor; per the note above, also usable as boolean xor on hardware
# with integer booleans.
binop("ixor", tuint, _2src_commutative + associative, "src0 ^ src1")
# NOTE: the floating-point logic ops (fand/for/fxor) have been removed;
# boolean logic on floats is expressed via comparisons and integer ops.

# Dot product, reduced to a single scalar component.
binop_reduce("fdot", 1, tfloat, tfloat, "{src0} * {src1}", "{src0} + {src1}",
             "{src}")

# Homogeneous dot product (a.xyz . b.xyz + b.w) with the result replicated
# across all four components of the destination.
opcode("fdph_replicated", 4, tfloat, [3, 4], [tfloat, tfloat], False, "",
       "src0.x * src1.x + src0.y * src1.y + src0.z * src1.z + src1.w")
# fmin/fmax must use the double-precision fmin()/fmax(): the fminf()/fmaxf()
# variants would truncate 64-bit operands to 32-bit floats.
binop("fmin", tfloat, "", "fmin(src0, src1)")
binop("imin", tint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("umin", tuint, _2src_commutative + associative, "src1 > src0 ? src0 : src1")
binop("fmax", tfloat, "", "fmax(src0, src1)")
binop("imax", tint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
binop("umax", tuint, _2src_commutative + associative, "src1 > src0 ? src1 : src0")
[src1_size, src2_size, src3_size],
[tuint, tuint, tuint], False, "", const_expr)
# Fused multiply-add.  Honors the round-toward-zero execution mode; in the
# default mode it uses the C fma()/fmaf() functions so only a single
# rounding happens.
triop("ffma", tfloat, _2src_commutative, """
if (nir_is_rounding_mode_rtz(execution_mode, bit_size)) {
   if (bit_size == 64)
      dst = _mesa_double_fma_rtz(src0, src1, src2);
   else if (bit_size == 32)
      dst = _mesa_float_fma_rtz(src0, src1, src2);
   else
      dst = _mesa_double_to_float_rtz(_mesa_double_fma_rtz(src0, src1, src2));
} else {
   if (bit_size == 32)
      dst = fmaf(src0, src1, src2);
   else
      dst = fma(src0, src1, src2);
}
""")
# Linear interpolation: yields src0 when src2 == 0 and src1 when src2 == 1.
triop("flrp", tfloat, "", "src0 * (1 - src2) + src1 * src2")
}
""")
# Per-bit select: picks bits of src1 where src0 is set and bits of src2
# where src0 is clear.
triop("bitfield_select", tuint, "", "(src0 & src1) | (~src0 & src2)")
# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
opcode("ubfe", 0, tuint32,
[0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
dst.w = src3.x;
""")
# An integer multiply instruction for address calculation.  This is
# similar to imul, except that the results are undefined in case of
# overflow.  Overflow is defined according to the size of the variable
# being dereferenced.
#
# This relaxed definition, compared to imul, allows an optimization
# pass to propagate bounds (i.e., from a load/store intrinsic) to the
# sources, such that lower precision integer multiplies can be used.
# This is useful on hw that has 24b or perhaps 16b integer multiply
# instructions.
binop("amul", tint, _2src_commutative + associative, "src0 * src1")
+
# ir3-specific instruction that maps directly to mul-add shift high mix,
# (IMADSH_MIX16 i.e. ah * bl << 16 + c). It is used for lowering integer
# multiplication (imul) on Freedreno backend..
[1, 1, 1], [tint32, tint32, tint32], False, "", """
dst.x = ((((src0.x & 0xffff0000) >> 16) * (src1.x & 0x0000ffff)) << 16) + src2.x;
""")
# ir3-specific instruction that maps directly to ir3 mad.s24.
#
# 24b multiply into 32b result (with sign extension) plus 32b int.
# The "(x << 8) >> 8" idiom sign-extends the low 24 bits of each source.
triop("imad24_ir3", tint32, _2src_commutative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8) + src2")

# 24b multiply into 32b result (with sign extension).
binop("imul24", tint32, _2src_commutative + associative,
      "(((int32_t)src0 << 8) >> 8) * (((int32_t)src1 << 8) >> 8)")