From 165b7f3a4487e7ab7738dd9b37f2b3375692f8a2 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Daniel=20Sch=C3=BCrmann?=
 <daniel.schuermann@campus.tu-berlin.de>
Date: Sat, 26 Jan 2019 09:12:46 +0100
Subject: [PATCH] nir: define behavior of nir_op_bfm and nir_op_u/ibfe
 according to SM5 spec.

That is: the five least significant bits provide the values of
'bits' and 'offset' which is the case for all hardware currently
supported by NIR and using the bfm/bfe instructions.
This patch also changes the lowering of bitfield_insert/extract
using shifts to not use bfm and removes the flag 'lower_bfm'.

Tested-by: Eric Anholt <eric@anholt.net>
Reviewed-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/broadcom/compiler/nir_to_vir.c            |  1 -
 src/compiler/nir/nir.h                        |  6 ++--
 src/compiler/nir/nir_opcodes.py               | 28 ++++++++-----------
 src/compiler/nir/nir_opt_algebraic.py         | 15 +++-------
 src/freedreno/ir3/ir3_nir.c                   |  2 --
 .../drivers/nouveau/nvc0/nvc0_screen.c        |  1 -
 6 files changed, 18 insertions(+), 35 deletions(-)

diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
index 242414983e1..cb58b737a80 100644
--- a/src/broadcom/compiler/nir_to_vir.c
+++ b/src/broadcom/compiler/nir_to_vir.c
@@ -2392,7 +2392,6 @@ const nir_shader_compiler_options v3d_nir_options = {
         .lower_all_io_to_temps = true,
         .lower_extract_byte = true,
         .lower_extract_word = true,
-        .lower_bfm = true,
         .lower_bitfield_insert_to_shifts = true,
         .lower_bitfield_extract_to_shifts = true,
         .lower_bitfield_reverse = true,
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 0b3745be8b1..9feed169e47 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2282,18 +2282,16 @@ typedef struct nir_shader_compiler_options {
    bool lower_fmod;
    /** Lowers ibitfield_extract/ubitfield_extract to ibfe/ubfe. */
    bool lower_bitfield_extract;
-   /** Lowers ibitfield_extract/ubitfield_extract to bfm, compares, shifts. */
+   /** Lowers ibitfield_extract/ubitfield_extract to compares, shifts. */
    bool lower_bitfield_extract_to_shifts;
    /** Lowers bitfield_insert to bfi/bfm */
    bool lower_bitfield_insert;
-   /** Lowers bitfield_insert to bfm, compares, and shifts. */
+   /** Lowers bitfield_insert to compares, and shifts. */
    bool lower_bitfield_insert_to_shifts;
    /** Lowers bitfield_reverse to shifts. */
    bool lower_bitfield_reverse;
    /** Lowers bit_count to shifts. */
    bool lower_bit_count;
-   /** Lowers bfm to shifts and subtracts. */
-   bool lower_bfm;
    /** Lowers ifind_msb to compare and ufind_msb */
    bool lower_ifind_msb;
    /** Lowers find_lsb to ufind_msb and logic ops */
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 1ab4a3e7a31..8771c74b033 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -787,14 +787,12 @@ binop_convert("pack_32_2x16_split", tuint32, tuint16, "",
               "src0 | ((uint32_t)src1 << 16)")
 
 # bfm implements the behavior of the first operation of the SM5 "bfi" assembly
-# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
-# if either of its arguments are 32.
+# and that of the "bfi1" i965 instruction. That is, the bits and offset values
+# are from the low five bits of src0 and src1, respectively.
 binop_convert("bfm", tuint32, tint32, "", """
-int bits = src0, offset = src1;
-if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
-   dst = 0; /* undefined */
-else
-   dst = ((1u << bits) - 1) << offset;
+int bits = src0 & 0x1F;
+int offset = src1 & 0x1F;
+dst = ((1u << bits) - 1) << offset;
 """)
 
 opcode("ldexp", 0, tfloat, [0, 0], [tfloat, tint32], False, "", """
@@ -873,15 +871,14 @@ if (mask == 0) {
 }
 """)
 
-# SM5 ubfe/ibfe assembly
+# SM5 ubfe/ibfe assembly: only the 5 least significant bits of offset and bits are used.
 opcode("ubfe", 0, tuint32,
-       [0, 0, 0], [tuint32, tint32, tint32], False, "", """
+       [0, 0, 0], [tuint32, tuint32, tuint32], False, "", """
 unsigned base = src0;
-int offset = src1, bits = src2;
+unsigned offset = src1 & 0x1F;
+unsigned bits = src2 & 0x1F;
 if (bits == 0) {
    dst = 0;
-} else if (bits < 0 || offset < 0) {
-   dst = 0; /* undefined */
 } else if (offset + bits < 32) {
    dst = (base << (32 - bits - offset)) >> (32 - bits);
 } else {
@@ -889,13 +886,12 @@ if (bits == 0) {
 }
 """)
 opcode("ibfe", 0, tint32,
-       [0, 0, 0], [tint32, tint32, tint32], False, "", """
+       [0, 0, 0], [tint32, tuint32, tuint32], False, "", """
 int base = src0;
-int offset = src1, bits = src2;
+unsigned offset = src1 & 0x1F;
+unsigned bits = src2 & 0x1F;
 if (bits == 0) {
    dst = 0;
-} else if (bits < 0 || offset < 0) {
-   dst = 0; /* undefined */
 } else if (offset + bits < 32) {
    dst = (base << (32 - bits - offset)) >> (32 - bits);
 } else {
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 36e576b7eff..c7e69c0310c 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -794,18 +794,11 @@ optimizations.extend([
 
    # Alternative lowering that doesn't rely on bfi.
    (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
-    ('bcsel', ('ilt', 31, 'bits'),
-     'insert',
-     ('ior',
-      ('iand', 'base', ('inot', ('bfm', 'bits', 'offset'))),
-      ('iand', ('ishl', 'insert', 'offset'), ('bfm', 'bits', 'offset')))),
+    ('ior',
+    ('iand', 'base', ('inot', ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
+    ('iand', ('ishl', 'insert', 'offset'), ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'))),
     'options->lower_bitfield_insert_to_shifts'),
 
-   # bfm lowering -- note that the NIR opcode is undefined if either arg is 32.
-   (('bfm', 'bits', 'offset'),
-    ('ishl', ('isub', ('ishl', 1, 'bits'), 1), 'offset'),
-    'options->lower_bfm'),
-
    (('ibitfield_extract', 'value', 'offset', 'bits'),
     ('bcsel', ('ilt', 31, 'bits'), 'value',
               ('ibfe', 'value', 'offset', 'bits')),
@@ -829,7 +822,7 @@ optimizations.extend([
      ('ushr', 'value', 'offset'),
      ('bcsel', ('ieq', 'bits', 32),
       0xffffffff,
-      ('bfm', 'bits', 0))),
+      ('isub', ('ishl', 1, 'bits'), 1))),
     'options->lower_bitfield_extract_to_shifts'),
 
    (('ifind_msb', 'value'),
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 1ef47aa3e06..320e485b98d 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -54,7 +54,6 @@ static const nir_shader_compiler_options options = {
 		.lower_helper_invocation = true,
 		.lower_bitfield_insert_to_shifts = true,
 		.lower_bitfield_extract_to_shifts = true,
-		.lower_bfm = true,
 		.use_interpolated_input_intrinsics = true,
 };
 
@@ -79,7 +78,6 @@ static const nir_shader_compiler_options options_a6xx = {
 		.lower_helper_invocation = true,
 		.lower_bitfield_insert_to_shifts = true,
 		.lower_bitfield_extract_to_shifts = true,
-		.lower_bfm = true,
 		.use_interpolated_input_intrinsics = true,
 };
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 527d5a8108a..4d26568d391 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -912,7 +912,6 @@ static const nir_shader_compiler_options nir_options = {
    .lower_bitfield_insert_to_shifts = false,
    .lower_bitfield_reverse = false,
    .lower_bit_count = false,
-   .lower_bfm = false,
    .lower_ifind_msb = false,
    .lower_find_lsb = false,
    .lower_uadd_carry = true, // TODO
-- 
2.30.2