i965/nir/vec4: Prepare source and destination registers for ALU operations
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
index 6d6de3bc677b7cf2fb44a15d6f3f9f0ce4d12824..15fe3648af84e9d7b5221fa039a2e3c3373a5bb4 100644 (file)
@@ -711,6 +711,49 @@ fs_inst::components_read(unsigned i) const
       else
          return 1;
 
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      assert(src[3].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source (ignored for reads). */
+      else if (i == 1)
+         return 0;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1)
+         return src[4].fixed_hw_reg.dw1.ud;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].fixed_hw_reg.dw1.ud;
+      /* Surface operation source. */
+      else if (i == 1 && op == BRW_AOP_CMPWR)
+         return 2;
+      else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+                          op == BRW_AOP_PREDEC))
+         return 0;
+      else
+         return 1;
+   }
+
    default:
       return 1;
    }
@@ -897,6 +940,18 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->param_size = v->param_size;
 }
 
+void
+fs_visitor::setup_vector_uniform_values(const gl_constant_value *values, unsigned n)
+{
+   static const gl_constant_value zero = { 0 };
+
+   for (unsigned i = 0; i < n; ++i)
+      stage_prog_data->param[uniforms++] = &values[i];
+
+   for (unsigned i = n; i < 4; ++i)
+      stage_prog_data->param[uniforms++] = &zero;
+}
+
 fs_reg *
 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                          bool origin_upper_left)
@@ -920,11 +975,11 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
       bld.MOV(wpos, this->pixel_y);
    } else {
       fs_reg pixel_y = this->pixel_y;
-      float offset = (pixel_center_integer ? 0.0 : 0.5);
+      float offset = (pixel_center_integer ? 0.0f : 0.5f);
 
       if (flip) {
         pixel_y.negate = true;
-        offset += key->drawable_height - 1.0;
+        offset += key->drawable_height - 1.0f;
       }
 
       bld.ADD(wpos, pixel_y, fs_reg(offset));
@@ -1869,8 +1924,7 @@ fs_visitor::demote_pull_constants()
            continue;
 
          /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
-                                    .at(block, inst);
+         const fs_builder ibld(this, block, inst);
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
          fs_reg dst = vgrf(glsl_type::float_type);
 
@@ -1885,8 +1939,9 @@ fs_visitor::demote_pull_constants()
             inst->src[i].reladdr = NULL;
             inst->src[i].stride = 1;
          } else {
+            const fs_builder ubld = ibld.exec_all().group(8, 0);
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                       dst, surf_index, offset);
             inst->src[i].set_smear(pull_index & 3);
          }
@@ -2203,7 +2258,8 @@ fs_visitor::opt_sampler_eot()
       return false;
 
    /* Look for a texturing instruction immediately before the final FB_WRITE. */
-   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+   fs_inst *fb_write = (fs_inst *)block->end();
    assert(fb_write->eot);
    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
 
@@ -2234,9 +2290,11 @@ fs_visitor::opt_sampler_eot()
    assert(!tex_inst->eot); /* We can't get here twice */
    assert((tex_inst->offset & (0xff << 24)) == 0);
 
+   const fs_builder ibld(this, block, tex_inst);
+
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
-   tex_inst->dst = bld.null_reg_ud();
+   tex_inst->dst = ibld.null_reg_ud();
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2248,8 +2306,8 @@ fs_visitor::opt_sampler_eot()
    if (tex_inst->header_size != 0)
       return true;
 
-   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
-                                 load_payload->sources + 1);
+   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+                                  load_payload->sources + 1);
    fs_reg *new_sources =
       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
 
@@ -2769,7 +2827,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
       if (block->start() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2785,7 +2844,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
                 needs_dep[reg - first_write_grf]) {
-               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
                needs_dep[reg - first_write_grf] = false;
                if (scan_inst->exec_size == 16)
                   needs_dep[reg - first_write_grf + 1] = false;
@@ -2832,7 +2891,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2847,7 +2907,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
           scan_inst->dst.reg >= first_write_grf &&
           scan_inst->dst.reg < first_write_grf + write_len &&
           needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
+         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                         scan_inst->dst.reg);
          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
       }
 
@@ -2982,7 +3043,8 @@ fs_visitor::lower_load_payload()
       if (dst.file == MRF)
          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
 
-      const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
+      const fs_builder ibld(this, block, inst);
+      const fs_builder hbld = ibld.exec_all().group(8, 0);
 
       for (uint8_t i = 0; i < inst->header_size; i++) {
          if (inst->src[i].file != BAD_FILE) {
@@ -2993,10 +3055,6 @@ fs_visitor::lower_load_payload()
          dst = offset(dst, hbld, 1);
       }
 
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
-
       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
           inst->exec_size > 8) {
          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
@@ -3084,7 +3142,7 @@ fs_visitor::lower_integer_multiplication()
            inst->dst.type != BRW_REGISTER_TYPE_UD))
          continue;
 
-      const fs_builder ibld = bld.at(block, inst);
+      const fs_builder ibld(this, block, inst);
 
       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
@@ -3835,15 +3893,74 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    }
 }
 
+/**
+ * Initialize the header present in some typed and untyped surface
+ * messages.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+   fs_builder ubld = bld.exec_all().group(8, 0);
+   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+   ubld.MOV(dst, fs_reg(0));
+   ubld.MOV(component(dst, 7), sample_mask);
+   return dst;
+}
+
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+                           const fs_reg &sample_mask)
+{
+   /* Get the logical send arguments. */
+   const fs_reg &addr = inst->src[0];
+   const fs_reg &src = inst->src[1];
+   const fs_reg &surface = inst->src[2];
+   const UNUSED fs_reg &dims = inst->src[3];
+   const fs_reg &arg = inst->src[4];
+
+   /* Calculate the total number of components of the payload. */
+   const unsigned addr_sz = inst->components_read(0);
+   const unsigned src_sz = inst->components_read(1);
+   const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+   const unsigned sz = header_sz + addr_sz + src_sz;
+
+   /* Allocate space for the payload. */
+   fs_reg *const components = new fs_reg[sz];
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+   unsigned n = 0;
+
+   /* Construct the payload. */
+   if (header_sz)
+      components[n++] = emit_surface_header(bld, sample_mask);
+
+   for (unsigned i = 0; i < addr_sz; i++)
+      components[n++] = offset(addr, bld, i);
+
+   for (unsigned i = 0; i < src_sz; i++)
+      components[n++] = offset(src, bld, i);
+
+   bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
+
+   /* Update the original instruction. */
+   inst->opcode = op;
+   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+   inst->header_size = header_sz;
+
+   inst->src[0] = payload;
+   inst->src[1] = surface;
+   inst->src[2] = arg;
+   inst->resize_sources(3);
+
+   delete[] components;
+}
+
 bool
 fs_visitor::lower_logical_sends()
 {
    bool progress = false;
 
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
+      const fs_builder ibld(this, block, inst);
 
       switch (inst->opcode) {
       case FS_OPCODE_FB_WRITE_LOGICAL:
@@ -3902,6 +4019,42 @@ fs_visitor::lower_logical_sends()
          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
          break;
 
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+                                    fs_reg(0xffff));
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_READ,
+                                    fs_reg(0xffff));
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
       default:
          continue;
       }
@@ -3968,6 +4121,11 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
       else
          return inst->exec_size;
 
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return 8;
+
    default:
       return inst->exec_size;
    }
@@ -4005,10 +4163,15 @@ fs_visitor::lower_simd_width()
       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
 
       if (lower_width != inst->exec_size) {
-         /* Builder matching the original instruction. */
+         /* Builder matching the original instruction.  We may also need to
+          * emit an instruction of width larger than the original, set the
+          * execution size of the builder to the highest of both for now so
+          * we're sure that both cases can be handled.
+          */
          const fs_builder ibld = bld.at(block, inst)
                                     .exec_all(inst->force_writemask_all)
-                                    .group(inst->exec_size, inst->force_sechalf);
+                                    .group(MAX2(inst->exec_size, lower_width),
+                                           inst->force_sechalf);
 
          /* Split the copies in chunks of the execution width of either the
           * original or the lowered instruction, whichever is lower.
@@ -4031,14 +4194,11 @@ fs_visitor::lower_simd_width()
             split_inst.exec_size = lower_width;
             split_inst.eot = inst->eot && i == n - 1;
 
-            /* Set exec_all if the lowered width is higher than the original
-             * to avoid breaking the compiler invariant that no control
-             * flow-masked instruction is wider than the shader's
-             * dispatch_width.  Then transform the sources and destination and
-             * emit the lowered instruction.
+            /* Select the correct channel enables for the i-th group, then
+             * transform the sources and destination and emit the lowered
+             * instruction.
              */
-            const fs_builder lbld = ibld.exec_all(lower_width > inst->exec_size)
-                                        .group(lower_width, i);
+            const fs_builder lbld = ibld.group(lower_width, i);
 
             for (unsigned j = 0; j < inst->sources; j++) {
                if (inst->src[j].file != BAD_FILE &&
@@ -4511,9 +4671,11 @@ fs_visitor::optimize()
     * Ideally optimization passes wouldn't be part of the visitor so they
     * wouldn't have access to bld at all, but they do, so just in case some
     * pass forgets to ask for a location explicitly set it to NULL here to
-    * make it trip.
+    * make it trip.  The dispatch width is initialized to a bogus value to
+    * make sure that optimizations set the execution controls explicitly to
+    * match the code they are manipulating instead of relying on the defaults.
     */
-   bld = bld.at(NULL, NULL);
+   bld = fs_builder(this, 64);
 
    split_virtual_grfs();