i965/fs: Lower 32x32 bit multiplication on BXT.

[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp

index 2af7a892ca4b11b4348ebd279b3399df7a016207..0278237101e92490e9247e295e764c7a1db38793 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -975,11 +975,11 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
        bld.MOV(wpos, this->pixel_y);
     } else {
        fs_reg pixel_y = this->pixel_y;
-      float offset = (pixel_center_integer ? 0.0 : 0.5);
+      float offset = (pixel_center_integer ? 0.0f : 0.5f);
  
        if (flip) {
          pixel_y.negate = true;
-        offset += key->drawable_height - 1.0;
+        offset += key->drawable_height - 1.0f;
        }
  
        bld.ADD(wpos, pixel_y, fs_reg(offset));
@@ -1924,8 +1924,7 @@ fs_visitor::demote_pull_constants()
             continue;
  
           /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
-                                    .at(block, inst);
+         const fs_builder ibld(this, block, inst);
           fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
           fs_reg dst = vgrf(glsl_type::float_type);
  
@@ -1940,8 +1939,9 @@ fs_visitor::demote_pull_constants()
              inst->src[i].reladdr = NULL;
              inst->src[i].stride = 1;
           } else {
+            const fs_builder ubld = ibld.exec_all().group(8, 0);
              fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                        dst, surf_index, offset);
              inst->src[i].set_smear(pull_index & 3);
           }
@@ -2258,7 +2258,8 @@ fs_visitor::opt_sampler_eot()
        return false;
  
     /* Look for a texturing instruction immediately before the final FB_WRITE. */
-   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+   fs_inst *fb_write = (fs_inst *)block->end();
     assert(fb_write->eot);
     assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
  
@@ -2289,9 +2290,11 @@ fs_visitor::opt_sampler_eot()
     assert(!tex_inst->eot); /* We can't get here twice */
     assert((tex_inst->offset & (0xff << 24)) == 0);
  
+   const fs_builder ibld(this, block, tex_inst);
+
     tex_inst->offset |= fb_write->target << 24;
     tex_inst->eot = true;
-   tex_inst->dst = bld.null_reg_ud();
+   tex_inst->dst = ibld.null_reg_ud();
     fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
  
     /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2303,8 +2306,8 @@ fs_visitor::opt_sampler_eot()
     if (tex_inst->header_size != 0)
        return true;
  
-   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
-                                 load_payload->sources + 1);
+   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+                                  load_payload->sources + 1);
     fs_reg *new_sources =
        ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
  
@@ -2824,7 +2827,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        if (block->start() == scan_inst) {
           for (int i = 0; i < write_len; i++) {
              if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+                               first_write_grf + i);
           }
           return;
        }
@@ -2840,7 +2844,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
              if (reg >= first_write_grf &&
                  reg < first_write_grf + write_len &&
                  needs_dep[reg - first_write_grf]) {
-               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
                 needs_dep[reg - first_write_grf] = false;
                 if (scan_inst->exec_size == 16)
                    needs_dep[reg - first_write_grf + 1] = false;
@@ -2887,7 +2891,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
        if (block->end() == scan_inst) {
           for (int i = 0; i < write_len; i++) {
              if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                               first_write_grf + i);
           }
           return;
        }
@@ -2902,7 +2907,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
            scan_inst->dst.reg >= first_write_grf &&
            scan_inst->dst.reg < first_write_grf + write_len &&
            needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
+         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                         scan_inst->dst.reg);
           needs_dep[scan_inst->dst.reg - first_write_grf] = false;
        }
  
@@ -3037,7 +3043,8 @@ fs_visitor::lower_load_payload()
        if (dst.file == MRF)
           dst.reg = dst.reg & ~BRW_MRF_COMPR4;
  
-      const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
+      const fs_builder ibld(this, block, inst);
+      const fs_builder hbld = ibld.exec_all().group(8, 0);
  
        for (uint8_t i = 0; i < inst->header_size; i++) {
           if (inst->src[i].file != BAD_FILE) {
@@ -3048,10 +3055,6 @@ fs_visitor::lower_load_payload()
           dst = offset(dst, hbld, 1);
        }
  
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
-
        if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
            inst->exec_size > 8) {
           /* In this case, the payload portion of the LOAD_PAYLOAD isn't
@@ -3127,9 +3130,9 @@ fs_visitor::lower_integer_multiplication()
     bool progress = false;
  
     /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
-    * directly, but Cherryview cannot.
+    * directly, but CHV/BXT cannot.
      */
-   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+   if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
        return false;
  
     foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
@@ -3139,7 +3142,7 @@ fs_visitor::lower_integer_multiplication()
             inst->dst.type != BRW_REGISTER_TYPE_UD))
           continue;
  
-      const fs_builder ibld = bld.at(block, inst);
+      const fs_builder ibld(this, block, inst);
  
        /* The MUL instruction isn't commutative. On Gen <= 6, only the low
         * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
@@ -3890,6 +3893,20 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
     }
  }
  
+/**
+ * Initialize the header present in some typed and untyped surface messages:
+ * a UD VGRF cleared by an exec_all MOV, with \p sample_mask in component 7.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+   fs_builder ubld = bld.exec_all().group(8, 0);
+   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+   ubld.MOV(dst, fs_reg(0));
+   ubld.MOV(component(dst, 7), sample_mask);
+   return dst;
+}
+
  static void
  lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
                             const fs_reg &sample_mask)
@@ -3898,10 +3915,43 @@ lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
     const fs_reg &addr = inst->src[0];
     const fs_reg &src = inst->src[1];
     const fs_reg &surface = inst->src[2];
-   const fs_reg &dims = inst->src[3];
+   const UNUSED fs_reg &dims = inst->src[3];
     const fs_reg &arg = inst->src[4];
  
-   assert(!"Not implemented");
+   /* Calculate the total number of components of the payload. */
+   const unsigned addr_sz = inst->components_read(0);
+   const unsigned src_sz = inst->components_read(1);
+   const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
+   const unsigned sz = header_sz + addr_sz + src_sz;
+
+   /* Allocate space for the payload. */
+   fs_reg *const components = new fs_reg[sz];
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
+   unsigned n = 0;
+
+   /* Construct the payload. */
+   if (header_sz)
+      components[n++] = emit_surface_header(bld, sample_mask);
+
+   for (unsigned i = 0; i < addr_sz; i++)
+      components[n++] = offset(addr, bld, i);
+
+   for (unsigned i = 0; i < src_sz; i++)
+      components[n++] = offset(src, bld, i);
+
+   bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
+
+   /* Update the original instruction. */
+   inst->opcode = op;
+   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+   inst->header_size = header_sz;
+
+   inst->src[0] = payload;
+   inst->src[1] = surface;
+   inst->src[2] = arg;
+   inst->resize_sources(3);
+
+   delete[] components;
  }
  
  bool
@@ -3910,9 +3960,7 @@ fs_visitor::lower_logical_sends()
     bool progress = false;
  
     foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
+      const fs_builder ibld(this, block, inst);
  
        switch (inst->opcode) {
        case FS_OPCODE_FB_WRITE_LOGICAL:
@@ -3974,37 +4022,37 @@ fs_visitor::lower_logical_sends()
        case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
           lower_surface_logical_send(ibld, inst,
                                      SHADER_OPCODE_UNTYPED_SURFACE_READ,
-                                    fs_reg());
+                                    fs_reg(0xffff));
           break;
  
        case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
           lower_surface_logical_send(ibld, inst,
                                      SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
           break;
  
        case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
           lower_surface_logical_send(ibld, inst,
                                      SHADER_OPCODE_UNTYPED_ATOMIC,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
           break;
  
        case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
           lower_surface_logical_send(ibld, inst,
                                      SHADER_OPCODE_TYPED_SURFACE_READ,
-                                    fs_reg());
+                                    fs_reg(0xffff));
           break;
  
        case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
           lower_surface_logical_send(ibld, inst,
                                      SHADER_OPCODE_TYPED_SURFACE_WRITE,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
           break;
  
        case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
           lower_surface_logical_send(ibld, inst,
                                      SHADER_OPCODE_TYPED_ATOMIC,
-                                    fs_reg());
+                                    ibld.sample_mask_reg());
           break;
  
        default:
@@ -4073,6 +4121,11 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
        else
           return inst->exec_size;
  
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return 8;
+
     default:
        return inst->exec_size;
     }
@@ -4110,10 +4163,15 @@ fs_visitor::lower_simd_width()
        const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
  
        if (lower_width != inst->exec_size) {
-         /* Builder matching the original instruction. */
+         /* Builder matching the original instruction.  We may also need to
+          * emit an instruction of width larger than the original; set the
+          * execution size of the builder to the larger of the two for now so
+          * we're sure that both cases can be handled.
+          */
           const fs_builder ibld = bld.at(block, inst)
                                      .exec_all(inst->force_writemask_all)
-                                    .group(inst->exec_size, inst->force_sechalf);
+                                    .group(MAX2(inst->exec_size, lower_width),
+                                           inst->force_sechalf);
  
           /* Split the copies in chunks of the execution width of either the
            * original or the lowered instruction, whichever is lower.
@@ -4136,14 +4194,11 @@ fs_visitor::lower_simd_width()
              split_inst.exec_size = lower_width;
              split_inst.eot = inst->eot && i == n - 1;
  
-            /* Set exec_all if the lowered width is higher than the original
-             * to avoid breaking the compiler invariant that no control
-             * flow-masked instruction is wider than the shader's
-             * dispatch_width.  Then transform the sources and destination and
-             * emit the lowered instruction.
+            /* Select the correct channel enables for the i-th group, then
+             * transform the sources and destination and emit the lowered
+             * instruction.
               */
-            const fs_builder lbld = ibld.exec_all(lower_width > inst->exec_size)
-                                        .group(lower_width, i);
+            const fs_builder lbld = ibld.group(lower_width, i);
  
              for (unsigned j = 0; j < inst->sources; j++) {
                 if (inst->src[j].file != BAD_FILE &&
@@ -4616,9 +4671,11 @@ fs_visitor::optimize()
      * Ideally optimization passes wouldn't be part of the visitor so they
      * wouldn't have access to bld at all, but they do, so just in case some
      * pass forgets to ask for a location explicitly set it to NULL here to
-    * make it trip.
+    * make it trip.  The dispatch width is initialized to a bogus value to
+    * make sure that optimizations set the execution controls explicitly to
+    * match the code they are manipulating instead of relying on the defaults.
      */
-   bld = bld.at(NULL, NULL);
+   bld = fs_builder(this, 64);
  
     split_virtual_grfs();