i965/fs: Lower 32x32 bit multiplication on BXT.
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 50ea1b271d9da25ba86d091d33f60f976b97e95d..0278237101e92490e9247e295e764c7a1db38793 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -975,11 +975,11 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
       bld.MOV(wpos, this->pixel_y);
    } else {
       fs_reg pixel_y = this->pixel_y;
-      float offset = (pixel_center_integer ? 0.0 : 0.5);
+      float offset = (pixel_center_integer ? 0.0f : 0.5f);
 
       if (flip) {
         pixel_y.negate = true;
-        offset += key->drawable_height - 1.0;
+        offset += key->drawable_height - 1.0f;
       }
 
       bld.ADD(wpos, pixel_y, fs_reg(offset));
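
For reference, the flipped path above just mirrors window-space Y; a minimal standalone C sketch of that arithmetic (flipped_frag_y is a hypothetical name, not driver code):

    /* Negate pixel_y and add (drawable_height - 1 + offset), where offset is
     * 0.5 unless integer pixel centers were requested. */
    float flipped_frag_y(float pixel_y, float drawable_height,
                         bool pixel_center_integer)
    {
       float offset = pixel_center_integer ? 0.0f : 0.5f;
       offset += drawable_height - 1.0f;
       return -pixel_y + offset;
    }
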
@@ -2258,7 +2258,8 @@ fs_visitor::opt_sampler_eot()
       return false;
 
    /* Look for a texturing instruction immediately before the final FB_WRITE. */
-   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
+   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
+   fs_inst *fb_write = (fs_inst *)block->end();
    assert(fb_write->eot);
    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
 
@@ -2289,9 +2290,11 @@ fs_visitor::opt_sampler_eot()
    assert(!tex_inst->eot); /* We can't get here twice */
    assert((tex_inst->offset & (0xff << 24)) == 0);
 
+   const fs_builder ibld(this, block, tex_inst);
+
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
-   tex_inst->dst = bld.null_reg_ud();
+   tex_inst->dst = ibld.null_reg_ud();
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2303,8 +2306,8 @@ fs_visitor::opt_sampler_eot()
    if (tex_inst->header_size != 0)
       return true;
 
-   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
-                                 load_payload->sources + 1);
+   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
+                                  load_payload->sources + 1);
    fs_reg *new_sources =
       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
 
@@ -2824,7 +2827,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
       if (block->start() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2840,7 +2844,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
                 needs_dep[reg - first_write_grf]) {
-               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
                needs_dep[reg - first_write_grf] = false;
                if (scan_inst->exec_size == 16)
                   needs_dep[reg - first_write_grf + 1] = false;
@@ -2887,7 +2891,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
+               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                               first_write_grf + i);
          }
          return;
       }
@@ -2902,7 +2907,8 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
           scan_inst->dst.reg >= first_write_grf &&
           scan_inst->dst.reg < first_write_grf + write_len &&
           needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
+         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                         scan_inst->dst.reg);
          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
       }
 
@@ -3037,7 +3043,8 @@ fs_visitor::lower_load_payload()
       if (dst.file == MRF)
          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
 
-      const fs_builder hbld = bld.exec_all().group(8, 0).at(block, inst);
+      const fs_builder ibld(this, block, inst);
+      const fs_builder hbld = ibld.exec_all().group(8, 0);
 
       for (uint8_t i = 0; i < inst->header_size; i++) {
          if (inst->src[i].file != BAD_FILE) {
@@ -3048,10 +3055,6 @@ fs_visitor::lower_load_payload()
          dst = offset(dst, hbld, 1);
       }
 
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
-
       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
           inst->exec_size > 8) {
          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
@@ -3127,9 +3130,9 @@ fs_visitor::lower_integer_multiplication()
    bool progress = false;
 
    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
-    * directly, but Cherryview cannot.
+    * directly, but CHV/BXT cannot.
     */
-   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+   if (devinfo->gen >= 8 && !devinfo->is_cherryview && !devinfo->is_broxton)
       return false;
 
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
@@ -3139,7 +3142,7 @@ fs_visitor::lower_integer_multiplication()
            inst->dst.type != BRW_REGISTER_TYPE_UD))
          continue;
 
-      const fs_builder ibld = bld.at(block, inst);
+      const fs_builder ibld(this, block, inst);
 
       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
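
As background on the commit subject, a minimal C sketch of the arithmetic the lowering relies on (mul_32x32_via_16 is a hypothetical helper, not the pass itself), assuming the hardware MUL only consumes the low 16 bits of one source as the comment above describes:

    #include <stdint.h>

    /* Build a 32x32 -> 32 bit product from two 16-bit partial products;
     * everything above bit 31 is discarded, matching a UD destination. */
    uint32_t mul_32x32_via_16(uint32_t a, uint32_t b)
    {
       uint32_t lo = a * (b & 0xffff);   /* a times the low 16 bits of b  */
       uint32_t hi = a * (b >> 16);      /* a times the high 16 bits of b */
       return lo + (hi << 16);           /* shift the high part into place */
    }
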
@@ -3957,9 +3960,7 @@ fs_visitor::lower_logical_sends()
    bool progress = false;
 
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
-      const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
-                                 .group(inst->exec_size, inst->force_sechalf)
-                                 .at(block, inst);
+      const fs_builder ibld(this, block, inst);
 
       switch (inst->opcode) {
       case FS_OPCODE_FB_WRITE_LOGICAL:
@@ -4162,10 +4163,15 @@ fs_visitor::lower_simd_width()
       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
 
       if (lower_width != inst->exec_size) {
-         /* Builder matching the original instruction. */
+         /* Builder matching the original instruction.  We may also need to
+          * emit an instruction of a width larger than the original, so set
+          * the execution size of the builder to the larger of the two for
+          * now to make sure that both cases can be handled.
+          */
          const fs_builder ibld = bld.at(block, inst)
                                     .exec_all(inst->force_writemask_all)
-                                    .group(inst->exec_size, inst->force_sechalf);
+                                    .group(MAX2(inst->exec_size, lower_width),
+                                           inst->force_sechalf);
 
          /* Split the copies in chunks of the execution width of either the
           * original or the lowered instruction, whichever is lower.
@@ -4188,14 +4194,11 @@ fs_visitor::lower_simd_width()
             split_inst.exec_size = lower_width;
             split_inst.eot = inst->eot && i == n - 1;
 
-            /* Set exec_all if the lowered width is higher than the original
-             * to avoid breaking the compiler invariant that no control
-             * flow-masked instruction is wider than the shader's
-             * dispatch_width.  Then transform the sources and destination and
-             * emit the lowered instruction.
+            /* Select the correct channel enables for the i-th group, then
+             * transform the sources and destination and emit the lowered
+             * instruction.
              */
-            const fs_builder lbld = ibld.exec_all(lower_width > inst->exec_size)
-                                        .group(lower_width, i);
+            const fs_builder lbld = ibld.group(lower_width, i);
 
             for (unsigned j = 0; j < inst->sources; j++) {
                if (inst->src[j].file != BAD_FILE &&
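
A rough illustration of how the loop above addresses channel groups (show_split is a hypothetical helper, purely for illustration): a SIMD16 instruction lowered to width 8 is split into n = 16 / 8 = 2 instructions, with group i covering channels i * 8 through i * 8 + 7.

    #include <stdio.h>

    /* Print the channel range handled by each lowered instruction. */
    static void show_split(unsigned exec_size, unsigned lower_width)
    {
       const unsigned n = exec_size / lower_width;
       for (unsigned i = 0; i < n; i++)
          printf("group %u: channels %u..%u\n",
                 i, i * lower_width, (i + 1) * lower_width - 1);
    }
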
@@ -4668,9 +4671,11 @@ fs_visitor::optimize()
     * Ideally optimization passes wouldn't be part of the visitor so they
     * wouldn't have access to bld at all, but they do, so just in case some
     * pass forgets to ask for a location explicitly set it to NULL here to
-    * make it trip.
+    * make it trip.  The dispatch width is initialized to a bogus value to
+    * make sure that optimizations set the execution controls explicitly to
+    * match the code they are manipulating instead of relying on the defaults.
     */
-   bld = bld.at(NULL, NULL);
+   bld = fs_builder(this, 64);
 
    split_virtual_grfs();