i965/vec4: use the IR's execution size

[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_generator.cpp
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp

index 7ad4f86aebd209027626516b5b2355a5f3199b3e..707bd91882e7bf58e182db3bdce691f65b305737 100644 (file)
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -112,7 +112,7 @@ generate_tex(struct brw_codegen *p,
               struct brw_reg surface_index,
               struct brw_reg sampler_index)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
     int msg_type = -1;
  
     if (devinfo->gen >= 5) {
@@ -298,8 +298,12 @@ generate_tex(struct brw_codegen *p,
        if (brw_regs_equal(&surface_reg, &sampler_reg)) {
           brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
        } else {
-         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
-         brw_OR(p, addr, addr, surface_reg);
+         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
+            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
+         } else {
+            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+            brw_OR(p, addr, addr, surface_reg);
+         }
        }
        if (base_binding_table_index)
           brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
@@ -728,7 +732,7 @@ generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
  static void
  generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
     const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
  
     /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
@@ -759,7 +763,7 @@ generate_tcs_urb_write(struct brw_codegen *p,
                         vec4_instruction *inst,
                         struct brw_reg urb_header)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
  
     brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
     brw_set_dest(p, send, brw_null_reg());
@@ -929,7 +933,7 @@ generate_vec4_urb_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg header)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
  
     assert(header.file == BRW_GENERAL_REGISTER_FILE);
     assert(header.type == BRW_REGISTER_TYPE_UD);
@@ -954,7 +958,7 @@ generate_tcs_release_input(struct brw_codegen *p,
                             struct brw_reg vertex,
                             struct brw_reg is_unpaired)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
  
     assert(vertex.file == BRW_IMMEDIATE_VALUE);
     assert(vertex.type == BRW_REGISTER_TYPE_UD);
@@ -1034,7 +1038,7 @@ generate_tcs_create_barrier_header(struct brw_codegen *p,
                                     struct brw_vue_prog_data *prog_data,
                                     struct brw_reg dst)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
     const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
     struct brw_reg m0_2 = get_element_ud(dst, 2);
     unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
@@ -1123,7 +1127,7 @@ generate_scratch_read(struct brw_codegen *p,
                        struct brw_reg dst,
                        struct brw_reg index)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
     struct brw_reg header = brw_vec8_grf(0, 0);
  
     gen6_resolve_implied_move(p, &header, inst->base_mrf);
@@ -1140,6 +1144,11 @@ generate_scratch_read(struct brw_codegen *p,
     else
        msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
  
+   const unsigned target_cache =
+      devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+      devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+
     /* Each of the 8 channel enables is considered for whether each
      * dword is written.
      */
@@ -1151,8 +1160,7 @@ generate_scratch_read(struct brw_codegen *p,
     brw_set_dp_read_message(p, send,
                             brw_scratch_surface_idx(p),
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-                          msg_type,
-                          BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
+                          msg_type, target_cache,
                            2, /* mlen */
                             true, /* header_present */
                            1 /* rlen */);
@@ -1165,7 +1173,11 @@ generate_scratch_write(struct brw_codegen *p,
                         struct brw_reg src,
                         struct brw_reg index)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned target_cache =
+      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
     struct brw_reg header = brw_vec8_grf(0, 0);
     bool write_commit;
  
@@ -1225,6 +1237,7 @@ generate_scratch_write(struct brw_codegen *p,
                              brw_scratch_surface_idx(p),
                             BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                             msg_type,
+                            target_cache,
                             3, /* mlen */
                             true, /* header present */
                             false, /* not a render target write */
@@ -1241,7 +1254,10 @@ generate_pull_constant_load(struct brw_codegen *p,
                              struct brw_reg index,
                              struct brw_reg offset)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned target_cache =
+      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
+       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
     assert(index.file == BRW_IMMEDIATE_VALUE &&
           index.type == BRW_REGISTER_TYPE_UD);
     uint32_t surf_index = index.ud;
@@ -1287,7 +1303,7 @@ generate_pull_constant_load(struct brw_codegen *p,
                            surf_index,
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                            msg_type,
-                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           target_cache,
                            2, /* mlen */
                             true, /* header_present */
                            1 /* rlen */);
@@ -1469,12 +1485,13 @@ generate_code(struct brw_codegen *p,
                struct brw_vue_prog_data *prog_data,
                const struct cfg_t *cfg)
  {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
     const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
     bool debug_flag = INTEL_DEBUG &
        intel_debug_flag_for_shader_stage(nir->stage);
     struct annotation_info annotation;
     memset(&annotation, 0, sizeof(annotation));
+   int spill_count = 0, fill_count = 0;
     int loop_count = 0;
  
     foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
@@ -1494,39 +1511,12 @@ generate_code(struct brw_codegen *p,
        brw_set_default_saturate(p, inst->saturate);
        brw_set_default_mask_control(p, inst->force_writemask_all);
        brw_set_default_acc_write_control(p, inst->writes_accumulator);
+      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
  
        assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
        assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
  
        unsigned pre_emit_nr_insn = p->nr_insn;
-      bool fix_exec_size = false;
-
-      if (dst.width == BRW_WIDTH_4) {
-         /* This happens in attribute fixups for "dual instanced" geometry
-          * shaders, since they use attributes that are vec4's.  Since the exec
-          * width is only 4, it's essential that the caller set
-          * force_writemask_all in order to make sure the instruction is executed
-          * regardless of which channels are enabled.
-          */
-         assert(inst->force_writemask_all);
-
-         /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
-          * the following register region restrictions (from Graphics BSpec:
-          * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
-          * > Register Region Restrictions)
-          *
-          *     1. ExecSize must be greater than or equal to Width.
-          *
-          *     2. If ExecSize = Width and HorzStride != 0, VertStride must be set
-          *        to Width * HorzStride."
-          */
-         for (int i = 0; i < 3; i++) {
-            if (src[i].file == BRW_GENERAL_REGISTER_FILE)
-               src[i] = stride(src[i], 4, 4, 1);
-         }
-         brw_set_default_exec_size(p, BRW_EXECUTE_4);
-         fix_exec_size = true;
-      }
  
        switch (inst->opcode) {
        case VEC4_OPCODE_UNPACK_UNIFORM:
@@ -1758,10 +1748,12 @@ generate_code(struct brw_codegen *p,
  
        case SHADER_OPCODE_GEN4_SCRATCH_READ:
           generate_scratch_read(p, inst, dst, src[0]);
+         fill_count++;
           break;
  
        case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
           generate_scratch_write(p, inst, dst, src[0], src[1]);
+         spill_count++;
           break;
  
        case VS_OPCODE_PULL_CONSTANT_LOAD:
@@ -1884,9 +1876,14 @@ generate_code(struct brw_codegen *p,
           brw_memory_fence(p, dst);
           break;
  
-      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
-         brw_find_live_channel(p, dst);
+      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+         const struct brw_reg mask =
+            brw_stage_has_packed_dispatch(devinfo, nir->stage,
+                                          &prog_data->base) ? brw_imm_ud(~0u) :
+            brw_dmask_reg();
+         brw_find_live_channel(p, dst, mask);
           break;
+      }
  
        case SHADER_OPCODE_BROADCAST:
           assert(inst->force_writemask_all);
@@ -1913,6 +1910,100 @@ generate_code(struct brw_codegen *p,
           break;
        }
  
+      case VEC4_OPCODE_FROM_DOUBLE: {
+         assert(type_sz(src[0].type) == 8);
+         assert(type_sz(dst.type) == 4);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+         dst.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, dst, src[0]);
+
+         struct brw_reg dst_as_src = dst;
+         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+         dst.width = BRW_WIDTH_8;
+         brw_MOV(p, dst, dst_as_src);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_TO_DOUBLE: {
+         assert(type_sz(src[0].type) == 4);
+         assert(type_sz(dst.type) == 8);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         struct brw_reg tmp = retype(dst, src[0].type);
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, tmp, src[0]);
+
+         tmp.vstride = BRW_VERTICAL_STRIDE_8;
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         brw_MOV(p, dst, tmp);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_PICK_LOW_32BIT:
+      case VEC4_OPCODE_PICK_HIGH_32BIT: {
+         /* Stores the low/high 32-bit of each 64-bit element in src[0] into
+          * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
+          */
+         assert(type_sz(src[0].type) == 8);
+         assert(type_sz(dst.type) == 4);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst = retype(dst, BRW_REGISTER_TYPE_UD);
+         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+
+         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
+            src[0] = suboffset(src[0], 1);
+         src[0].vstride = BRW_VERTICAL_STRIDE_8;
+         src[0].width = BRW_WIDTH_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
+         brw_MOV(p, dst, src[0]);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_SET_LOW_32BIT:
+      case VEC4_OPCODE_SET_HIGH_32BIT: {
+         /* Reads consecutive 32-bit elements from src[0] and writes
+          * them to the low/high 32-bit of each 64-bit element in dst.
+          */
+         assert(type_sz(src[0].type) == 4);
+         assert(type_sz(dst.type) == 8);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst = retype(dst, BRW_REGISTER_TYPE_UD);
+         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
+            dst = suboffset(dst, 1);
+         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+
+         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+         brw_MOV(p, dst, src[0]);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
        case VEC4_OPCODE_PACK_BYTES: {
           /* Is effectively:
            *
@@ -2028,9 +2119,6 @@ generate_code(struct brw_codegen *p,
           unreachable("Unsupported opcode");
        }
  
-      if (fix_exec_size)
-         brw_set_default_exec_size(p, BRW_EXECUTE_8);
-
        if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
           /* Handled dependency hints in the generator. */
  
@@ -2049,7 +2137,7 @@ generate_code(struct brw_codegen *p,
        }
     }
  
-   brw_set_uip_jip(p);
+   brw_set_uip_jip(p, 0);
     annotation_finalize(&annotation, p->next_insn_offset);
  
  #ifndef NDEBUG
@@ -2065,13 +2153,13 @@ generate_code(struct brw_codegen *p,
  
     if (unlikely(debug_flag)) {
        fprintf(stderr, "Native code for %s %s shader %s:\n",
-              nir->info.label ? nir->info.label : "unnamed",
-              _mesa_shader_stage_to_string(nir->stage), nir->info.name);
+              nir->info->label ? nir->info->label : "unnamed",
+              _mesa_shader_stage_to_string(nir->stage), nir->info->name);
  
-      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles."
-                      "Compacted %d to %d bytes (%.0f%%)\n",
-              stage_abbrev,
-              before_size / 16, loop_count, cfg->cycle_count, before_size, after_size,
+      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
+                      "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
+              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
+              spill_count, fill_count, before_size, after_size,
                100.0f * (before_size - after_size) / before_size);
  
        dump_assembly(p->store, annotation.ann_count, annotation.ann,
@@ -2082,10 +2170,11 @@ generate_code(struct brw_codegen *p,
  
     compiler->shader_debug_log(log_data,
                                "%s vec4 shader: %d inst, %d loops, %u cycles, "
-                              "compacted %d to %d bytes.",
+                              "%d:%d spills:fills, compacted %d to %d bytes.",
                                stage_abbrev, before_size / 16,
-                              loop_count, cfg->cycle_count,
-                              before_size, after_size);
+                              loop_count, cfg->cycle_count, spill_count,
+                              fill_count, before_size, after_size);
+
  }
  
  extern "C" const unsigned *