i965/vec4: use the IR's execution size
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_generator.cpp
index baf442212cce440753e2ba992da31ca75a99458d..707bd91882e7bf58e182db3bdce691f65b305737 100644 (file)
@@ -112,7 +112,7 @@ generate_tex(struct brw_codegen *p,
              struct brw_reg surface_index,
              struct brw_reg sampler_index)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
    int msg_type = -1;
 
    if (devinfo->gen >= 5) {
@@ -298,8 +298,12 @@ generate_tex(struct brw_codegen *p,
       if (brw_regs_equal(&surface_reg, &sampler_reg)) {
          brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
       } else {
-         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
-         brw_OR(p, addr, addr, surface_reg);
+         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
+            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
+         } else {
+            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+            brw_OR(p, addr, addr, surface_reg);
+         }
       }
       if (base_binding_table_index)
          brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
@@ -728,7 +732,7 @@ generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
 static void
 generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
    const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
 
    /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
@@ -759,7 +763,7 @@ generate_tcs_urb_write(struct brw_codegen *p,
                        vec4_instruction *inst,
                        struct brw_reg urb_header)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
 
    brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
    brw_set_dest(p, send, brw_null_reg());
@@ -929,7 +933,7 @@ generate_vec4_urb_read(struct brw_codegen *p,
                        struct brw_reg dst,
                        struct brw_reg header)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
 
    assert(header.file == BRW_GENERAL_REGISTER_FILE);
    assert(header.type == BRW_REGISTER_TYPE_UD);
@@ -954,7 +958,7 @@ generate_tcs_release_input(struct brw_codegen *p,
                            struct brw_reg vertex,
                            struct brw_reg is_unpaired)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
 
    assert(vertex.file == BRW_IMMEDIATE_VALUE);
    assert(vertex.type == BRW_REGISTER_TYPE_UD);
@@ -1034,7 +1038,7 @@ generate_tcs_create_barrier_header(struct brw_codegen *p,
                                    struct brw_vue_prog_data *prog_data,
                                    struct brw_reg dst)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
    const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
    struct brw_reg m0_2 = get_element_ud(dst, 2);
    unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;
@@ -1123,7 +1127,7 @@ generate_scratch_read(struct brw_codegen *p,
                       struct brw_reg dst,
                       struct brw_reg index)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
    struct brw_reg header = brw_vec8_grf(0, 0);
 
    gen6_resolve_implied_move(p, &header, inst->base_mrf);
@@ -1140,6 +1144,11 @@ generate_scratch_read(struct brw_codegen *p,
    else
       msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
 
+   const unsigned target_cache =
+      devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+      devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+      BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+
    /* Each of the 8 channel enables is considered for whether each
     * dword is written.
     */
@@ -1151,8 +1160,7 @@ generate_scratch_read(struct brw_codegen *p,
    brw_set_dp_read_message(p, send,
                            brw_scratch_surface_idx(p),
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
-                          msg_type,
-                          BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
+                          msg_type, target_cache,
                           2, /* mlen */
                            true, /* header_present */
                           1 /* rlen */);
@@ -1165,7 +1173,11 @@ generate_scratch_write(struct brw_codegen *p,
                        struct brw_reg src,
                        struct brw_reg index)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned target_cache =
+      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
    struct brw_reg header = brw_vec8_grf(0, 0);
    bool write_commit;
 
@@ -1225,6 +1237,7 @@ generate_scratch_write(struct brw_codegen *p,
                             brw_scratch_surface_idx(p),
                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                            msg_type,
+                            target_cache,
                            3, /* mlen */
                            true, /* header present */
                            false, /* not a render target write */
@@ -1241,7 +1254,10 @@ generate_pull_constant_load(struct brw_codegen *p,
                             struct brw_reg index,
                             struct brw_reg offset)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned target_cache =
+      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
+       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
    assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
    uint32_t surf_index = index.ud;
@@ -1287,7 +1303,7 @@ generate_pull_constant_load(struct brw_codegen *p,
                           surf_index,
                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                           msg_type,
-                          BRW_DATAPORT_READ_TARGET_DATA_CACHE,
+                           target_cache,
                           2, /* mlen */
                            true, /* header_present */
                           1 /* rlen */);
@@ -1469,12 +1485,13 @@ generate_code(struct brw_codegen *p,
               struct brw_vue_prog_data *prog_data,
               const struct cfg_t *cfg)
 {
-   const struct brw_device_info *devinfo = p->devinfo;
+   const struct gen_device_info *devinfo = p->devinfo;
    const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
    bool debug_flag = INTEL_DEBUG &
       intel_debug_flag_for_shader_stage(nir->stage);
    struct annotation_info annotation;
    memset(&annotation, 0, sizeof(annotation));
+   int spill_count = 0, fill_count = 0;
    int loop_count = 0;
 
    foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
@@ -1494,39 +1511,12 @@ generate_code(struct brw_codegen *p,
       brw_set_default_saturate(p, inst->saturate);
       brw_set_default_mask_control(p, inst->force_writemask_all);
       brw_set_default_acc_write_control(p, inst->writes_accumulator);
+      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
 
       assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
       assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
 
       unsigned pre_emit_nr_insn = p->nr_insn;
-      bool fix_exec_size = false;
-
-      if (dst.width == BRW_WIDTH_4) {
-         /* This happens in attribute fixups for "dual instanced" geometry
-          * shaders, since they use attributes that are vec4's.  Since the exec
-          * width is only 4, it's essential that the caller set
-          * force_writemask_all in order to make sure the instruction is executed
-          * regardless of which channels are enabled.
-          */
-         assert(inst->force_writemask_all);
-
-         /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
-          * the following register region restrictions (from Graphics BSpec:
-          * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
-          * > Register Region Restrictions)
-          *
-          *     1. ExecSize must be greater than or equal to Width.
-          *
-          *     2. If ExecSize = Width and HorzStride != 0, VertStride must be set
-          *        to Width * HorzStride."
-          */
-         for (int i = 0; i < 3; i++) {
-            if (src[i].file == BRW_GENERAL_REGISTER_FILE)
-               src[i] = stride(src[i], 4, 4, 1);
-         }
-         brw_set_default_exec_size(p, BRW_EXECUTE_4);
-         fix_exec_size = true;
-      }
 
       switch (inst->opcode) {
       case VEC4_OPCODE_UNPACK_UNIFORM:
@@ -1637,6 +1627,9 @@ generate_code(struct brw_codegen *p,
          /* FBL only supports UD type for dst. */
          brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
          break;
+      case BRW_OPCODE_LZD:
+         brw_LZD(p, dst, src[0]);
+         break;
       case BRW_OPCODE_CBIT:
          assert(devinfo->gen >= 7);
          /* CBIT only supports UD type for dst. */
@@ -1755,10 +1748,12 @@ generate_code(struct brw_codegen *p,
 
       case SHADER_OPCODE_GEN4_SCRATCH_READ:
          generate_scratch_read(p, inst, dst, src[0]);
+         fill_count++;
          break;
 
       case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
          generate_scratch_write(p, inst, dst, src[0], src[1]);
+         spill_count++;
          break;
 
       case VS_OPCODE_PULL_CONSTANT_LOAD:
@@ -1881,11 +1876,17 @@ generate_code(struct brw_codegen *p,
          brw_memory_fence(p, dst);
          break;
 
-      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
-         brw_find_live_channel(p, dst);
+      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+         const struct brw_reg mask =
+            brw_stage_has_packed_dispatch(devinfo, nir->stage,
+                                          &prog_data->base) ? brw_imm_ud(~0u) :
+            brw_dmask_reg();
+         brw_find_live_channel(p, dst, mask);
          break;
+      }
 
       case SHADER_OPCODE_BROADCAST:
+         assert(inst->force_writemask_all);
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
@@ -1909,6 +1910,100 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case VEC4_OPCODE_FROM_DOUBLE: {
+         assert(type_sz(src[0].type) == 8);
+         assert(type_sz(dst.type) == 4);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+         dst.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, dst, src[0]);
+
+         struct brw_reg dst_as_src = dst;
+         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+         dst.width = BRW_WIDTH_8;
+         brw_MOV(p, dst, dst_as_src);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_TO_DOUBLE: {
+         assert(type_sz(src[0].type) == 4);
+         assert(type_sz(dst.type) == 8);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         struct brw_reg tmp = retype(dst, src[0].type);
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, tmp, src[0]);
+
+         tmp.vstride = BRW_VERTICAL_STRIDE_8;
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         brw_MOV(p, dst, tmp);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_PICK_LOW_32BIT:
+      case VEC4_OPCODE_PICK_HIGH_32BIT: {
+         /* Stores the low/high 32 bits of each 64-bit element in src[0] into
+          * dst using ALIGN1 mode and a <8;4,2>:UD region on the source.
+          */
+         assert(type_sz(src[0].type) == 8);
+         assert(type_sz(dst.type) == 4);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst = retype(dst, BRW_REGISTER_TYPE_UD);
+         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+
+         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
+            src[0] = suboffset(src[0], 1);
+         src[0].vstride = BRW_VERTICAL_STRIDE_8;
+         src[0].width = BRW_WIDTH_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
+         brw_MOV(p, dst, src[0]);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_SET_LOW_32BIT:
+      case VEC4_OPCODE_SET_HIGH_32BIT: {
+         /* Reads consecutive 32-bit elements from src[0] and writes
+          * them to the low/high 32 bits of each 64-bit element in dst.
+          */
+         assert(type_sz(src[0].type) == 4);
+         assert(type_sz(dst.type) == 8);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst = retype(dst, BRW_REGISTER_TYPE_UD);
+         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
+            dst = suboffset(dst, 1);
+         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+
+         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+         brw_MOV(p, dst, src[0]);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
       case VEC4_OPCODE_PACK_BYTES: {
          /* Is effectively:
           *
@@ -2013,13 +2108,17 @@ generate_code(struct brw_codegen *p,
          generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
          break;
 
+      case BRW_OPCODE_DIM:
+         assert(devinfo->is_haswell);
+         assert(src[0].type == BRW_REGISTER_TYPE_DF);
+         assert(dst.type == BRW_REGISTER_TYPE_DF);
+         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
+         break;
+
       default:
          unreachable("Unsupported opcode");
       }
 
-      if (fix_exec_size)
-         brw_set_default_exec_size(p, BRW_EXECUTE_8);
-
       if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
          /* Handled dependency hints in the generator. */
 
@@ -2038,7 +2137,7 @@ generate_code(struct brw_codegen *p,
       }
    }
 
-   brw_set_uip_jip(p);
+   brw_set_uip_jip(p, 0);
    annotation_finalize(&annotation, p->next_insn_offset);
 
 #ifndef NDEBUG
@@ -2054,13 +2153,13 @@ generate_code(struct brw_codegen *p,
 
    if (unlikely(debug_flag)) {
       fprintf(stderr, "Native code for %s %s shader %s:\n",
-              nir->info.label ? nir->info.label : "unnamed",
-              _mesa_shader_stage_to_string(nir->stage), nir->info.name);
+              nir->info->label ? nir->info->label : "unnamed",
+              _mesa_shader_stage_to_string(nir->stage), nir->info->name);
 
-      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles."
-                      "Compacted %d to %d bytes (%.0f%%)\n",
-              stage_abbrev,
-              before_size / 16, loop_count, cfg->cycle_count, before_size, after_size,
+      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
+                      "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
+              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
+              spill_count, fill_count, before_size, after_size,
               100.0f * (before_size - after_size) / before_size);
 
       dump_assembly(p->store, annotation.ann_count, annotation.ann,
@@ -2071,10 +2170,11 @@ generate_code(struct brw_codegen *p,
 
    compiler->shader_debug_log(log_data,
                               "%s vec4 shader: %d inst, %d loops, %u cycles, "
-                              "compacted %d to %d bytes.",
+                              "%d:%d spills:fills, compacted %d to %d bytes.",
                               stage_abbrev, before_size / 16,
-                              loop_count, cfg->cycle_count,
-                              before_size, after_size);
+                              loop_count, cfg->cycle_count, spill_count,
+                              fill_count, before_size, after_size);
+
 }
 
 extern "C" const unsigned *