intel/compiler: Remove cs_prog_data->threads

[mesa.git] / src / intel / compiler / brw_fs.cpp
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp

index f284a2b6644fc8d41a951038c705644d87fe447f..4e13dcca54adcd0268f2b083d2ca4d9b814df295 100644 (file)
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -1190,6 +1190,8 @@ fs_visitor::import_uniforms(fs_visitor *v)
     this->pull_constant_loc = v->pull_constant_loc;
     this->uniforms = v->uniforms;
     this->subgroup_id = v->subgroup_id;
+   for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
+      this->group_size[i] = v->group_size[i];
  }
  
  void
@@ -1641,6 +1643,26 @@ fs_visitor::assign_curb_setup()
     this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
  }
  
+/*
+ * Build up an array of indices into the urb_setup array that
+ * references the active entries of the urb_setup array.
+ * Used to accelerate walking the active entries of the urb_setup array
+ * on each upload.
+ */
+void
+brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
+{
+   /* Make sure uint8_t is sufficient */
+   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
+   uint8_t index = 0;
+   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
+      if (wm_prog_data->urb_setup[attr] >= 0) {
+         wm_prog_data->urb_setup_attribs[index++] = attr;
+      }
+   }
+   wm_prog_data->urb_setup_attribs_count = index;
+}
+
  static void
  calculate_urb_setup(const struct gen_device_info *devinfo,
                      const struct brw_wm_prog_key *key,
@@ -1679,7 +1701,7 @@ calculate_urb_setup(const struct gen_device_info *devinfo,
           struct brw_vue_map prev_stage_vue_map;
           brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                               key->input_slots_valid,
-                             nir->info.separate_shader);
+                             nir->info.separate_shader, 1);
  
           int first_slot =
              brw_compute_first_urb_slot_required(nir->info.inputs_read,
@@ -1728,6 +1750,9 @@ calculate_urb_setup(const struct gen_device_info *devinfo,
     }
  
     prog_data->num_varying_inputs = urb_next;
+   prog_data->inputs = nir->info.inputs_read;
+
+   brw_compute_urb_setup_index(prog_data);
  }
  
  void
@@ -7811,8 +7836,15 @@ fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
        const int iteration = 99;
        int pass_num = 0;
  
-      if (OPT(opt_cmod_propagation))
-         OPT(dead_code_eliminate);
+      if (OPT(opt_cmod_propagation)) {
+         /* dead_code_eliminate "undoes" the fixing done by
+          * fixup_3src_null_dest, so we have to do it again if
+          * dead_code_eliminiate makes any progress.
+          */
+         if (OPT(dead_code_eliminate))
+            fixup_3src_null_dest();
+      }
+
  
        /* We only allow spilling for the last schedule mode and only if the
         * allow_spilling parameter and dispatch width work out ok.
@@ -8164,6 +8196,8 @@ gen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
  
     wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
     wm_prog_data->num_varying_inputs = 1;
+
+   brw_compute_urb_setup_index(wm_prog_data);
  }
  
  bool
@@ -8775,6 +8809,16 @@ fs_visitor::emit_cs_work_group_id_setup()
     return reg;
  }
  
+unsigned
+brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
+                             unsigned threads)
+{
+   assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
+   assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
+   return cs_prog_data->push.per_thread.size * threads +
+          cs_prog_data->push.cross_thread.size;
+}
+
  static void
  fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
  {
@@ -8813,11 +8857,6 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo,
     fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
     fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
  
-   unsigned total_dwords =
-      (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
-       cs_prog_data->push.cross_thread.size) / 4;
-   fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
-
     assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
            cs_prog_data->push.per_thread.size == 0);
     assert(cs_prog_data->push.cross_thread.dwords +
@@ -8825,15 +8864,6 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo,
               prog_data->nr_params);
  }
  
-static void
-cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
-{
-   cs_prog_data->simd_size = size;
-   unsigned group_size = cs_prog_data->local_size[0] *
-      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
-   cs_prog_data->threads = (group_size + size - 1) / size;
-}
-
  static nir_shader *
  compile_cs_to_nir(const struct brw_compiler *compiler,
                    void *mem_ctx,
@@ -8866,13 +8896,20 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
                 char **error_str)
  {
     prog_data->base.total_shared = src_shader->info.cs.shared_size;
-   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
-   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
-   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
     prog_data->slm_size = src_shader->num_shared;
-   unsigned local_workgroup_size =
-      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
-      src_shader->info.cs.local_size[2];
+
+   unsigned local_workgroup_size;
+   if (prog_data->uses_variable_group_size) {
+      prog_data->max_variable_local_size =
+         src_shader->info.cs.max_variable_local_size;
+      local_workgroup_size = src_shader->info.cs.max_variable_local_size;
+   } else {
+      prog_data->local_size[0] = src_shader->info.cs.local_size[0];
+      prog_data->local_size[1] = src_shader->info.cs.local_size[1];
+      prog_data->local_size[2] = src_shader->info.cs.local_size[2];
+      local_workgroup_size = src_shader->info.cs.local_size[0] *
+         src_shader->info.cs.local_size[1] * src_shader->info.cs.local_size[2];
+   }
  
     /* Limit max_threads to 64 for the GPGPU_WALKER command */
     const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
@@ -8919,7 +8956,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
           assert(v8->max_dispatch_width >= 32);
  
           v = v8;
-         cs_set_simd_size(prog_data, 8);
+         prog_data->simd_size = 8;
           cs_fill_push_const_info(compiler->devinfo, prog_data);
        }
     }
@@ -8949,7 +8986,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
           assert(v16->max_dispatch_width >= 32);
  
           v = v16;
-         cs_set_simd_size(prog_data, 16);
+         prog_data->simd_size = 16;
           cs_fill_push_const_info(compiler->devinfo, prog_data);
        }
     }
@@ -8981,7 +9018,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
           }
        } else {
           v = v32;
-         cs_set_simd_size(prog_data, 32);
+         prog_data->simd_size = 32;
           cs_fill_push_const_info(compiler->devinfo, prog_data);
        }
     }