/* Determine the number of threads per wave for all stages. */
device->cs_wave_size = 64;
device->ps_wave_size = 64;
+ device->ge_wave_size = 64;
/* Wave32 is opt-in per stage through the perftest flags, GFX10+ only. */
if (device->rad_info.chip_class >= GFX10) {
if (device->instance->perftest_flags & RADV_PERFTEST_CS_WAVE_32)
device->cs_wave_size = 32;

/* For pixel shaders, wave64 is recommended. */
if (device->instance->perftest_flags & RADV_PERFTEST_PS_WAVE_32)
device->ps_wave_size = 32;
+
+ if (device->instance->perftest_flags & RADV_PERFTEST_GE_WAVE_32)
+ device->ge_wave_size = 32;
}
radv_physical_device_init_mem_types(device);
{"tccompatcmask", RADV_PERFTEST_TC_COMPAT_CMASK},
{"cswave32", RADV_PERFTEST_CS_WAVE_32},
{"pswave32", RADV_PERFTEST_PS_WAVE_32},
+ {"gewave32", RADV_PERFTEST_GE_WAVE_32},
{NULL, 0}
};
/* GFX6 bug workaround - limit LS-HS threadgroups to only one wave. */
if (ctx->options->chip_class == GFX6) {
- unsigned one_wave = 64 / MAX2(num_tcs_input_cp, num_tcs_output_cp);
+ unsigned one_wave = ctx->options->ge_wave_size / MAX2(num_tcs_input_cp, num_tcs_output_cp);
num_patches = MIN2(num_patches, one_wave);
}
return num_patches;
LLVMValueRef wave_idx = ac_unpack_param(&ctx->ac, ctx->merged_wave_info, 24, 4);
vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
LLVMBuildMul(ctx->ac.builder, wave_idx,
- LLVMConstInt(ctx->ac.i32, 64, false), ""), "");
+ LLVMConstInt(ctx->ac.i32,
+ ctx->ac.wave_size, false), ""), "");
lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
}
LLVMBuilderRef builder = ctx->ac.builder;
LLVMValueRef tmp;
tmp = LLVMBuildMul(builder, get_wave_id_in_tg(ctx),
- LLVMConstInt(ctx->ac.i32, 64, false), "");
+ LLVMConstInt(ctx->ac.i32, ctx->ac.wave_size, false), "");
return LLVMBuildAdd(builder, tmp, ac_get_thread_id(&ctx->ac), "");
}
*/
LLVMTypeRef v2i64 = LLVMVectorType(ctx->ac.i64, 2);
uint64_t stream_offset = 0;
- unsigned num_records = 64;
+ unsigned num_records = ctx->ac.wave_size;
LLVMValueRef base_ring;
base_ring =
ring = LLVMBuildInsertElement(ctx->ac.builder,
ring, tmp, ctx->ac.i32_0, "");
- stream_offset += stride * 64;
+ stream_offset += stride * ctx->ac.wave_size;
ring = LLVMBuildBitCast(ctx->ac.builder, ring,
ctx->ac.v4i32, "");
return options->cs_wave_size;
else if (shaders[0]->info.stage == MESA_SHADER_FRAGMENT)
return options->ps_wave_size;
- return 64;
+ return options->ge_wave_size;
}
static
/* Round up towards full wave sizes for better ALU utilization. */
if (!max_vert_out_per_gs_instance) {
- const unsigned wavesize = 64;
+ const unsigned wavesize = pipeline->device->physical_device->ge_wave_size;
unsigned orig_max_esverts;
unsigned orig_max_gsprims;
do {
if (pipeline->device->physical_device->rad_info.chip_class >= GFX9)
stages |= S_028B54_MAX_PRIMGRP_IN_WAVE(2);
+ if (pipeline->device->physical_device->rad_info.chip_class >= GFX10 &&
+ pipeline->device->physical_device->ge_wave_size == 32) {
+ /* legacy GS only supports Wave64 */
+ stages |= S_028B54_HS_W32_EN(1) |
+ S_028B54_GS_W32_EN(radv_pipeline_has_ngg(pipeline)) |
+ S_028B54_VS_W32_EN(1);
+ }
+
return stages;
}
return pdevice->cs_wave_size;
else if (stage == MESA_SHADER_FRAGMENT)
return pdevice->ps_wave_size;
- return 64;
+ return pdevice->ge_wave_size;
}
static void radv_postprocess_config(const struct radv_physical_device *pdevice,
options->address32_hi = device->physical_device->rad_info.address32_hi;
options->cs_wave_size = device->physical_device->cs_wave_size;
options->ps_wave_size = device->physical_device->ps_wave_size;
+ options->ge_wave_size = device->physical_device->ge_wave_size;
if (options->supports_spill)
tm_options |= AC_TM_SUPPORTS_SPILL;