unsigned max_waves_per_sh = 0;
uint64_t va;
- pipeline->cs.buf = malloc(20 * 4);
- pipeline->cs.max_dw = 20;
+ pipeline->cs.max_dw = device->physical_device->rad_info.chip_class >= GFX10 ? 22 : 20;
+ pipeline->cs.buf = malloc(pipeline->cs.max_dw * 4);
compute_shader = pipeline->shaders[MESA_SHADER_COMPUTE];
va = radv_buffer_get_va(compute_shader->bo) + compute_shader->bo_offset;
radeon_set_sh_reg_seq(&pipeline->cs, R_00B848_COMPUTE_PGM_RSRC1, 2);
radeon_emit(&pipeline->cs, compute_shader->config.rsrc1);
radeon_emit(&pipeline->cs, compute_shader->config.rsrc2);
+ if (device->physical_device->rad_info.chip_class >= GFX10) {
+ radeon_set_sh_reg(&pipeline->cs, R_00B8A0_COMPUTE_PGM_RSRC3, compute_shader->config.rsrc3);
+ }
radeon_set_sh_reg(&pipeline->cs, R_00B860_COMPUTE_TMPRING_SIZE,
S_00B860_WAVES(pipeline->max_waves) |
unsigned num_vgprs = MAX2(config_in->num_vgprs, num_input_vgprs);
/* +3 for scratch wave offset and VCC */
unsigned num_sgprs = MAX2(config_in->num_sgprs, info->num_input_sgprs + 3);
+ unsigned num_shared_vgprs = config_in->num_shared_vgprs;
+ /* shared VGPRs are introduced in Navi and are allocated in blocks of 8 (RDNA ref 3.6.5) */
+ assert((pdevice->rad_info.chip_class >= GFX10 && num_shared_vgprs % 8 == 0)
+ || (pdevice->rad_info.chip_class < GFX10 && num_shared_vgprs == 0));
+ unsigned num_shared_vgpr_blocks = num_shared_vgprs / 8;
*config_out = *config_in;
config_out->num_vgprs = num_vgprs;
config_out->num_sgprs = num_sgprs;
+ config_out->num_shared_vgprs = num_shared_vgprs;
/* Enable 64-bit and 16-bit denormals, because there is no performance
* cost.
config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
config_out->rsrc2 |= S_00B12C_OC_LDS_EN(1);
}
+ config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
break;
case MESA_SHADER_TESS_CTRL:
if (pdevice->rad_info.chip_class >= GFX9) {
}
config_out->rsrc1 |= S_00B428_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) |
S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10);
+ config_out->rsrc2 |= S_00B42C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
break;
case MESA_SHADER_VERTEX:
if (info->is_ngg) {
}
config_out->rsrc1 |= S_00B128_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
+ config_out->rsrc2 |= S_00B12C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
}
break;
case MESA_SHADER_FRAGMENT:
config_out->rsrc1 |= S_00B028_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10);
+ config_out->rsrc2 |= S_00B02C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
break;
case MESA_SHADER_GEOMETRY:
config_out->rsrc1 |= S_00B228_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) |
S_00B848_WGP_MODE(pdevice->rad_info.chip_class >= GFX10);
+ config_out->rsrc2 |= S_00B22C_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
break;
case MESA_SHADER_COMPUTE:
config_out->rsrc1 |= S_00B848_MEM_ORDERED(pdevice->rad_info.chip_class >= GFX10) |
info->cs.uses_thread_id[1] ? 1 : 0) |
S_00B84C_TG_SIZE_EN(info->cs.uses_local_invocation_idx) |
S_00B84C_LDS_SIZE(config_in->lds_size);
+ config_out->rsrc3 |= S_00B8A0_SHARED_VGPR_CNT(num_shared_vgpr_blocks);
+
break;
default:
unreachable("unsupported shader type");