radeonsi: enable LLVM optimizations that assume no NaNs for non-compute shaders
author Marek Olšák <marek.olsak@amd.com>
Sun, 4 Jan 2015 16:08:57 +0000 (17:08 +0100)
committer Marek Olšák <marek.olsak@amd.com>
Wed, 7 Jan 2015 17:27:54 +0000 (18:27 +0100)
v2: complete rewrite

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
src/gallium/drivers/radeonsi/si_shader.c
src/gallium/drivers/radeonsi/si_shader.h
src/gallium/drivers/radeonsi/si_state_shaders.c

index 5d61a549fe6626251b7d0b938746daa4bc12dfc1..cf28860a35f6cb8360d264ba8fbd485354cf1c67 100644 (file)
@@ -2369,6 +2369,10 @@ static void create_function(struct si_shader_context *si_shader_ctx)
        radeon_llvm_create_func(&si_shader_ctx->radeon_bld, params, num_params);
        radeon_llvm_shader_type(si_shader_ctx->radeon_bld.main_fn, si_shader_ctx->type);
 
+       if (shader->dx10_clamp_mode)
+               LLVMAddTargetDependentFunctionAttr(si_shader_ctx->radeon_bld.main_fn,
+                                                  "enable-no-nans-fp-math", "true");
+
        for (i = 0; i <= last_sgpr; ++i) {
                LLVMValueRef P = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, i);
 
@@ -2723,6 +2727,9 @@ int si_shader_create(struct si_screen *sscreen, struct si_shader *shader)
        radeon_llvm_context_init(&si_shader_ctx.radeon_bld);
        bld_base = &si_shader_ctx.radeon_bld.soa.bld_base;
 
+       if (sel->type != PIPE_SHADER_COMPUTE)
+               shader->dx10_clamp_mode = true;
+
        if (sel->info.uses_kill)
                shader->db_shader_control |= S_02880C_KILL_ENABLE(1);
 
index 21692f0ee33b59d57efd5fbe0c27cb5b506a58be..08e344af444effae1d968d83d9d9757f04df35fb 100644 (file)
@@ -160,6 +160,7 @@ struct si_shader {
        bool                    uses_instanceid;
        unsigned                nr_pos_exports;
        bool                    is_gs_copy_shader;
+       bool                    dx10_clamp_mode; /* convert NaNs to 0 */
 };
 
 static inline struct tgsi_shader_info *si_get_vs_info(struct si_context *sctx)
index e51d50eb9a7b167318b4bf0931085a255239628d..817a990ee2ee9bb248facc28b001ba89e7b4b100 100644 (file)
@@ -65,7 +65,7 @@ static void si_shader_es(struct si_shader *shader)
                       S_00B328_VGPRS((shader->num_vgprs - 1) / 4) |
                       S_00B328_SGPRS((num_sgprs - 1) / 8) |
                       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
-                      S_00B328_DX10_CLAMP(1));
+                      S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
                       S_00B32C_USER_SGPR(num_user_sgprs));
 }
@@ -134,7 +134,7 @@ static void si_shader_gs(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS,
                       S_00B228_VGPRS((shader->num_vgprs - 1) / 4) |
                       S_00B228_SGPRS((num_sgprs - 1) / 8) |
-                      S_00B228_DX10_CLAMP(1));
+                      S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
                       S_00B22C_USER_SGPR(num_user_sgprs));
 }
@@ -209,7 +209,7 @@ static void si_shader_vs(struct si_shader *shader)
                       S_00B128_VGPRS((shader->num_vgprs - 1) / 4) |
                       S_00B128_SGPRS((num_sgprs - 1) / 8) |
                       S_00B128_VGPR_COMP_CNT(vgpr_comp_cnt) |
-                      S_00B128_DX10_CLAMP(1));
+                      S_00B128_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B12C_SPI_SHADER_PGM_RSRC2_VS,
                       S_00B12C_USER_SGPR(num_user_sgprs) |
                       S_00B12C_SO_BASE0_EN(!!shader->selector->so.stride[0]) |
@@ -304,7 +304,7 @@ static void si_shader_ps(struct si_shader *shader)
        si_pm4_set_reg(pm4, R_00B028_SPI_SHADER_PGM_RSRC1_PS,
                       S_00B028_VGPRS((shader->num_vgprs - 1) / 4) |
                       S_00B028_SGPRS((num_sgprs - 1) / 8) |
-                      S_00B028_DX10_CLAMP(1));
+                      S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
                       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
                       S_00B02C_USER_SGPR(num_user_sgprs));