radv: Fix float16 interpolation set up.
authorBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Fri, 22 Feb 2019 13:16:08 +0000 (14:16 +0100)
committerBas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Fri, 22 Feb 2019 16:06:55 +0000 (17:06 +0100)
float16 types can have non-flat interpolation so set up the HW
correctly for that.

Fixes: 62024fa7750 "radv: enable VK_KHR_16bit_storage extension / 16bit storage features"
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
src/amd/common/ac_llvm_build.c
src/amd/common/ac_llvm_build.h
src/amd/vulkan/radv_device.c
src/amd/vulkan/radv_nir_to_llvm.c
src/amd/vulkan/radv_pipeline.c
src/amd/vulkan/radv_shader.h

index c6bc507358ec38c2aa798c7ea5784073ed8baac4..bc64f0bb7e3e9fe1e32a61f2c6561e7f27f9d4f5 100644 (file)
@@ -919,6 +919,37 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
                                  ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
 }
 
+LLVMValueRef
+ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
+                      LLVMValueRef llvm_chan,
+                      LLVMValueRef attr_number,
+                      LLVMValueRef params,
+                      LLVMValueRef i,
+                      LLVMValueRef j)
+{
+       LLVMValueRef args[6];
+       LLVMValueRef p1;
+
+       args[0] = i;
+       args[1] = llvm_chan;
+       args[2] = attr_number;
+       args[3] = ctx->i1false;
+       args[4] = params;
+
+       p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16",
+                               ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
+
+       args[0] = p1;
+       args[1] = j;
+       args[2] = llvm_chan;
+       args[3] = attr_number;
+       args[4] = ctx->i1false;
+       args[5] = params;
+
+       return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16",
+                                 ctx->f16, args, 6, AC_FUNC_ATTR_READNONE);
+}
+
 LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
                       LLVMValueRef parameter,
index 00d59a9e36965972fbcd4e789248894dd7a342b3..fd5c4295abf487e2dcbd911be0ac59cdf89b0c41 100644 (file)
@@ -213,6 +213,14 @@ ac_build_fs_interp(struct ac_llvm_context *ctx,
                   LLVMValueRef i,
                   LLVMValueRef j);
 
+LLVMValueRef
+ac_build_fs_interp_f16(struct ac_llvm_context *ctx,
+                      LLVMValueRef llvm_chan,
+                      LLVMValueRef attr_number,
+                      LLVMValueRef params,
+                      LLVMValueRef i,
+                      LLVMValueRef j);
+
 LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
                       LLVMValueRef parameter,
index 2aa8b3611e36bf94deab7e10cb18b6bb0a400c9c..fc04de21025136326ff980d7e572195ecaf32e3a 100644 (file)
@@ -806,7 +806,7 @@ void radv_GetPhysicalDeviceFeatures2(
                        features->storageBuffer16BitAccess = enabled;
                        features->uniformAndStorageBuffer16BitAccess = enabled;
                        features->storagePushConstant16 = enabled;
-                       features->storageInputOutput16 = enabled;
+                       features->storageInputOutput16 = enabled && HAVE_LLVM >= 0x900;
                        break;
                }
                case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: {
index 1624a86923bbb503a0600f9b147940228157134e..dae09f7ddce177f380b68c62db50c095bce7a216 100644 (file)
@@ -92,6 +92,7 @@ struct radv_shader_context {
        gl_shader_stage stage;
 
        LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
+       uint64_t float16_shaded_mask;
 
        uint64_t input_mask;
        uint64_t output_mask;
@@ -2197,6 +2198,7 @@ static void interp_fs_input(struct radv_shader_context *ctx,
                            unsigned attr,
                            LLVMValueRef interp_param,
                            LLVMValueRef prim_mask,
+                           bool float16,
                            LLVMValueRef result[4])
 {
        LLVMValueRef attr_number;
@@ -2229,7 +2231,12 @@ static void interp_fs_input(struct radv_shader_context *ctx,
        for (chan = 0; chan < 4; chan++) {
                LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
 
-               if (interp) {
+               if (interp && float16) {
+                       result[chan] = ac_build_fs_interp_f16(&ctx->ac,
+                                                             llvm_chan,
+                                                             attr_number,
+                                                             prim_mask, i, j);
+               } else if (interp) {
                        result[chan] = ac_build_fs_interp(&ctx->ac,
                                                          llvm_chan,
                                                          attr_number,
@@ -2241,7 +2248,30 @@ static void interp_fs_input(struct radv_shader_context *ctx,
                                                              attr_number,
                                                              prim_mask);
                        result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, "");
-                       result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), "");
+                       result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], float16 ? ctx->ac.i16 : ctx->ac.i32, "");
+               }
+       }
+}
+
+static void mark_16bit_fs_input(struct radv_shader_context *ctx,
+                                const struct glsl_type *type,
+                                int location)
+{
+       if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) {
+               unsigned attrib_count = glsl_count_attribute_slots(type, false);
+               if (glsl_type_is_16bit(type)) {
+                       ctx->float16_shaded_mask |= ((1ull << attrib_count) - 1) << location;
+               }
+       } else if (glsl_type_is_array(type)) {
+               unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false);
+               for (unsigned i = 0; i < glsl_get_length(type); ++i) {
+                       mark_16bit_fs_input(ctx, glsl_get_array_element(type), location + i * stride);
+               }
+       } else {
+               assert(glsl_type_is_struct(type));
+               for (unsigned i = 0; i < glsl_get_length(type); i++) {
+                       mark_16bit_fs_input(ctx, glsl_get_struct_field(type, i), location);
+                       location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false);
                }
        }
 }
@@ -2262,7 +2292,8 @@ handle_fs_input_decl(struct radv_shader_context *ctx,
                unsigned component_count = variable->data.location_frac +
                                           glsl_get_length(variable->type);
                attrib_count = (component_count + 3) / 4;
-       }
+       } else
+               mark_16bit_fs_input(ctx, variable->type, idx);
 
        mask = ((1ull << attrib_count) - 1) << variable->data.location;
 
@@ -2277,10 +2308,8 @@ handle_fs_input_decl(struct radv_shader_context *ctx,
 
                interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type);
        }
-       bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type));
-       LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32;
        if (interp == NULL)
-               interp = LLVMGetUndef(type);
+               interp = LLVMGetUndef(ctx->ac.i32);
 
        for (unsigned i = 0; i < attrib_count; ++i)
                ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp;
@@ -2346,11 +2375,14 @@ handle_fs_inputs(struct radv_shader_context *ctx,
                if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
                    i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
                        interp_param = *inputs;
-                       interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
+                       bool float16 = (ctx->float16_shaded_mask >> i) & 1;
+                       interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, float16,
                                        inputs);
 
                        if (LLVMIsUndef(interp_param))
                                ctx->shader_info->fs.flat_shaded_mask |= 1u << index;
+                       if (float16)
+                               ctx->shader_info->fs.float16_shaded_mask |= 1u << index;
                        if (i >= VARYING_SLOT_VAR0)
                                ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index;
                        ++index;
@@ -2362,7 +2394,7 @@ handle_fs_inputs(struct radv_shader_context *ctx,
 
                                interp_param = *inputs;
                                interp_fs_input(ctx, index, interp_param,
-                                               ctx->abi.prim_mask, inputs);
+                                               ctx->abi.prim_mask, false, inputs);
                                ++index;
                        }
                } else if (i == VARYING_SLOT_POS) {
index f0a5fef3923fc366b0e86b25757e599cbd02548f..09363af37e745bb15c6f8e4dc40605a080068d0a 100644 (file)
@@ -3101,13 +3101,17 @@ radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs,
        radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
 }
 
-static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade)
+static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade, bool float16)
 {
        uint32_t ps_input_cntl;
        if (offset <= AC_EXP_PARAM_OFFSET_31) {
                ps_input_cntl = S_028644_OFFSET(offset);
                if (flat_shade)
                        ps_input_cntl |= S_028644_FLAT_SHADE(1);
+               if (float16) {
+                       ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) |
+                                        S_028644_ATTR0_VALID(1);
+               }
        } else {
                /* The input is a DEFAULT_VAL constant. */
                assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 &&
@@ -3132,7 +3136,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
        if (ps->info.info.ps.prim_id_input) {
                unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID];
                if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
-                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true);
+                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false);
                        ++ps_offset;
                }
        }
@@ -3142,9 +3146,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
            ps->info.info.needs_multiview_view_index) {
                unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER];
                if (vs_offset != AC_EXP_PARAM_UNDEFINED)
-                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true);
+                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false);
                else
-                       ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true);
+                       ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false);
                ++ps_offset;
        }
 
@@ -3160,14 +3164,14 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
 
                vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0];
                if (vs_offset != AC_EXP_PARAM_UNDEFINED) {
-                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false);
+                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false);
                        ++ps_offset;
                }
 
                vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1];
                if (vs_offset != AC_EXP_PARAM_UNDEFINED &&
                    ps->info.info.ps.num_input_clips_culls > 4) {
-                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false);
+                       ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false);
                        ++ps_offset;
                }
        }
@@ -3175,6 +3179,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
        for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) {
                unsigned vs_offset;
                bool flat_shade;
+               bool float16;
                if (!(ps->info.fs.input_mask & (1u << i)))
                        continue;
 
@@ -3186,8 +3191,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
                }
 
                flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset));
+               float16 = !!(ps->info.fs.float16_shaded_mask & (1u << ps_offset));
 
-               ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade);
+               ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, float16);
                ++ps_offset;
        }
 
index 8a66f5825c7089c9c816692309669bc2c3671d09..d9fc64aeb9a819dc7041552a9594139ad7410528 100644 (file)
@@ -266,6 +266,7 @@ struct radv_shader_variant_info {
                        unsigned num_interp;
                        uint32_t input_mask;
                        uint32_t flat_shaded_mask;
+                       uint32_t float16_shaded_mask;
                        bool can_discard;
                        bool early_fragment_test;
                } fs;