radeonsi: implement R9G9B9E5 render target and image store support on gfx10.3

[mesa.git] / src / gallium / drivers / radeonsi / si_state.c
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c

index 2ce68c781d0bde5fe68181e9ae9d3f232f2e5541..2f156a0885f493210314fa3264a56c2834381615 100644 (file)
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -33,15 +33,6 @@
  #include "util/u_resource.h"
  #include "util/u_upload_mgr.h"
  
-struct gfx10_format {
-   unsigned img_format : 9;
-
-   /* Various formats are only supported with workarounds for vertex fetch,
-    * and some 32_32_32 formats are supported natively, but only for buffers
-    * (possibly with some image support, actually, but no filtering). */
-   bool buffers_only : 1;
-};
-
  #include "gfx10_format_table.h"
  
  static unsigned si_map_swizzle(unsigned swizzle)
@@ -246,6 +237,11 @@ static void si_emit_cb_render_state(struct si_context *sctx)
                 sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
              }
              break;
+
+         case V_028C70_COLOR_5_9_9_9:
+            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
+               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_9_9_9_E5 << (i * 4);
+            break;
           }
        }
  
@@ -766,8 +762,9 @@ static void si_emit_clip_regs(struct si_context *sctx)
  
     unsigned initial_cdw = sctx->gfx_cs->current.cdw;
     unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
-                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) | clipdist_mask |
-                         (culldist_mask << 8);
+                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
+                         S_02881C_BYPASS_PRIM_RATE_COMBINER_GFX103(sctx->chip_class >= GFX10_3) |
+                         clipdist_mask | (culldist_mask << 8);
  
     if (sctx->chip_class >= GFX10) {
        radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
@@ -1393,8 +1390,9 @@ static void si_emit_db_render_state(struct si_context *sctx)
     radeon_opt_set_context_reg(
        sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2,
        S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
-         S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
-         S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
+      S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
+      S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4) |
+      S_028010_CENTROID_COMPUTATION_MODE_GFX103(sctx->chip_class >= GFX10_3 ? 2 : 0));
  
     db_shader_control = sctx->ps_db_shader_control;
  
@@ -1421,7 +1419,8 @@ static void si_emit_db_render_state(struct si_context *sctx)
  /*
   * format translation
   */
-static uint32_t si_translate_colorformat(enum pipe_format format)
+static uint32_t si_translate_colorformat(enum chip_class chip_class,
+                                         enum pipe_format format)
  {
     const struct util_format_description *desc = util_format_description(format);
     if (!desc)
@@ -1434,6 +1433,10 @@ static uint32_t si_translate_colorformat(enum pipe_format format)
     if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
        return V_028C70_COLOR_10_11_11;
  
+   if (chip_class >= GFX10_3 &&
+       format == PIPE_FORMAT_R9G9B9E5_FLOAT) /* isn't plain */
+      return V_028C70_COLOR_5_9_9_9;
+
     if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
        return V_028C70_COLOR_INVALID;
  
@@ -2099,9 +2102,10 @@ static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum p
     return usage;
  }
  
-static bool si_is_colorbuffer_format_supported(enum pipe_format format)
+static bool si_is_colorbuffer_format_supported(enum chip_class chip_class,
+                                               enum pipe_format format)
  {
-   return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
+   return si_translate_colorformat(chip_class, format) != V_028C70_COLOR_INVALID &&
            si_translate_colorswap(format, false) != ~0U;
  }
  
@@ -2167,7 +2171,7 @@ static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format
  
     if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
                   PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) &&
-       si_is_colorbuffer_format_supported(format)) {
+       si_is_colorbuffer_format_supported(sscreen->info.chip_class, format)) {
        retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
                           PIPE_BIND_SHARED);
        if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format))
@@ -2214,6 +2218,7 @@ static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format
     case V_028C70_COLOR_4_4_4_4:
     case V_028C70_COLOR_10_11_11:
     case V_028C70_COLOR_11_11_10:
+   case V_028C70_COLOR_5_9_9_9:
     case V_028C70_COLOR_8:
     case V_028C70_COLOR_8_8:
     case V_028C70_COLOR_8_8_8_8:
@@ -2346,7 +2351,7 @@ static void si_initialize_color_surface(struct si_context *sctx, struct si_surfa
        }
     }
  
-   format = si_translate_colorformat(surf->base.format);
+   format = si_translate_colorformat(sctx->chip_class, surf->base.format);
     if (format == V_028C70_COLOR_INVALID) {
        PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
     }
@@ -3544,7 +3549,8 @@ static void si_emit_msaa_config(struct si_context *sctx)
        sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
        sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
                       S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
-                     S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
+                     S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples) |
+                     S_028BE0_COVERED_CENTROID_IS_CENTER_GFX103(sctx->chip_class >= GFX10_3);
  
        if (sctx->framebuffer.nr_samples > 1) {
           db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
@@ -3797,12 +3803,26 @@ static void gfx10_make_texture_descriptor(
        S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) |
        S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) |
        S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type);
-   /* Depth is the the last accessible layer on gfx9+. The hw doesn't need
-    * to know the total number of layers.
-    */
-   state[4] =
-      S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) |
-      S_00A010_BASE_ARRAY(first_layer);
+
+   if (res->target == PIPE_TEXTURE_1D ||
+       res->target == PIPE_TEXTURE_2D) {
+      /* 1D, 2D, and 2D_MSAA can set a custom pitch for shader resources
+       * starting with gfx10.3 (ignored if pitch <= width). Other texture
+       * targets can't. CB and DB can't set a custom pitch for any target.
+       */
+      if (screen->info.chip_class >= GFX10_3)
+         state[4] = S_00A010_DEPTH(tex->surface.u.gfx9.surf_pitch - 1);
+      else
+         state[4] = 0;
+   } else {
+      /* Depth is the last accessible layer on gfx9+. The hw doesn't need
+       * to know the total number of layers.
+       */
+      state[4] = S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ?
+                                   depth - 1 : last_layer) |
+                 S_00A010_BASE_ARRAY(first_layer);
+   }
+
     state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
                S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples)
                                                     : tex->buffer.b.b.last_level) |
@@ -4479,7 +4499,8 @@ static void *si_create_sampler_state(struct pipe_context *ctx,
     unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy;
     unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
     bool trunc_coord = state->min_img_filter == PIPE_TEX_FILTER_NEAREST &&
-                      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
+                      state->mag_img_filter == PIPE_TEX_FILTER_NEAREST &&
+                      state->compare_mode == PIPE_TEX_COMPARE_NONE;
     union pipe_color_union clamped_border_color;
  
     if (!rstate) {
@@ -4753,8 +4774,9 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
         * into account would complicate the fast path (where everything
         * is nicely aligned).
         */
-      bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 ||
-                                                       sscreen->info.chip_class == GFX10);
+      bool check_alignment =
+            log_hw_load_size >= 1 &&
+            (sscreen->info.chip_class == GFX6 || sscreen->info.chip_class >= GFX10);
        bool opencode = sscreen->options.vs_fetch_always_opencode;
  
        if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
@@ -5270,7 +5292,7 @@ static void si_init_config(struct si_context *sctx)
  
        /* Compute LATE_ALLOC_VS.LIMIT. */
        unsigned num_cu_per_sh = sscreen->info.min_good_cu_per_sa;
-      unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
+      unsigned late_alloc_wave64 = 0; /* The limit is per SA. */
        unsigned cu_mask_vs = 0xffff;
        unsigned cu_mask_gs = 0xffff;
  
@@ -5294,7 +5316,7 @@ static void si_init_config(struct si_context *sctx)
           if (!sscreen->info.use_late_alloc) {
              late_alloc_wave64 = 0;
           } else if (num_cu_per_sh <= 4) {
-            /* Too few available compute units per SH. Disallowing
+            /* Too few available compute units per SA. Disallowing
               * VS to run on one CU could hurt us more than late VS
               * allocation would help.
               *
@@ -5336,6 +5358,7 @@ static void si_init_config(struct si_context *sctx)
         * a single primitive shader subgroup.
         */
        si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
+      /* Reuse for legacy (non-NGG) only. */
        si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
  
        if (!has_clear_state) {
@@ -5377,6 +5400,9 @@ static void si_init_config(struct si_context *sctx)
                       S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
        si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
     }
+   if (sctx->chip_class >= GFX10_3) {
+      si_pm4_set_reg(pm4, R_028750_SX_PS_DOWNCONVERT_CONTROL_GFX103, 0xff);
+   }
  
     if (sctx->chip_class >= GFX9) {
        si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,