radeonsi: take a reference on glsl types for compile threads
[mesa.git] / src / gallium / drivers / radeonsi / si_pipe.c
index e25c65abda887ef2d58cc5aed66fc47538955945..475c15c54ba56e88fcd3319e1c0e6cbea9d49662 100644 (file)
@@ -31,7 +31,6 @@
 
 #include "ac_llvm_util.h"
 #include "radeon/radeon_uvd.h"
-#include "gallivm/lp_bld_misc.h"
 #include "util/disk_cache.h"
 #include "util/u_log.h"
 #include "util/u_memory.h"
@@ -60,9 +59,14 @@ static const struct debug_named_value debug_options[] = {
        { "preoptir", DBG(PREOPT_IR), "Print the LLVM IR before initial optimizations" },
 
        /* Shader compiler options the shader cache should be aware of: */
-       { "unsafemath", DBG(UNSAFE_MATH), "Enable unsafe math shader optimizations" },
        { "sisched", DBG(SI_SCHED), "Enable LLVM SI Machine Instruction Scheduler." },
        { "gisel", DBG(GISEL), "Enable LLVM global instruction selector." },
+       { "w32ge", DBG(W32_GE), "Use Wave32 for vertex, tessellation, and geometry shaders." },
+       { "w32ps", DBG(W32_PS), "Use Wave32 for pixel shaders." },
+       { "w32cs", DBG(W32_CS), "Use Wave32 for computes shaders." },
+       { "w64ge", DBG(W64_GE), "Use Wave64 for vertex, tessellation, and geometry shaders." },
+       { "w64ps", DBG(W64_PS), "Use Wave64 for pixel shaders." },
+       { "w64cs", DBG(W64_CS), "Use Wave64 for computes shaders." },
 
        /* Shader compiler options (with no effect on the shader cache): */
        { "checkir", DBG(CHECK_IR), "Enable additional sanity checks on shader IR" },
@@ -84,6 +88,7 @@ static const struct debug_named_value debug_options[] = {
        { "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." },
 
        /* 3D engine options: */
+       { "nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used." },
        { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." },
        { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." },
        { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." },
@@ -137,14 +142,14 @@ static void si_init_compiler(struct si_screen *sscreen,
        ac_init_llvm_compiler(compiler, sscreen->info.family, tm_options);
        compiler->passes = ac_create_llvm_passes(compiler->tm);
 
+       if (compiler->tm_wave32)
+               compiler->passes_wave32 = ac_create_llvm_passes(compiler->tm_wave32);
        if (compiler->low_opt_tm)
                compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm);
 }
 
 static void si_destroy_compiler(struct ac_llvm_compiler *compiler)
 {
-       ac_destroy_llvm_passes(compiler->passes);
-       ac_destroy_llvm_passes(compiler->low_opt_passes);
        ac_destroy_llvm_compiler(compiler);
 }
 
@@ -386,8 +391,14 @@ static void si_set_context_param(struct pipe_context *ctx,
 static struct pipe_context *si_create_context(struct pipe_screen *screen,
                                               unsigned flags)
 {
-       struct si_context *sctx = CALLOC_STRUCT(si_context);
        struct si_screen* sscreen = (struct si_screen *)screen;
+
+       /* Don't create a context if it's not compute-only and hw is compute-only. */
+       if (!sscreen->info.has_graphics &&
+           !(flags & PIPE_CONTEXT_COMPUTE_ONLY))
+               return NULL;
+
+       struct si_context *sctx = CALLOC_STRUCT(si_context);
        struct radeon_winsys *ws = sscreen->ws;
        int shader, i;
        bool stop_exec_on_failure = (flags & PIPE_CONTEXT_LOSE_CONTEXT_ON_RESET) != 0;
@@ -459,9 +470,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
                                                 0, PIPE_USAGE_DEFAULT,
                                                 SI_RESOURCE_FLAG_32BIT |
                                                 (use_sdma_upload ?
-                                                         SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
-                                                         (sscreen->cpdma_prefetch_writes_memory ?
-                                                                  0 : SI_RESOURCE_FLAG_READ_ONLY)));
+                                                         SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
        if (!sctx->b.const_uploader)
                goto fail;
 
@@ -491,8 +500,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
        if (!sctx->border_color_map)
                goto fail;
 
-       if (sctx->chip_class >= GFX10)
-               sctx->ngg = !sscreen->options.disable_ngg;
+       sctx->ngg = sscreen->use_ngg;
 
        /* Initialize context functions used by graphics and compute. */
        if (sctx->chip_class >= GFX10)
@@ -517,10 +525,10 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
        si_init_fence_functions(sctx);
        si_init_query_functions(sctx);
        si_init_state_compute_functions(sctx);
+       si_init_context_texture_functions(sctx);
 
        /* Initialize graphics-only context functions. */
        if (sctx->has_graphics) {
-               si_init_context_texture_functions(sctx);
                if (sctx->chip_class >= GFX10)
                        gfx10_init_query(sctx);
                si_init_msaa_functions(sctx);
@@ -534,6 +542,17 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
                        goto fail;
                sctx->blitter->skip_viewport_restore = true;
 
+               /* Some states are expected to be always non-NULL. */
+               sctx->noop_blend = util_blitter_get_noop_blend_state(sctx->blitter);
+               sctx->queued.named.blend = sctx->noop_blend;
+
+               sctx->noop_dsa = util_blitter_get_noop_dsa_state(sctx->blitter);
+               sctx->queued.named.dsa = sctx->noop_dsa;
+
+               sctx->discard_rasterizer_state =
+                       util_blitter_get_discard_rasterizer_state(sctx->blitter);
+               sctx->queued.named.rasterizer = sctx->discard_rasterizer_state;
+
                si_init_draw_functions(sctx);
                si_initialize_prim_discard_tunables(sctx);
        }
@@ -724,6 +743,9 @@ static void si_destroy_screen(struct pipe_screen* pscreen)
        util_queue_destroy(&sscreen->shader_compiler_queue);
        util_queue_destroy(&sscreen->shader_compiler_queue_low_priority);
 
+       /* Release the reference on glsl types of the compiler threads. */
+       glsl_type_singleton_decref();
+
        for (i = 0; i < ARRAY_SIZE(sscreen->compiler); i++)
                si_destroy_compiler(&sscreen->compiler[i]);
 
@@ -852,9 +874,18 @@ static void si_disk_cache_create(struct si_screen *sscreen)
        #define ALL_FLAGS (DBG(FS_CORRECT_DERIVS_AFTER_KILL) |  \
                           DBG(SI_SCHED) |                      \
                           DBG(GISEL) |                         \
-                          DBG(UNSAFE_MATH))
-       uint64_t shader_debug_flags = sscreen->debug_flags &
-               ALL_FLAGS;
+                          DBG(W32_GE) |                        \
+                          DBG(W32_PS) |                        \
+                          DBG(W32_CS) |                        \
+                          DBG(W64_GE) |                        \
+                          DBG(W64_PS) |                        \
+                          DBG(W64_CS))
+       uint64_t shader_debug_flags = sscreen->debug_flags & ALL_FLAGS;
+
+       if (sscreen->options.enable_nir) {
+               STATIC_ASSERT((ALL_FLAGS & (1u << 31)) == 0);
+               shader_debug_flags |= 1u << 31;
+       }
 
        /* Add the high bits of 32-bit addresses, which affects
         * how 32-bit addresses are expanded to 64 bits.
@@ -863,9 +894,6 @@ static void si_disk_cache_create(struct si_screen *sscreen)
        assert((int16_t)sscreen->info.address32_hi == (int32_t)sscreen->info.address32_hi);
        shader_debug_flags |= (uint64_t)(sscreen->info.address32_hi & 0xffff) << 32;
 
-       if (sscreen->options.enable_nir)
-               shader_debug_flags |= 1ull << 48;
-
        sscreen->disk_shader_cache =
                disk_cache_create(sscreen->info.name,
                                  cache_id,
@@ -886,13 +914,8 @@ static void si_set_max_shader_compiler_threads(struct pipe_screen *screen,
 
 static bool si_is_parallel_shader_compilation_finished(struct pipe_screen *screen,
                                                       void *shader,
-                                                      unsigned shader_type)
+                                                      enum pipe_shader_type shader_type)
 {
-       if (shader_type == PIPE_SHADER_COMPUTE) {
-               struct si_compute *cs = (struct si_compute*)shader;
-
-               return util_queue_fence_is_signalled(&cs->ready);
-       }
        struct si_shader_selector *sel = (struct si_shader_selector *)shader;
 
        return util_queue_fence_is_signalled(&sel->ready);
@@ -932,6 +955,9 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
        sscreen->debug_flags |= debug_get_flags_option("AMD_DEBUG",
                                                       debug_options, 0);
 
+       if (sscreen->debug_flags & DBG(NO_GFX))
+               sscreen->info.has_graphics = false;
+
        /* Set functions first. */
        sscreen->b.context_create = si_pipe_create_context;
        sscreen->b.destroy = si_destroy_screen;
@@ -1006,12 +1032,16 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
        num_comp_lo_threads = MIN2(num_comp_lo_threads,
                                   ARRAY_SIZE(sscreen->compiler_lowp));
 
+       /* Take a reference on the glsl types for the compiler threads. */
+       glsl_type_singleton_init_or_ref();
+
        if (!util_queue_init(&sscreen->shader_compiler_queue, "sh",
                             64, num_comp_hi_threads,
                             UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                             UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY)) {
                si_destroy_shader_cache(sscreen);
                FREE(sscreen);
+               glsl_type_singleton_decref();
                return NULL;
        }
 
@@ -1023,6 +1053,7 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
                             UTIL_QUEUE_INIT_USE_MINIMUM_PRIORITY)) {
               si_destroy_shader_cache(sscreen);
               FREE(sscreen);
+              glsl_type_singleton_decref();
               return NULL;
        }
 
@@ -1063,7 +1094,6 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
        }
 
        sscreen->tess_factor_ring_size = 32768 * sscreen->info.max_se;
-       assert(((sscreen->tess_factor_ring_size / 4) & C_030938_SIZE) == 0);
        sscreen->tess_offchip_ring_size = max_offchip_buffers *
                                          sscreen->tess_offchip_block_dw_size * 4;
 
@@ -1080,10 +1110,10 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
        }
 
        /* The mere presense of CLEAR_STATE in the IB causes random GPU hangs
-        * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc.
-        * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel.*/
-       sscreen->has_clear_state = sscreen->info.chip_class >= GFX7 &&
-                                  sscreen->info.is_amdgpu;
+        * on GFX6. Some CLEAR_STATE cause asic hang on radeon kernel, etc.
+        * SPI_VS_OUT_CONFIG. So only enable GFX7 CLEAR_STATE on amdgpu kernel. */
+       sscreen->has_clear_state = sscreen->info.chip_class >= GFX7 &&
+                                  sscreen->info.is_amdgpu;
 
        sscreen->has_distributed_tess =
                sscreen->info.chip_class >= GFX8 &&
@@ -1125,14 +1155,19 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
        sscreen->has_ls_vgpr_init_bug = sscreen->info.family == CHIP_VEGA10 ||
                                        sscreen->info.family == CHIP_RAVEN;
        sscreen->has_dcc_constant_encode = sscreen->info.family == CHIP_RAVEN2 ||
+                                          sscreen->info.family == CHIP_RENOIR ||
                                           sscreen->info.chip_class >= GFX10;
+       sscreen->use_ngg = sscreen->info.chip_class >= GFX10;
+       sscreen->use_ngg_streamout = sscreen->info.chip_class >= GFX10;
 
        /* Only enable primitive binning on APUs by default. */
-       sscreen->dpbb_allowed = sscreen->info.family == CHIP_RAVEN ||
-                               sscreen->info.family == CHIP_RAVEN2;
-
-       sscreen->dfsm_allowed = sscreen->info.family == CHIP_RAVEN ||
-                               sscreen->info.family == CHIP_RAVEN2;
+       if (sscreen->info.chip_class >= GFX10) {
+               sscreen->dpbb_allowed = true;
+               sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+       } else if (sscreen->info.chip_class == GFX9) {
+               sscreen->dpbb_allowed = !sscreen->info.has_dedicated_vram;
+               sscreen->dfsm_allowed = !sscreen->info.has_dedicated_vram;
+       }
 
        /* Process DPBB enable flags. */
        if (sscreen->debug_flags & DBG(DPBB)) {
@@ -1166,7 +1201,8 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
                        (sscreen->info.family == CHIP_STONEY ||
                         sscreen->info.family == CHIP_VEGA12 ||
                         sscreen->info.family == CHIP_RAVEN ||
-                        sscreen->info.family == CHIP_RAVEN2);
+                        sscreen->info.family == CHIP_RAVEN2 ||
+                        sscreen->info.family == CHIP_RENOIR);
        }
 
        sscreen->dcc_msaa_allowed =
@@ -1218,9 +1254,35 @@ radeonsi_screen_create_impl(struct radeon_winsys *ws,
        for (i = 0; i < num_comp_lo_threads; i++)
                si_init_compiler(sscreen, &sscreen->compiler_lowp[i]);
 
+       sscreen->ge_wave_size = 64;
+       sscreen->ps_wave_size = 64;
+       sscreen->compute_wave_size = 64;
+
+       if (sscreen->info.chip_class >= GFX10) {
+               /* Pixel shaders: Wave64 is recommended.
+                * Compute shaders: There are piglit failures with Wave32.
+                */
+               sscreen->ge_wave_size = 32;
+
+               if (sscreen->debug_flags & DBG(W32_GE))
+                       sscreen->ge_wave_size = 32;
+               if (sscreen->debug_flags & DBG(W32_PS))
+                       sscreen->ps_wave_size = 32;
+               if (sscreen->debug_flags & DBG(W32_CS))
+                       sscreen->compute_wave_size = 32;
+
+               if (sscreen->debug_flags & DBG(W64_GE))
+                       sscreen->ge_wave_size = 64;
+               if (sscreen->debug_flags & DBG(W64_PS))
+                       sscreen->ps_wave_size = 64;
+               if (sscreen->debug_flags & DBG(W64_CS))
+                       sscreen->compute_wave_size = 64;
+       }
+
        /* Create the auxiliary context. This must be done last. */
-       sscreen->aux_context = si_create_context(
-               &sscreen->b, sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0);
+       sscreen->aux_context = si_create_context(&sscreen->b,
+               (sscreen->options.aux_debug ? PIPE_CONTEXT_DEBUG : 0) |
+               (sscreen->info.has_graphics ? 0 : PIPE_CONTEXT_COMPUTE_ONLY));
        if (sscreen->options.aux_debug) {
                struct u_log_context *log = CALLOC_STRUCT(u_log_context);
                u_log_context_init(log);