u_upload_mgr: pass alignment to u_upload_alloc manually
[mesa.git] / src / gallium / drivers / radeonsi / si_state_draw.c
index 3b606b2c7dcf60dc9f98f78e2e1cd566aa630839..d5540bec71d1b07158f2291573e6f6d814995650 100644 (file)
@@ -108,7 +108,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
                                       const struct pipe_draw_info *info,
                                       unsigned *num_patches)
 {
-       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        struct si_shader_ctx_state *ls = &sctx->vs_shader;
        /* The TES pointer will only be used for sctx->last_tcs.
         * It would be wrong to think that TCS = TES. */
@@ -163,7 +163,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
        perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
        lds_size = output_patch0_offset + output_patch_size * *num_patches;
-       ls_rsrc2 = ls->current->ls_rsrc2;
+       ls_rsrc2 = ls->current->rsrc2;
 
        if (sctx->b.chip_class >= CIK) {
                assert(lds_size <= 65536);
@@ -178,7 +178,7 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
        if (sctx->b.chip_class == CIK && sctx->b.family != CHIP_HAWAII)
                radeon_set_sh_reg(cs, R_00B52C_SPI_SHADER_PGM_RSRC2_LS, ls_rsrc2);
        radeon_set_sh_reg_seq(cs, R_00B528_SPI_SHADER_PGM_RSRC1_LS, 2);
-       radeon_emit(cs, ls->current->ls_rsrc1);
+       radeon_emit(cs, ls->current->rsrc1);
        radeon_emit(cs, ls_rsrc2);
 
        /* Compute userdata SGPRs. */
@@ -216,6 +216,18 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
        radeon_emit(cs, tcs_out_layout | (num_tcs_output_cp << 26));
 }
 
+static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
+{ /* Returns the primitive count derived from info->mode and info->count. */
+       switch (info->mode) {
+       case PIPE_PRIM_PATCHES: /* patches: count divided by the per-patch vertex count */
+               return info->count / info->vertices_per_patch; /* NOTE(review): no guard against vertices_per_patch == 0 here - confirm callers guarantee it */
+       case R600_PRIM_RECTANGLE_LIST: /* rectangle lists: count divided by 3 */
+               return info->count / 3;
+       default: /* every other mode is delegated to u_prims_for_vertices() */
+               return u_prims_for_vertices(info->mode, info->count);
+       }
+}
+
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
                                          const struct pipe_draw_info *info,
                                          unsigned num_patches)
@@ -247,13 +259,10 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
                /* primgroup_size must be set to a multiple of NUM_PATCHES */
                primgroup_size = (primgroup_size / num_patches) * num_patches;
 
-               /* SWITCH_ON_EOI must be set if PrimID is used.
-                * If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+               /* SWITCH_ON_EOI must be set if PrimID is used. */
                if ((sctx->tcs_shader.cso && sctx->tcs_shader.cso->info.uses_primid) ||
-                   sctx->tes_shader.cso->info.uses_primid) {
+                   sctx->tes_shader.cso->info.uses_primid)
                        ia_switch_on_eoi = true;
-                       partial_es_wave = true;
-               }
 
                /* Bug with tessellation and GS on Bonaire and older 2 SE chips. */
                if ((sctx->b.family == CHIP_TAHITI ||
@@ -279,7 +288,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
                    prim == PIPE_PRIM_LINE_LOOP ||
                    prim == PIPE_PRIM_TRIANGLE_FAN ||
                    prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY ||
-                   info->primitive_restart)
+                   info->primitive_restart ||
+                   info->count_from_stream_output)
                        wd_switch_on_eop = true;
 
                /* Hawaii hangs if instancing is enabled and WD_SWITCH_ON_EOP is 0.
@@ -289,10 +299,6 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
                    (info->indirect || info->instance_count > 1))
                        wd_switch_on_eop = true;
 
-               /* USE_OPAQUE doesn't work when WD_SWITCH_ON_EOP is 0. */
-               if (info->count_from_stream_output)
-                       wd_switch_on_eop = true;
-
                /* Required on CIK and later. */
                if (sctx->b.screen->info.max_se > 2 && !wd_switch_on_eop)
                        ia_switch_on_eoi = true;
@@ -313,12 +319,20 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
                assert(wd_switch_on_eop || !ia_switch_on_eop);
        }
 
+       /* If SWITCH_ON_EOI is set, PARTIAL_ES_WAVE must be set too. */
+       if (ia_switch_on_eoi)
+               partial_es_wave = true;
+
+       /* GS requirement. */
+       if (SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
+               partial_es_wave = true;
+
        /* Hw bug with single-primitive instances and SWITCH_ON_EOI
         * on multi-SE chips. */
        if (sctx->b.screen->info.max_se >= 2 && ia_switch_on_eoi &&
            (info->indirect ||
             (info->instance_count > 1 &&
-             u_prims_for_vertices(info->mode, info->count) <= 1)))
+             si_num_prims_for_vertices(info) <= 1)))
                sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
 
        return S_028AA8_SWITCH_ON_EOP(ia_switch_on_eop) |
@@ -351,7 +365,7 @@ static unsigned si_get_ls_hs_config(struct si_context *sctx,
 
 static void si_emit_scratch_reloc(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
        if (!sctx->emit_scratch_reloc)
                return;
@@ -360,7 +374,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
                               sctx->spi_tmpring_size);
 
        if (sctx->scratch_buffer) {
-               radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      sctx->scratch_buffer, RADEON_USAGE_READWRITE,
                                      RADEON_PRIO_SCRATCH_BUFFER);
 
@@ -371,7 +385,7 @@ static void si_emit_scratch_reloc(struct si_context *sctx)
 /* rast_prim is the primitive type after GS. */
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned rast_prim = sctx->current_rast_prim;
        struct si_state_rasterizer *rs = sctx->emitted.named.rasterizer;
 
@@ -399,7 +413,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 static void si_emit_draw_registers(struct si_context *sctx,
                                   const struct pipe_draw_info *info)
 {
-       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned prim = si_conv_pipe_prim(info->mode);
        unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim);
        unsigned ia_multi_vgt_param, ls_hs_config, num_patches = 0;
@@ -453,8 +467,9 @@ static void si_emit_draw_packets(struct si_context *sctx,
                                 const struct pipe_draw_info *info,
                                 const struct pipe_index_buffer *ib)
 {
-       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
        unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
+       bool render_cond_bit = sctx->b.render_cond && !sctx->b.render_cond_force_off;
 
        if (info->count_from_stream_output) {
                struct r600_so_target *t =
@@ -474,7 +489,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
                radeon_emit(cs, R_028B2C_VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE >> 2);
                radeon_emit(cs, 0); /* unused */
 
-               radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      t->buf_filled_size, RADEON_USAGE_READ,
                                      RADEON_PRIO_SO_FILLED_SIZE);
        }
@@ -528,7 +543,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
        } else {
                si_invalidate_draw_sh_constants(sctx);
 
-               radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      (struct r600_resource *)info->indirect,
                                      RADEON_USAGE_READ, RADEON_PRIO_DRAW_INDIRECT);
        }
@@ -538,7 +553,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
                                          ib->index_size;
                uint64_t index_va = r600_resource(ib->buffer)->gpu_address + ib->offset;
 
-               radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx,
+               radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx,
                                      (struct r600_resource *)ib->buffer,
                                      RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER);
 
@@ -561,7 +576,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
                        radeon_emit(cs, PKT3(PKT3_INDEX_BUFFER_SIZE, 0, 0));
                        radeon_emit(cs, index_max_size);
 
-                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, sctx->b.predicate_drawing));
+                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_INDIRECT, 3, render_cond_bit));
                        radeon_emit(cs, info->indirect_offset);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
@@ -569,7 +584,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
                } else {
                        index_va += info->start * ib->index_size;
 
-                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, sctx->b.predicate_drawing));
+                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
                        radeon_emit(cs, index_max_size);
                        radeon_emit(cs, index_va);
                        radeon_emit(cs, (index_va >> 32UL) & 0xFF);
@@ -588,13 +603,13 @@ static void si_emit_draw_packets(struct si_context *sctx,
                        radeon_emit(cs, indirect_va);
                        radeon_emit(cs, indirect_va >> 32);
 
-                       radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, sctx->b.predicate_drawing));
+                       radeon_emit(cs, PKT3(PKT3_DRAW_INDIRECT, 3, render_cond_bit));
                        radeon_emit(cs, info->indirect_offset);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_BASE_VERTEX * 4 - SI_SH_REG_OFFSET) >> 2);
                        radeon_emit(cs, (sh_base_reg + SI_SGPR_START_INSTANCE * 4 - SI_SH_REG_OFFSET) >> 2);
                        radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX);
                } else {
-                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, sctx->b.predicate_drawing));
+                       radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, render_cond_bit));
                        radeon_emit(cs, info->count);
                        radeon_emit(cs, V_0287F0_DI_SRC_SEL_AUTO_INDEX |
                                    S_0287F0_USE_OPAQUE(!!info->count_from_stream_output));
@@ -602,12 +617,10 @@ static void si_emit_draw_packets(struct si_context *sctx,
        }
 }
 
-#define BOTH_ICACHE_KCACHE (SI_CONTEXT_INV_ICACHE | SI_CONTEXT_INV_KCACHE)
-
 void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 {
        struct r600_common_context *sctx = &si_ctx->b;
-       struct radeon_winsys_cs *cs = sctx->rings.gfx.cs;
+       struct radeon_winsys_cs *cs = sctx->gfx.cs;
        uint32_t cp_coher_cntl = 0;
        uint32_t compute =
                PKT3_SHADER_TYPE_S(!!(sctx->flags & SI_CONTEXT_FLAG_COMPUTE));
@@ -622,12 +635,12 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
 
        if (sctx->flags & SI_CONTEXT_INV_ICACHE)
                cp_coher_cntl |= S_0085F0_SH_ICACHE_ACTION_ENA(1);
-       if (sctx->flags & SI_CONTEXT_INV_KCACHE)
+       if (sctx->flags & SI_CONTEXT_INV_SMEM_L1)
                cp_coher_cntl |= S_0085F0_SH_KCACHE_ACTION_ENA(1);
 
-       if (sctx->flags & SI_CONTEXT_INV_TC_L1)
+       if (sctx->flags & SI_CONTEXT_INV_VMEM_L1)
                cp_coher_cntl |= S_0085F0_TCL1_ACTION_ENA(1);
-       if (sctx->flags & SI_CONTEXT_INV_TC_L2) {
+       if (sctx->flags & SI_CONTEXT_INV_GLOBAL_L2) {
                cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1);
 
                /* TODO: this might not be needed. */
@@ -645,6 +658,17 @@ void si_emit_cache_flush(struct si_context *si_ctx, struct r600_atom *atom)
                                 S_0085F0_CB5_DEST_BASE_ENA(1) |
                                 S_0085F0_CB6_DEST_BASE_ENA(1) |
                                 S_0085F0_CB7_DEST_BASE_ENA(1);
+
+               /* Necessary for DCC */
+               if (sctx->chip_class >= VI) {
+                       radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0) | compute);
+                       radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_AND_INV_CB_DATA_TS) |
+                                       EVENT_INDEX(5));
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, 0);
+                       radeon_emit(cs, 0);
+               }
        }
        if (sctx->flags & SI_CONTEXT_FLUSH_AND_INV_DB) {
                cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
@@ -794,7 +818,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
                        si_get_draw_start_count(sctx, info, &start, &count);
                        start_offset = start * ib.index_size;
 
-                       u_upload_alloc(sctx->b.uploader, start_offset, count * 2,
+                       u_upload_alloc(sctx->b.uploader, start_offset, count * 2, 256,
                                       &out_offset, &out_buffer, &ptr);
                        if (!out_buffer) {
                                pipe_resource_reference(&ib.buffer, NULL);
@@ -830,7 +854,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
        /* VI reads index buffers through TC L2. */
        if (info->indexed && sctx->b.chip_class <= CIK &&
            r600_resource(ib.buffer)->TC_L2_dirty) {
-               sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
+               sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
                r600_resource(ib.buffer)->TC_L2_dirty = false;
        }
 
@@ -860,7 +884,9 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
        /* Workaround for a VGT hang when streamout is enabled.
         * It must be done after drawing. */
-       if ((sctx->b.family == CHIP_HAWAII || sctx->b.family == CHIP_TONGA) &&
+       if ((sctx->b.family == CHIP_HAWAII ||
+            sctx->b.family == CHIP_TONGA ||
+            sctx->b.family == CHIP_FIJI) &&
            (sctx->b.streamout.streamout_enabled ||
             sctx->b.streamout.prims_gen_query_enabled)) {
                sctx->b.flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
@@ -896,10 +922,10 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
 void si_trace_emit(struct si_context *sctx)
 {
-       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+       struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 
        sctx->trace_id++;
-       radeon_add_to_buffer_list(&sctx->b, &sctx->b.rings.gfx, sctx->trace_buf,
+       radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, sctx->trace_buf,
                              RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
        radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0));
        radeon_emit(cs, S_370_DST_SEL(V_370_MEMORY_SYNC) |