/*
 * Copyright 2012 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_build_pm4.h"
#include "si_query.h"
#include "sid.h"
#include "util/fast_idiv_by_const.h"
#include "util/format/u_format.h"
#include "util/format/u_format_s3tc.h"
#include "util/u_dual_blend.h"
#include "util/u_memory.h"
#include "util/u_resource.h"
#include "util/u_upload_mgr.h"

struct gfx10_format {
   unsigned img_format : 9;

   /* Various formats are only supported with workarounds for vertex fetch,
    * and some 32_32_32 formats are supported natively, but only for buffers
    * (possibly with some image support, actually, but no filtering). */
   bool buffers_only : 1;
};

#include "gfx10_format_table.h"

static unsigned si_map_swizzle(unsigned swizzle)
{
   switch (swizzle) {
   case PIPE_SWIZZLE_Y:
      return V_008F0C_SQ_SEL_Y;
   case PIPE_SWIZZLE_Z:
      return V_008F0C_SQ_SEL_Z;
   case PIPE_SWIZZLE_W:
      return V_008F0C_SQ_SEL_W;
   case PIPE_SWIZZLE_0:
      return V_008F0C_SQ_SEL_0;
   case PIPE_SWIZZLE_1:
      return V_008F0C_SQ_SEL_1;
   default: /* PIPE_SWIZZLE_X */
      return V_008F0C_SQ_SEL_X;
   }
}

/* 12.4 fixed-point */
static unsigned si_pack_float_12p4(float x)
{
   return x <= 0 ? 0 : x >= 4096 ? 0xffff : x * 16;
}
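/* For illustration (12 integer bits, 4 fractional bits): 1.0f -> 0x10,
 * 2.5f -> 0x28, and anything >= 4096.0f clamps to 0xffff. */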

/*
 * Inferred framebuffer and blender state.
 *
 * CB_TARGET_MASK is emitted here to avoid a hang with dual source blending
 * if there are not enough PS outputs.
 */
static void si_emit_cb_render_state(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct si_state_blend *blend = sctx->queued.named.blend;
   /* CB_COLORn_INFO.FORMAT=INVALID should disable unbound colorbuffers,
    * but you never know. */
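   /* Both masks hold 4 bits (an RGBA writemask) per MRT, so MRT i occupies
    * bits [4*i+3 : 4*i]; that is what the "_4bit" suffixes refer to. */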
   uint32_t cb_target_mask = sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_mask;
   unsigned i;

   /* Avoid a hang that happens when dual source blending is enabled
    * but there are not enough color outputs. This is undefined behavior,
    * so disable color writes completely.
    *
    * Reproducible with Unigine Heaven 4.0 and drirc missing.
    */
   if (blend->dual_src_blend && sctx->ps_shader.cso &&
       (sctx->ps_shader.cso->info.colors_written & 0x3) != 0x3)
      cb_target_mask = 0;

   /* GFX9: Flush DFSM when CB_TARGET_MASK changes.
    * I think we don't have to do anything between IBs.
    */
   if (sctx->screen->dpbb_allowed && sctx->last_cb_target_mask != cb_target_mask) {
      sctx->last_cb_target_mask = cb_target_mask;

      radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
      radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
   }

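   /* The radeon_opt_set_context_reg* helpers skip writes whose value has not
    * changed, so comparing the command-stream write pointer before and after
    * tells us whether any context register was actually emitted (see the
    * context_roll update at the end of this function). */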
   unsigned initial_cdw = cs->current.cdw;
   radeon_opt_set_context_reg(sctx, R_028238_CB_TARGET_MASK, SI_TRACKED_CB_TARGET_MASK,
                              cb_target_mask);

   if (sctx->chip_class >= GFX8) {
      /* DCC MSAA workaround.
       * Alternatively, we can set CB_COLORi_DCC_CONTROL.OVERWRITE_COMBINER_DISABLE,
       * but that would be more complicated.
       */
      bool oc_disable =
         blend->dcc_msaa_corruption_4bit & cb_target_mask && sctx->framebuffer.nr_samples >= 2;
      unsigned watermark = sctx->framebuffer.dcc_overwrite_combiner_watermark;

      radeon_opt_set_context_reg(
         sctx, R_028424_CB_DCC_CONTROL, SI_TRACKED_CB_DCC_CONTROL,
         S_028424_OVERWRITE_COMBINER_MRT_SHARING_DISABLE(sctx->chip_class <= GFX9) |
            S_028424_OVERWRITE_COMBINER_WATERMARK(watermark) |
            S_028424_OVERWRITE_COMBINER_DISABLE(oc_disable) |
            S_028424_DISABLE_CONSTANT_ENCODE_REG(sctx->screen->info.has_dcc_constant_encode));
   }

   /* RB+ register settings. */
   if (sctx->screen->info.rbplus_allowed) {
      unsigned spi_shader_col_format =
         sctx->ps_shader.cso ? sctx->ps_shader.current->key.part.ps.epilog.spi_shader_col_format
                             : 0;
      unsigned sx_ps_downconvert = 0;
      unsigned sx_blend_opt_epsilon = 0;
      unsigned sx_blend_opt_control = 0;

      for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
         struct si_surface *surf = (struct si_surface *)sctx->framebuffer.state.cbufs[i];
         unsigned format, swap, spi_format, colormask;
         bool has_alpha, has_rgb;

         if (!surf) {
            /* If the color buffer is not set, the driver sets 32_R
             * as the SPI color format, because the hw doesn't allow
             * holes between color outputs, so also set this to
             * enable RB+.
             */
            sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
            continue;
         }

         format = G_028C70_FORMAT(surf->cb_color_info);
         swap = G_028C70_COMP_SWAP(surf->cb_color_info);
         spi_format = (spi_shader_col_format >> (i * 4)) & 0xf;
         colormask = (cb_target_mask >> (i * 4)) & 0xf;

         /* Set if RGB and A are present. */
         has_alpha = !G_028C74_FORCE_DST_ALPHA_1(surf->cb_color_attrib);

         if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_16 ||
             format == V_028C70_COLOR_32)
            has_rgb = !has_alpha;
         else
            has_rgb = true;

         /* Check the colormask and export format. */
         if (!(colormask & (PIPE_MASK_RGBA & ~PIPE_MASK_A)))
            has_rgb = false;
         if (!(colormask & PIPE_MASK_A))
            has_alpha = false;

         if (spi_format == V_028714_SPI_SHADER_ZERO) {
            has_rgb = false;
            has_alpha = false;
         }

         /* Disable value checking for disabled channels. */
         if (!has_rgb)
            sx_blend_opt_control |= S_02875C_MRT0_COLOR_OPT_DISABLE(1) << (i * 4);
         if (!has_alpha)
            sx_blend_opt_control |= S_02875C_MRT0_ALPHA_OPT_DISABLE(1) << (i * 4);

         /* Enable down-conversion for 32bpp and smaller formats. */
         switch (format) {
         case V_028C70_COLOR_8:
         case V_028C70_COLOR_8_8:
         case V_028C70_COLOR_8_8_8_8:
            /* For 1 and 2-channel formats, use the superset thereof. */
            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR ||
                spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
                spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_8_8_8_8 << (i * 4);
               sx_blend_opt_epsilon |= V_028758_8BIT_FORMAT << (i * 4);
            }
            break;

         case V_028C70_COLOR_5_6_5:
            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_5_6_5 << (i * 4);
               sx_blend_opt_epsilon |= V_028758_6BIT_FORMAT << (i * 4);
            }
            break;

         case V_028C70_COLOR_1_5_5_5:
            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_1_5_5_5 << (i * 4);
               sx_blend_opt_epsilon |= V_028758_5BIT_FORMAT << (i * 4);
            }
            break;

         case V_028C70_COLOR_4_4_4_4:
            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_4_4_4_4 << (i * 4);
               sx_blend_opt_epsilon |= V_028758_4BIT_FORMAT << (i * 4);
            }
            break;

         case V_028C70_COLOR_32:
            if (swap == V_028C70_SWAP_STD && spi_format == V_028714_SPI_SHADER_32_R)
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_R << (i * 4);
            else if (swap == V_028C70_SWAP_ALT_REV && spi_format == V_028714_SPI_SHADER_32_AR)
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_32_A << (i * 4);
            break;

         case V_028C70_COLOR_16:
         case V_028C70_COLOR_16_16:
            /* For 1-channel formats, use the superset thereof. */
            if (spi_format == V_028714_SPI_SHADER_UNORM16_ABGR ||
                spi_format == V_028714_SPI_SHADER_SNORM16_ABGR ||
                spi_format == V_028714_SPI_SHADER_UINT16_ABGR ||
                spi_format == V_028714_SPI_SHADER_SINT16_ABGR) {
               if (swap == V_028C70_SWAP_STD || swap == V_028C70_SWAP_STD_REV)
                  sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_GR << (i * 4);
               else
                  sx_ps_downconvert |= V_028754_SX_RT_EXPORT_16_16_AR << (i * 4);
            }
            break;

         case V_028C70_COLOR_10_11_11:
            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR)
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_10_11_11 << (i * 4);
            break;

         case V_028C70_COLOR_2_10_10_10:
            if (spi_format == V_028714_SPI_SHADER_FP16_ABGR) {
               sx_ps_downconvert |= V_028754_SX_RT_EXPORT_2_10_10_10 << (i * 4);
               sx_blend_opt_epsilon |= V_028758_10BIT_FORMAT << (i * 4);
            }
            break;
         }
      }

      /* If there are no color outputs, the first color export is
       * always enabled as 32_R, so also set this to enable RB+.
       */
      if (!sx_ps_downconvert)
         sx_ps_downconvert = V_028754_SX_RT_EXPORT_32_R;

      /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
      radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT, SI_TRACKED_SX_PS_DOWNCONVERT,
                                  sx_ps_downconvert, sx_blend_opt_epsilon, sx_blend_opt_control);
   }
   if (initial_cdw != cs->current.cdw)
      sctx->context_roll = true;
}

/*
 * Blend functions
 */

static uint32_t si_translate_blend_function(int blend_func)
{
   switch (blend_func) {
   case PIPE_BLEND_ADD:
      return V_028780_COMB_DST_PLUS_SRC;
   case PIPE_BLEND_SUBTRACT:
      return V_028780_COMB_SRC_MINUS_DST;
   case PIPE_BLEND_REVERSE_SUBTRACT:
      return V_028780_COMB_DST_MINUS_SRC;
   case PIPE_BLEND_MIN:
      return V_028780_COMB_MIN_DST_SRC;
   case PIPE_BLEND_MAX:
      return V_028780_COMB_MAX_DST_SRC;
   default:
      PRINT_ERR("Unknown blend function %d\n", blend_func);
      assert(0);
      break;
   }
   return 0;
}

static uint32_t si_translate_blend_factor(int blend_fact)
{
   switch (blend_fact) {
   case PIPE_BLENDFACTOR_ONE:
      return V_028780_BLEND_ONE;
   case PIPE_BLENDFACTOR_SRC_COLOR:
      return V_028780_BLEND_SRC_COLOR;
   case PIPE_BLENDFACTOR_SRC_ALPHA:
      return V_028780_BLEND_SRC_ALPHA;
   case PIPE_BLENDFACTOR_DST_ALPHA:
      return V_028780_BLEND_DST_ALPHA;
   case PIPE_BLENDFACTOR_DST_COLOR:
      return V_028780_BLEND_DST_COLOR;
   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
      return V_028780_BLEND_SRC_ALPHA_SATURATE;
   case PIPE_BLENDFACTOR_CONST_COLOR:
      return V_028780_BLEND_CONSTANT_COLOR;
   case PIPE_BLENDFACTOR_CONST_ALPHA:
      return V_028780_BLEND_CONSTANT_ALPHA;
   case PIPE_BLENDFACTOR_ZERO:
      return V_028780_BLEND_ZERO;
   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
      return V_028780_BLEND_ONE_MINUS_SRC_COLOR;
   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
      return V_028780_BLEND_ONE_MINUS_SRC_ALPHA;
   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
      return V_028780_BLEND_ONE_MINUS_DST_ALPHA;
   case PIPE_BLENDFACTOR_INV_DST_COLOR:
      return V_028780_BLEND_ONE_MINUS_DST_COLOR;
   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
      return V_028780_BLEND_ONE_MINUS_CONSTANT_COLOR;
   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
      return V_028780_BLEND_ONE_MINUS_CONSTANT_ALPHA;
   case PIPE_BLENDFACTOR_SRC1_COLOR:
      return V_028780_BLEND_SRC1_COLOR;
   case PIPE_BLENDFACTOR_SRC1_ALPHA:
      return V_028780_BLEND_SRC1_ALPHA;
   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
      return V_028780_BLEND_INV_SRC1_COLOR;
   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
      return V_028780_BLEND_INV_SRC1_ALPHA;
   default:
      PRINT_ERR("Bad blend factor %d not supported!\n", blend_fact);
      assert(0);
      break;
   }
   return 0;
}

static uint32_t si_translate_blend_opt_function(int blend_func)
{
   switch (blend_func) {
   case PIPE_BLEND_ADD:
      return V_028760_OPT_COMB_ADD;
   case PIPE_BLEND_SUBTRACT:
      return V_028760_OPT_COMB_SUBTRACT;
   case PIPE_BLEND_REVERSE_SUBTRACT:
      return V_028760_OPT_COMB_REVSUBTRACT;
   case PIPE_BLEND_MIN:
      return V_028760_OPT_COMB_MIN;
   case PIPE_BLEND_MAX:
      return V_028760_OPT_COMB_MAX;
   default:
      return V_028760_OPT_COMB_BLEND_DISABLED;
   }
}

static uint32_t si_translate_blend_opt_factor(int blend_fact, bool is_alpha)
{
   switch (blend_fact) {
   case PIPE_BLENDFACTOR_ZERO:
      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_ALL;
   case PIPE_BLENDFACTOR_ONE:
      return V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE;
   case PIPE_BLENDFACTOR_SRC_COLOR:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0
                      : V_028760_BLEND_OPT_PRESERVE_C1_IGNORE_C0;
   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1
                      : V_028760_BLEND_OPT_PRESERVE_C0_IGNORE_C1;
   case PIPE_BLENDFACTOR_SRC_ALPHA:
      return V_028760_BLEND_OPT_PRESERVE_A1_IGNORE_A0;
   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
      return V_028760_BLEND_OPT_PRESERVE_A0_IGNORE_A1;
   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
      return is_alpha ? V_028760_BLEND_OPT_PRESERVE_ALL_IGNORE_NONE
                      : V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;
   default:
      return V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
   }
}

static void si_blend_check_commutativity(struct si_screen *sscreen, struct si_state_blend *blend,
                                         enum pipe_blend_func func, enum pipe_blendfactor src,
                                         enum pipe_blendfactor dst, unsigned chanmask)
{
   /* Src factor is allowed when it does not depend on Dst */
   static const uint32_t src_allowed =
      (1u << PIPE_BLENDFACTOR_ONE) | (1u << PIPE_BLENDFACTOR_SRC_COLOR) |
      (1u << PIPE_BLENDFACTOR_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE) |
      (1u << PIPE_BLENDFACTOR_CONST_COLOR) | (1u << PIPE_BLENDFACTOR_CONST_ALPHA) |
      (1u << PIPE_BLENDFACTOR_SRC1_COLOR) | (1u << PIPE_BLENDFACTOR_SRC1_ALPHA) |
      (1u << PIPE_BLENDFACTOR_ZERO) | (1u << PIPE_BLENDFACTOR_INV_SRC_COLOR) |
      (1u << PIPE_BLENDFACTOR_INV_SRC_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_CONST_COLOR) |
      (1u << PIPE_BLENDFACTOR_INV_CONST_ALPHA) | (1u << PIPE_BLENDFACTOR_INV_SRC1_COLOR) |
      (1u << PIPE_BLENDFACTOR_INV_SRC1_ALPHA);

   if (dst == PIPE_BLENDFACTOR_ONE && (src_allowed & (1u << src))) {
      /* Addition is commutative, but floating point addition isn't
       * associative: subtle changes can be introduced via different
       * rounding.
       *
       * Out-of-order is also non-deterministic, which means that
       * this breaks OpenGL invariance requirements. So only enable
       * out-of-order additive blending if explicitly allowed by a
       * setting.
       */
      if (func == PIPE_BLEND_MAX || func == PIPE_BLEND_MIN ||
          (func == PIPE_BLEND_ADD && sscreen->commutative_blend_add))
         blend->commutative_4bit |= chanmask;
   }
}
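/* For intuition: with func = ADD, src = SRC_ALPHA and dst = ONE, two
 * fragments A and B hitting the same pixel produce dst + aA*cA + aB*cB,
 * which is the same in either order up to float rounding. Any dst factor
 * other than ONE makes each blend step read the previous result, so the
 * order would matter. */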

/**
 * Get rid of DST in the blend factors by commuting the operands:
 *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
 */
static void si_blend_remove_dst(unsigned *func, unsigned *src_factor, unsigned *dst_factor,
                                unsigned expected_dst, unsigned replacement_src)
{
   if (*src_factor == expected_dst && *dst_factor == PIPE_BLENDFACTOR_ZERO) {
      *src_factor = PIPE_BLENDFACTOR_ZERO;
      *dst_factor = replacement_src;

      /* Commuting the operands requires reversing subtractions. */
      if (*func == PIPE_BLEND_SUBTRACT)
         *func = PIPE_BLEND_REVERSE_SUBTRACT;
      else if (*func == PIPE_BLEND_REVERSE_SUBTRACT)
         *func = PIPE_BLEND_SUBTRACT;
   }
}
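/* For example, SUBTRACT(src * DST_COLOR, dst * ZERO) computes src*dst - 0;
 * after the rewrite, REVERSE_SUBTRACT(src * ZERO, dst * SRC_COLOR) computes
 * dst*src - 0, the same value without any DST factor. */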

static bool si_blend_factor_uses_dst(unsigned factor)
{
   return factor == PIPE_BLENDFACTOR_DST_COLOR || factor == PIPE_BLENDFACTOR_DST_ALPHA ||
          factor == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
          factor == PIPE_BLENDFACTOR_INV_DST_ALPHA || factor == PIPE_BLENDFACTOR_INV_DST_COLOR;
}

static void *si_create_blend_state_mode(struct pipe_context *ctx,
                                        const struct pipe_blend_state *state, unsigned mode)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_blend *blend = CALLOC_STRUCT(si_state_blend);
   struct si_pm4_state *pm4 = &blend->pm4;
   uint32_t sx_mrt_blend_opt[8] = {0};
   uint32_t color_control = 0;
   bool logicop_enable = state->logicop_enable && state->logicop_func != PIPE_LOGICOP_COPY;

   if (!blend)
      return NULL;

   blend->alpha_to_coverage = state->alpha_to_coverage;
   blend->alpha_to_one = state->alpha_to_one;
   blend->dual_src_blend = util_blend_state_is_dual(state, 0);
   blend->logicop_enable = logicop_enable;

   if (logicop_enable) {
      color_control |= S_028808_ROP3(state->logicop_func | (state->logicop_func << 4));
   } else {
      color_control |= S_028808_ROP3(0xcc);
   }

   si_pm4_set_reg(pm4, R_028B70_DB_ALPHA_TO_MASK,
                  S_028B70_ALPHA_TO_MASK_ENABLE(state->alpha_to_coverage) |
                     S_028B70_ALPHA_TO_MASK_OFFSET0(3) | S_028B70_ALPHA_TO_MASK_OFFSET1(1) |
                     S_028B70_ALPHA_TO_MASK_OFFSET2(0) | S_028B70_ALPHA_TO_MASK_OFFSET3(2) |
                     S_028B70_OFFSET_ROUND(1));

   if (state->alpha_to_coverage)
      blend->need_src_alpha_4bit |= 0xf;

   blend->cb_target_mask = 0;
   blend->cb_target_enabled_4bit = 0;

   for (int i = 0; i < 8; i++) {
      /* state->rt entries > 0 are only valid when independent blending
       * is enabled. */
      const int j = state->independent_blend_enable ? i : 0;

      unsigned eqRGB = state->rt[j].rgb_func;
      unsigned srcRGB = state->rt[j].rgb_src_factor;
      unsigned dstRGB = state->rt[j].rgb_dst_factor;
      unsigned eqA = state->rt[j].alpha_func;
      unsigned srcA = state->rt[j].alpha_src_factor;
      unsigned dstA = state->rt[j].alpha_dst_factor;

      unsigned srcRGB_opt, dstRGB_opt, srcA_opt, dstA_opt;
      unsigned blend_cntl = 0;

      sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
                            S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);

      /* Only set dual source blending for MRT0 to avoid a hang. */
      if (i >= 1 && blend->dual_src_blend) {
         /* Vulkan does this for dual source blending. */
         if (i == 1)
            blend_cntl |= S_028780_ENABLE(1);

         si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
         continue;
      }

      /* Only addition and subtraction equations are supported with
       * dual source blending.
       */
      if (blend->dual_src_blend && (eqRGB == PIPE_BLEND_MIN || eqRGB == PIPE_BLEND_MAX ||
                                    eqA == PIPE_BLEND_MIN || eqA == PIPE_BLEND_MAX)) {
         assert(!"Unsupported equation for dual source blending");
         si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
         continue;
      }

      /* cb_render_state will disable unused ones */
      blend->cb_target_mask |= (unsigned)state->rt[j].colormask << (4 * i);
      if (state->rt[j].colormask)
         blend->cb_target_enabled_4bit |= 0xf << (4 * i);

      if (!state->rt[j].colormask || !state->rt[j].blend_enable) {
         si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);
         continue;
      }

      si_blend_check_commutativity(sctx->screen, blend, eqRGB, srcRGB, dstRGB, 0x7 << (4 * i));
      si_blend_check_commutativity(sctx->screen, blend, eqA, srcA, dstA, 0x8 << (4 * i));

      /* Blending optimizations for RB+.
       * These transformations don't change the behavior.
       *
       * First, get rid of DST in the blend factors:
       *    func(src * DST, dst * 0) ---> func(src * 0, dst * SRC)
       */
      si_blend_remove_dst(&eqRGB, &srcRGB, &dstRGB, PIPE_BLENDFACTOR_DST_COLOR,
                          PIPE_BLENDFACTOR_SRC_COLOR);
      si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_COLOR,
                          PIPE_BLENDFACTOR_SRC_COLOR);
      si_blend_remove_dst(&eqA, &srcA, &dstA, PIPE_BLENDFACTOR_DST_ALPHA,
                          PIPE_BLENDFACTOR_SRC_ALPHA);

      /* Look up the ideal settings from tables. */
      srcRGB_opt = si_translate_blend_opt_factor(srcRGB, false);
      dstRGB_opt = si_translate_blend_opt_factor(dstRGB, false);
      srcA_opt = si_translate_blend_opt_factor(srcA, true);
      dstA_opt = si_translate_blend_opt_factor(dstA, true);

      /* Handle interdependencies. */
      if (si_blend_factor_uses_dst(srcRGB))
         dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;
      if (si_blend_factor_uses_dst(srcA))
         dstA_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_NONE;

      if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE &&
          (dstRGB == PIPE_BLENDFACTOR_ZERO || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
           dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE))
         dstRGB_opt = V_028760_BLEND_OPT_PRESERVE_NONE_IGNORE_A0;

      /* Set the final value. */
      sx_mrt_blend_opt[i] = S_028760_COLOR_SRC_OPT(srcRGB_opt) |
                            S_028760_COLOR_DST_OPT(dstRGB_opt) |
                            S_028760_COLOR_COMB_FCN(si_translate_blend_opt_function(eqRGB)) |
                            S_028760_ALPHA_SRC_OPT(srcA_opt) | S_028760_ALPHA_DST_OPT(dstA_opt) |
                            S_028760_ALPHA_COMB_FCN(si_translate_blend_opt_function(eqA));

      /* Set blend state. */
      blend_cntl |= S_028780_ENABLE(1);
      blend_cntl |= S_028780_COLOR_COMB_FCN(si_translate_blend_function(eqRGB));
      blend_cntl |= S_028780_COLOR_SRCBLEND(si_translate_blend_factor(srcRGB));
      blend_cntl |= S_028780_COLOR_DESTBLEND(si_translate_blend_factor(dstRGB));

      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) {
         blend_cntl |= S_028780_SEPARATE_ALPHA_BLEND(1);
         blend_cntl |= S_028780_ALPHA_COMB_FCN(si_translate_blend_function(eqA));
         blend_cntl |= S_028780_ALPHA_SRCBLEND(si_translate_blend_factor(srcA));
         blend_cntl |= S_028780_ALPHA_DESTBLEND(si_translate_blend_factor(dstA));
      }
      si_pm4_set_reg(pm4, R_028780_CB_BLEND0_CONTROL + i * 4, blend_cntl);

      blend->blend_enable_4bit |= 0xfu << (i * 4);

      if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14)
         blend->dcc_msaa_corruption_4bit |= 0xfu << (i * 4);

      /* This is only important for formats without alpha. */
      if (srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA ||
          srcRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
          dstRGB == PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE ||
          srcRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA || dstRGB == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
         blend->need_src_alpha_4bit |= 0xfu << (i * 4);
   }

   if (sctx->chip_class >= GFX8 && sctx->family <= CHIP_NAVI14 && logicop_enable)
      blend->dcc_msaa_corruption_4bit |= blend->cb_target_enabled_4bit;

   if (blend->cb_target_mask) {
      color_control |= S_028808_MODE(mode);
   } else {
      color_control |= S_028808_MODE(V_028808_CB_DISABLE);
   }

   if (sctx->screen->info.rbplus_allowed) {
      /* Disable RB+ blend optimizations for dual source blending.
       * Vulkan does this.
       */
      if (blend->dual_src_blend) {
         for (int i = 0; i < 8; i++) {
            sx_mrt_blend_opt[i] = S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_NONE) |
                                  S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_NONE);
         }
      }

      for (int i = 0; i < 8; i++)
         si_pm4_set_reg(pm4, R_028760_SX_MRT0_BLEND_OPT + i * 4, sx_mrt_blend_opt[i]);

      /* RB+ doesn't work with dual source blending, logic op, and RESOLVE. */
      if (blend->dual_src_blend || logicop_enable || mode == V_028808_CB_RESOLVE)
         color_control |= S_028808_DISABLE_DUAL_QUAD(1);
   }

   si_pm4_set_reg(pm4, R_028808_CB_COLOR_CONTROL, color_control);
   return blend;
}

static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_blend_state *state)
{
   return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
}

static void si_bind_blend_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_blend *old_blend = sctx->queued.named.blend;
   struct si_state_blend *blend = (struct si_state_blend *)state;

   if (!blend)
      blend = (struct si_state_blend *)sctx->noop_blend;

   si_pm4_bind_state(sctx, blend, blend);

   if (old_blend->cb_target_mask != blend->cb_target_mask ||
       old_blend->dual_src_blend != blend->dual_src_blend ||
       (old_blend->dcc_msaa_corruption_4bit != blend->dcc_msaa_corruption_4bit &&
        sctx->framebuffer.nr_samples >= 2 && sctx->screen->dcc_msaa_allowed))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);

   if (old_blend->cb_target_mask != blend->cb_target_mask ||
       old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
       old_blend->alpha_to_one != blend->alpha_to_one ||
       old_blend->dual_src_blend != blend->dual_src_blend ||
       old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
       old_blend->need_src_alpha_4bit != blend->need_src_alpha_4bit)
      sctx->do_update_shaders = true;

   if (sctx->screen->dpbb_allowed &&
       (old_blend->alpha_to_coverage != blend->alpha_to_coverage ||
        old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
        old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);

   if (sctx->screen->has_out_of_order_rast &&
       (old_blend->blend_enable_4bit != blend->blend_enable_4bit ||
        old_blend->cb_target_enabled_4bit != blend->cb_target_enabled_4bit ||
        old_blend->commutative_4bit != blend->commutative_4bit ||
        old_blend->logicop_enable != blend->logicop_enable))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
}

static void si_delete_blend_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;

   if (sctx->queued.named.blend == state)
      si_bind_blend_state(ctx, sctx->noop_blend);

   si_pm4_delete_state(sctx, blend, (struct si_state_blend *)state);
}

static void si_set_blend_color(struct pipe_context *ctx, const struct pipe_blend_color *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   static const struct pipe_blend_color zeros;

   sctx->blend_color.state = *state;
   sctx->blend_color.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.blend_color);
}

static void si_emit_blend_color(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   radeon_set_context_reg_seq(cs, R_028414_CB_BLEND_RED, 4);
   radeon_emit_array(cs, (uint32_t *)sctx->blend_color.state.color, 4);
}

/*
 * Clipping
 */

static void si_set_clip_state(struct pipe_context *ctx, const struct pipe_clip_state *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct pipe_constant_buffer cb;
   static const struct pipe_clip_state zeros;

   if (memcmp(&sctx->clip_state.state, state, sizeof(*state)) == 0)
      return;

   sctx->clip_state.state = *state;
   sctx->clip_state.any_nonzeros = memcmp(state, &zeros, sizeof(*state)) != 0;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_state);

   cb.buffer = NULL;
   cb.user_buffer = state->ucp;
   cb.buffer_offset = 0;
   cb.buffer_size = 4 * 4 * 8;
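   /* 8 user clip planes x 4 floats x 4 bytes = 128 bytes. */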
   si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb);
   pipe_resource_reference(&cb.buffer, NULL);
}

static void si_emit_clip_state(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;

   radeon_set_context_reg_seq(cs, R_0285BC_PA_CL_UCP_0_X, 6 * 4);
   radeon_emit_array(cs, (uint32_t *)sctx->clip_state.state.ucp, 6 * 4);
}

static void si_emit_clip_regs(struct si_context *sctx)
{
   struct si_shader *vs = si_get_vs_state(sctx);
   struct si_shader_selector *vs_sel = vs->selector;
   struct si_shader_info *info = &vs_sel->info;
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
   unsigned window_space = info->properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
   unsigned clipdist_mask = vs_sel->clipdist_mask;
   unsigned ucp_mask = clipdist_mask ? 0 : rs->clip_plane_enable & SIX_BITS;
   unsigned culldist_mask = vs_sel->culldist_mask;
   unsigned total_mask;

   if (vs->key.opt.clip_disable) {
      assert(!info->culldist_writemask);
      clipdist_mask = 0;
      culldist_mask = 0;
   }
   total_mask = clipdist_mask | culldist_mask;

   /* Clip distances on points have no effect, so need to be implemented
    * as cull distances. This applies for the clipvertex case as well.
    *
    * Setting this for primitives other than points should have no adverse
    * effects.
    */
   clipdist_mask &= rs->clip_plane_enable;
   culldist_mask |= clipdist_mask;

   unsigned initial_cdw = sctx->gfx_cs->current.cdw;
   unsigned pa_cl_cntl = S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
                         S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
                         clipdist_mask | (culldist_mask << 8);

   if (sctx->chip_class >= GFX10) {
      radeon_opt_set_context_reg_rmw(sctx, R_02881C_PA_CL_VS_OUT_CNTL,
                                     SI_TRACKED_PA_CL_VS_OUT_CNTL__CL, pa_cl_cntl,
                                     ~SI_TRACKED_PA_CL_VS_OUT_CNTL__VS_MASK);
   } else {
      radeon_opt_set_context_reg(sctx, R_02881C_PA_CL_VS_OUT_CNTL, SI_TRACKED_PA_CL_VS_OUT_CNTL__CL,
                                 vs_sel->pa_cl_vs_out_cntl | pa_cl_cntl);
   }
   radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL, SI_TRACKED_PA_CL_CLIP_CNTL,
                              rs->pa_cl_clip_cntl | ucp_mask | S_028810_CLIP_DISABLE(window_space));

   if (initial_cdw != sctx->gfx_cs->current.cdw)
      sctx->context_roll = true;
}

/*
 * Inferred state between framebuffer and rasterizer
 */
static void si_update_poly_offset_state(struct si_context *sctx)
{
   struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;

   if (!rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
      si_pm4_bind_state(sctx, poly_offset, NULL);
      return;
   }

   /* Use the user format, not db_render_format, so that the polygon
    * offset behaves as expected by applications.
    */
   switch (sctx->framebuffer.state.zsbuf->texture->format) {
   case PIPE_FORMAT_Z16_UNORM:
      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[0]);
      break;
   default: /* 24-bit */
      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[1]);
      break;
   case PIPE_FORMAT_Z32_FLOAT:
   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
      si_pm4_bind_state(sctx, poly_offset, &rs->pm4_poly_offset[2]);
      break;
   }
}

/*
 * Rasterizer
 */

static uint32_t si_translate_fill(uint32_t func)
{
   switch (func) {
   case PIPE_POLYGON_MODE_FILL:
      return V_028814_X_DRAW_TRIANGLES;
   case PIPE_POLYGON_MODE_LINE:
      return V_028814_X_DRAW_LINES;
   case PIPE_POLYGON_MODE_POINT:
      return V_028814_X_DRAW_POINTS;
   default:
      assert(0);
      return V_028814_X_DRAW_POINTS;
   }
}

static void *si_create_rs_state(struct pipe_context *ctx, const struct pipe_rasterizer_state *state)
{
   struct si_screen *sscreen = ((struct si_context *)ctx)->screen;
   struct si_state_rasterizer *rs = CALLOC_STRUCT(si_state_rasterizer);
   struct si_pm4_state *pm4 = &rs->pm4;
   unsigned tmp, i;
   float psize_min, psize_max;

   if (!rs) {
      return NULL;
   }

   if (!state->front_ccw) {
      rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT);
      rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK);
   } else {
      rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT);
      rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK);
   }
   rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far;
   rs->provoking_vertex_first = state->flatshade_first;
   rs->scissor_enable = state->scissor;
   rs->clip_halfz = state->clip_halfz;
   rs->two_side = state->light_twoside;
   rs->multisample_enable = state->multisample;
   rs->force_persample_interp = state->force_persample_interp;
   rs->clip_plane_enable = state->clip_plane_enable;
   rs->half_pixel_center = state->half_pixel_center;
   rs->line_stipple_enable = state->line_stipple_enable;
   rs->poly_stipple_enable = state->poly_stipple_enable;
   rs->line_smooth = state->line_smooth;
   rs->line_width = state->line_width;
   rs->poly_smooth = state->poly_smooth;
   rs->uses_poly_offset = state->offset_point || state->offset_line || state->offset_tri;
   rs->clamp_fragment_color = state->clamp_fragment_color;
   rs->clamp_vertex_color = state->clamp_vertex_color;
   rs->flatshade = state->flatshade;
   rs->flatshade_first = state->flatshade_first;
   rs->sprite_coord_enable = state->sprite_coord_enable;
   rs->rasterizer_discard = state->rasterizer_discard;
   rs->polygon_mode_enabled =
      (state->fill_front != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_FRONT)) ||
      (state->fill_back != PIPE_POLYGON_MODE_FILL && !(state->cull_face & PIPE_FACE_BACK));
   rs->polygon_mode_is_lines =
      (state->fill_front == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_FRONT)) ||
      (state->fill_back == PIPE_POLYGON_MODE_LINE && !(state->cull_face & PIPE_FACE_BACK));
   rs->pa_sc_line_stipple = state->line_stipple_enable
                               ? S_028A0C_LINE_PATTERN(state->line_stipple_pattern) |
                                    S_028A0C_REPEAT_COUNT(state->line_stipple_factor)
                               : 0;
   rs->pa_cl_clip_cntl = S_028810_DX_CLIP_SPACE_DEF(state->clip_halfz) |
                         S_028810_ZCLIP_NEAR_DISABLE(!state->depth_clip_near) |
                         S_028810_ZCLIP_FAR_DISABLE(!state->depth_clip_far) |
                         S_028810_DX_RASTERIZATION_KILL(state->rasterizer_discard) |
                         S_028810_DX_LINEAR_ATTR_CLIP_ENA(1);

   si_pm4_set_reg(
      pm4, R_0286D4_SPI_INTERP_CONTROL_0,
      S_0286D4_FLAT_SHADE_ENA(1) | S_0286D4_PNT_SPRITE_ENA(state->point_quad_rasterization) |
         S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
         S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
         S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
         S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
         S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));

   /* The hw point size is the half-size in 12.4 fixed point, hence
    * point_size / 2 * 16 = point_size * 8 (see "0.5 = 1 pixel" below). */
   tmp = (unsigned)(state->point_size * 8.0);
   si_pm4_set_reg(pm4, R_028A00_PA_SU_POINT_SIZE, S_028A00_HEIGHT(tmp) | S_028A00_WIDTH(tmp));
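   /* e.g. state->point_size = 2.0 gives tmp = 16: a half-size of one pixel
    * encoded in 12.4 fixed point. */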

   if (state->point_size_per_vertex) {
      psize_min = util_get_min_point_size(state);
      psize_max = SI_MAX_POINT_SIZE;
   } else {
      /* Force the point size to be as if the vertex output was disabled. */
      psize_min = state->point_size;
      psize_max = state->point_size;
   }
   rs->max_point_size = psize_max;

   /* Divide by two, because 0.5 = 1 pixel. */
   si_pm4_set_reg(pm4, R_028A04_PA_SU_POINT_MINMAX,
                  S_028A04_MIN_SIZE(si_pack_float_12p4(psize_min / 2)) |
                     S_028A04_MAX_SIZE(si_pack_float_12p4(psize_max / 2)));

   si_pm4_set_reg(pm4, R_028A08_PA_SU_LINE_CNTL,
                  S_028A08_WIDTH(si_pack_float_12p4(state->line_width / 2)));
   si_pm4_set_reg(
      pm4, R_028A48_PA_SC_MODE_CNTL_0,
      S_028A48_LINE_STIPPLE_ENABLE(state->line_stipple_enable) |
         S_028A48_MSAA_ENABLE(state->multisample || state->poly_smooth || state->line_smooth) |
         S_028A48_VPORT_SCISSOR_ENABLE(1) |
         S_028A48_ALTERNATE_RBS_PER_TILE(sscreen->info.chip_class >= GFX9));

   si_pm4_set_reg(pm4, R_028B7C_PA_SU_POLY_OFFSET_CLAMP, fui(state->offset_clamp));
   si_pm4_set_reg(pm4, R_028814_PA_SU_SC_MODE_CNTL,
                  S_028814_PROVOKING_VTX_LAST(!state->flatshade_first) |
                     S_028814_CULL_FRONT((state->cull_face & PIPE_FACE_FRONT) ? 1 : 0) |
                     S_028814_CULL_BACK((state->cull_face & PIPE_FACE_BACK) ? 1 : 0) |
                     S_028814_FACE(!state->front_ccw) |
                     S_028814_POLY_OFFSET_FRONT_ENABLE(util_get_offset(state, state->fill_front)) |
                     S_028814_POLY_OFFSET_BACK_ENABLE(util_get_offset(state, state->fill_back)) |
                     S_028814_POLY_OFFSET_PARA_ENABLE(state->offset_point || state->offset_line) |
                     S_028814_POLY_MODE(rs->polygon_mode_enabled) |
                     S_028814_POLYMODE_FRONT_PTYPE(si_translate_fill(state->fill_front)) |
                     S_028814_POLYMODE_BACK_PTYPE(si_translate_fill(state->fill_back)));

   if (!rs->uses_poly_offset)
      return rs;

   rs->pm4_poly_offset = CALLOC(3, sizeof(struct si_pm4_state));
   if (!rs->pm4_poly_offset) {
      FREE(rs);
      return NULL;
   }

   /* Precalculate polygon offset states for 16-bit, 24-bit, and 32-bit zbuffers. */
   for (i = 0; i < 3; i++) {
      struct si_pm4_state *pm4 = &rs->pm4_poly_offset[i];
      float offset_units = state->offset_units;
      float offset_scale = state->offset_scale * 16.0f;
      uint32_t pa_su_poly_offset_db_fmt_cntl = 0;

      if (!state->offset_units_unscaled) {
         switch (i) {
         case 0: /* 16-bit zbuffer */
            offset_units *= 4.0f;
            pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-16);
            break;
         case 1: /* 24-bit zbuffer */
            offset_units *= 2.0f;
            pa_su_poly_offset_db_fmt_cntl = S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-24);
            break;
         case 2: /* 32-bit zbuffer */
            offset_units *= 1.0f;
            pa_su_poly_offset_db_fmt_cntl =
               S_028B78_POLY_OFFSET_NEG_NUM_DB_BITS(-23) | S_028B78_POLY_OFFSET_DB_IS_FLOAT_FMT(1);
            break;
         }
      }

      si_pm4_set_reg(pm4, R_028B80_PA_SU_POLY_OFFSET_FRONT_SCALE, fui(offset_scale));
      si_pm4_set_reg(pm4, R_028B84_PA_SU_POLY_OFFSET_FRONT_OFFSET, fui(offset_units));
      si_pm4_set_reg(pm4, R_028B88_PA_SU_POLY_OFFSET_BACK_SCALE, fui(offset_scale));
      si_pm4_set_reg(pm4, R_028B8C_PA_SU_POLY_OFFSET_BACK_OFFSET, fui(offset_units));
      si_pm4_set_reg(pm4, R_028B78_PA_SU_POLY_OFFSET_DB_FMT_CNTL, pa_su_poly_offset_db_fmt_cntl);
   }

   return rs;
}

static void si_bind_rs_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_rasterizer *old_rs = (struct si_state_rasterizer *)sctx->queued.named.rasterizer;
   struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;

   if (!rs)
      rs = (struct si_state_rasterizer *)sctx->discard_rasterizer_state;

   if (old_rs->multisample_enable != rs->multisample_enable) {
      si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);

      /* Update the small primitive filter workaround if necessary. */
      if (sctx->screen->info.has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1)
         si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
   }

   sctx->current_vs_state &= C_VS_STATE_CLAMP_VERTEX_COLOR;
   sctx->current_vs_state |= S_VS_STATE_CLAMP_VERTEX_COLOR(rs->clamp_vertex_color);

   si_pm4_bind_state(sctx, rasterizer, rs);
   si_update_poly_offset_state(sctx);

   if (old_rs->scissor_enable != rs->scissor_enable)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.scissors);

   if (old_rs->line_width != rs->line_width || old_rs->max_point_size != rs->max_point_size ||
       old_rs->half_pixel_center != rs->half_pixel_center)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.guardband);

   if (old_rs->clip_halfz != rs->clip_halfz)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.viewports);

   if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
       old_rs->pa_cl_clip_cntl != rs->pa_cl_clip_cntl)
      si_mark_atom_dirty(sctx, &sctx->atoms.s.clip_regs);

   if (old_rs->clip_plane_enable != rs->clip_plane_enable ||
       old_rs->rasterizer_discard != rs->rasterizer_discard ||
       old_rs->sprite_coord_enable != rs->sprite_coord_enable ||
       old_rs->flatshade != rs->flatshade || old_rs->two_side != rs->two_side ||
       old_rs->multisample_enable != rs->multisample_enable ||
       old_rs->poly_stipple_enable != rs->poly_stipple_enable ||
       old_rs->poly_smooth != rs->poly_smooth || old_rs->line_smooth != rs->line_smooth ||
       old_rs->clamp_fragment_color != rs->clamp_fragment_color ||
       old_rs->force_persample_interp != rs->force_persample_interp)
      sctx->do_update_shaders = true;
}

static void si_delete_rs_state(struct pipe_context *ctx, void *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_rasterizer *rs = (struct si_state_rasterizer *)state;

   if (sctx->queued.named.rasterizer == state)
      si_bind_rs_state(ctx, sctx->discard_rasterizer_state);

   FREE(rs->pm4_poly_offset);
   si_pm4_delete_state(sctx, rasterizer, rs);
}

/*
 * Inferred state between DSA and stencil ref
 */
static void si_emit_stencil_ref(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = sctx->gfx_cs;
   struct pipe_stencil_ref *ref = &sctx->stencil_ref.state;
   struct si_dsa_stencil_ref_part *dsa = &sctx->stencil_ref.dsa_part;

   radeon_set_context_reg_seq(cs, R_028430_DB_STENCILREFMASK, 2);
   radeon_emit(cs, S_028430_STENCILTESTVAL(ref->ref_value[0]) |
                      S_028430_STENCILMASK(dsa->valuemask[0]) |
                      S_028430_STENCILWRITEMASK(dsa->writemask[0]) | S_028430_STENCILOPVAL(1));
   radeon_emit(cs, S_028434_STENCILTESTVAL_BF(ref->ref_value[1]) |
                      S_028434_STENCILMASK_BF(dsa->valuemask[1]) |
                      S_028434_STENCILWRITEMASK_BF(dsa->writemask[1]) |
                      S_028434_STENCILOPVAL_BF(1));
}

static void si_set_stencil_ref(struct pipe_context *ctx, const struct pipe_stencil_ref *state)
{
   struct si_context *sctx = (struct si_context *)ctx;

   if (memcmp(&sctx->stencil_ref.state, state, sizeof(*state)) == 0)
      return;

   sctx->stencil_ref.state = *state;
   si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
}

/*
 * DSA
 */

static uint32_t si_translate_stencil_op(int s_op)
{
   switch (s_op) {
   case PIPE_STENCIL_OP_KEEP:
      return V_02842C_STENCIL_KEEP;
   case PIPE_STENCIL_OP_ZERO:
      return V_02842C_STENCIL_ZERO;
   case PIPE_STENCIL_OP_REPLACE:
      return V_02842C_STENCIL_REPLACE_TEST;
   case PIPE_STENCIL_OP_INCR:
      return V_02842C_STENCIL_ADD_CLAMP;
   case PIPE_STENCIL_OP_DECR:
      return V_02842C_STENCIL_SUB_CLAMP;
   case PIPE_STENCIL_OP_INCR_WRAP:
      return V_02842C_STENCIL_ADD_WRAP;
   case PIPE_STENCIL_OP_DECR_WRAP:
      return V_02842C_STENCIL_SUB_WRAP;
   case PIPE_STENCIL_OP_INVERT:
      return V_02842C_STENCIL_INVERT;
   default:
      PRINT_ERR("Unknown stencil op %d", s_op);
      assert(0);
      break;
   }
   return 0;
}

static bool si_dsa_writes_stencil(const struct pipe_stencil_state *s)
{
   return s->enabled && s->writemask &&
          (s->fail_op != PIPE_STENCIL_OP_KEEP || s->zfail_op != PIPE_STENCIL_OP_KEEP ||
           s->zpass_op != PIPE_STENCIL_OP_KEEP);
}

static bool si_order_invariant_stencil_op(enum pipe_stencil_op op)
{
   /* REPLACE is normally order invariant, except when the stencil
    * reference value is written by the fragment shader. Tracking this
    * interaction does not seem worth the effort, so be conservative. */
   return op != PIPE_STENCIL_OP_INCR && op != PIPE_STENCIL_OP_DECR && op != PIPE_STENCIL_OP_REPLACE;
}
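/* For intuition: the clamped ops don't commute with each other. Starting
 * from 255, INCR then DECR yields 254, while DECR then INCR yields 255, so
 * the result depends on fragment order. The wrapping variants are plain
 * modular adds and do commute. */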

/* Compute whether, assuming Z writes are disabled, this stencil state is order
 * invariant in the sense that the set of passing fragments as well as the
 * final stencil buffer result does not depend on the order of fragments. */
static bool si_order_invariant_stencil_state(const struct pipe_stencil_state *state)
{
   return !state->enabled || !state->writemask ||
          /* The following assumes that Z writes are disabled. */
          (state->func == PIPE_FUNC_ALWAYS && si_order_invariant_stencil_op(state->zpass_op) &&
           si_order_invariant_stencil_op(state->zfail_op)) ||
          (state->func == PIPE_FUNC_NEVER && si_order_invariant_stencil_op(state->fail_op));
}

static void *si_create_dsa_state(struct pipe_context *ctx,
                                 const struct pipe_depth_stencil_alpha_state *state)
{
   struct si_context *sctx = (struct si_context *)ctx;
   struct si_state_dsa *dsa = CALLOC_STRUCT(si_state_dsa);
   struct si_pm4_state *pm4 = &dsa->pm4;
   unsigned db_depth_control;
   uint32_t db_stencil_control = 0;

   if (!dsa) {
      return NULL;
   }

   dsa->stencil_ref.valuemask[0] = state->stencil[0].valuemask;
   dsa->stencil_ref.valuemask[1] = state->stencil[1].valuemask;
   dsa->stencil_ref.writemask[0] = state->stencil[0].writemask;
   dsa->stencil_ref.writemask[1] = state->stencil[1].writemask;

   db_depth_control =
      S_028800_Z_ENABLE(state->depth.enabled) | S_028800_Z_WRITE_ENABLE(state->depth.writemask) |
      S_028800_ZFUNC(state->depth.func) | S_028800_DEPTH_BOUNDS_ENABLE(state->depth.bounds_test);

   /* stencil */
   if (state->stencil[0].enabled) {
      db_depth_control |= S_028800_STENCIL_ENABLE(1);
      db_depth_control |= S_028800_STENCILFUNC(state->stencil[0].func);
      db_stencil_control |=
         S_02842C_STENCILFAIL(si_translate_stencil_op(state->stencil[0].fail_op));
      db_stencil_control |=
         S_02842C_STENCILZPASS(si_translate_stencil_op(state->stencil[0].zpass_op));
      db_stencil_control |=
         S_02842C_STENCILZFAIL(si_translate_stencil_op(state->stencil[0].zfail_op));

      if (state->stencil[1].enabled) {
         db_depth_control |= S_028800_BACKFACE_ENABLE(1);
         db_depth_control |= S_028800_STENCILFUNC_BF(state->stencil[1].func);
         db_stencil_control |=
            S_02842C_STENCILFAIL_BF(si_translate_stencil_op(state->stencil[1].fail_op));
         db_stencil_control |=
            S_02842C_STENCILZPASS_BF(si_translate_stencil_op(state->stencil[1].zpass_op));
         db_stencil_control |=
            S_02842C_STENCILZFAIL_BF(si_translate_stencil_op(state->stencil[1].zfail_op));
      }
   }

   /* alpha */
   if (state->alpha.enabled) {
      dsa->alpha_func = state->alpha.func;

      si_pm4_set_reg(pm4, R_00B030_SPI_SHADER_USER_DATA_PS_0 + SI_SGPR_ALPHA_REF * 4,
                     fui(state->alpha.ref_value));
   } else {
      dsa->alpha_func = PIPE_FUNC_ALWAYS;
   }

   si_pm4_set_reg(pm4, R_028800_DB_DEPTH_CONTROL, db_depth_control);
   if (state->stencil[0].enabled)
      si_pm4_set_reg(pm4, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
   if (state->depth.bounds_test) {
      si_pm4_set_reg(pm4, R_028020_DB_DEPTH_BOUNDS_MIN, fui(state->depth.bounds_min));
      si_pm4_set_reg(pm4, R_028024_DB_DEPTH_BOUNDS_MAX, fui(state->depth.bounds_max));
   }

   dsa->depth_enabled = state->depth.enabled;
   dsa->depth_write_enabled = state->depth.enabled && state->depth.writemask;
   dsa->stencil_enabled = state->stencil[0].enabled;
   dsa->stencil_write_enabled =
      state->stencil[0].enabled &&
      (si_dsa_writes_stencil(&state->stencil[0]) || si_dsa_writes_stencil(&state->stencil[1]));
   dsa->db_can_write = dsa->depth_write_enabled || dsa->stencil_write_enabled;

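   /* These depth funcs are "ordered": with only Z writes enabled, the final
    * Z value is the min (LESS/LEQUAL) or max (GREATER/GEQUAL) of all
    * fragments, or unchanged (NEVER), so it does not depend on the order in
    * which fragments are shaded. */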
1193 bool zfunc_is_ordered =
1194 state->depth.func == PIPE_FUNC_NEVER || state->depth.func == PIPE_FUNC_LESS ||
1195 state->depth.func == PIPE_FUNC_LEQUAL || state->depth.func == PIPE_FUNC_GREATER ||
1196 state->depth.func == PIPE_FUNC_GEQUAL;
1197
1198 bool nozwrite_and_order_invariant_stencil =
1199 !dsa->db_can_write ||
1200 (!dsa->depth_write_enabled && si_order_invariant_stencil_state(&state->stencil[0]) &&
1201 si_order_invariant_stencil_state(&state->stencil[1]));
1202
1203 dsa->order_invariance[1].zs =
1204 nozwrite_and_order_invariant_stencil || (!dsa->stencil_write_enabled && zfunc_is_ordered);
1205 dsa->order_invariance[0].zs = !dsa->depth_write_enabled || zfunc_is_ordered;
1206
1207 dsa->order_invariance[1].pass_set =
1208 nozwrite_and_order_invariant_stencil ||
1209 (!dsa->stencil_write_enabled &&
1210 (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER));
1211 dsa->order_invariance[0].pass_set =
1212 !dsa->depth_write_enabled ||
1213 (state->depth.func == PIPE_FUNC_ALWAYS || state->depth.func == PIPE_FUNC_NEVER);
1214
1215 dsa->order_invariance[1].pass_last = sctx->screen->assume_no_z_fights &&
1216 !dsa->stencil_write_enabled && dsa->depth_write_enabled &&
1217 zfunc_is_ordered;
1218 dsa->order_invariance[0].pass_last =
1219 sctx->screen->assume_no_z_fights && dsa->depth_write_enabled && zfunc_is_ordered;
1220
1221 return dsa;
1222 }
1223
1224 static void si_bind_dsa_state(struct pipe_context *ctx, void *state)
1225 {
1226 struct si_context *sctx = (struct si_context *)ctx;
1227 struct si_state_dsa *old_dsa = sctx->queued.named.dsa;
1228 struct si_state_dsa *dsa = state;
1229
1230 if (!dsa)
1231 dsa = (struct si_state_dsa *)sctx->noop_dsa;
1232
1233 si_pm4_bind_state(sctx, dsa, dsa);
1234
1235 if (memcmp(&dsa->stencil_ref, &sctx->stencil_ref.dsa_part,
1236 sizeof(struct si_dsa_stencil_ref_part)) != 0) {
1237 sctx->stencil_ref.dsa_part = dsa->stencil_ref;
1238 si_mark_atom_dirty(sctx, &sctx->atoms.s.stencil_ref);
1239 }
1240
1241 if (old_dsa->alpha_func != dsa->alpha_func)
1242 sctx->do_update_shaders = true;
1243
1244 if (sctx->screen->dpbb_allowed && ((old_dsa->depth_enabled != dsa->depth_enabled ||
1245 old_dsa->stencil_enabled != dsa->stencil_enabled ||
1246 old_dsa->db_can_write != dsa->db_can_write)))
1247 si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
1248
1249 if (sctx->screen->has_out_of_order_rast &&
1250 (memcmp(old_dsa->order_invariance, dsa->order_invariance,
1251 sizeof(old_dsa->order_invariance))))
1252 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
1253 }
1254
1255 static void si_delete_dsa_state(struct pipe_context *ctx, void *state)
1256 {
1257 struct si_context *sctx = (struct si_context *)ctx;
1258
1259 if (sctx->queued.named.dsa == state)
1260 si_bind_dsa_state(ctx, sctx->noop_dsa);
1261
1262 si_pm4_delete_state(sctx, dsa, (struct si_state_dsa *)state);
1263 }
1264
1265 static void *si_create_db_flush_dsa(struct si_context *sctx)
1266 {
1267 struct pipe_depth_stencil_alpha_state dsa = {};
1268
1269 return sctx->b.create_depth_stencil_alpha_state(&sctx->b, &dsa);
1270 }
1271
1272 /* DB RENDER STATE */
1273
1274 static void si_set_active_query_state(struct pipe_context *ctx, bool enable)
1275 {
1276 struct si_context *sctx = (struct si_context *)ctx;
1277
1278 /* Pipeline stat & streamout queries. */
1279 if (enable) {
1280 sctx->flags &= ~SI_CONTEXT_STOP_PIPELINE_STATS;
1281 sctx->flags |= SI_CONTEXT_START_PIPELINE_STATS;
1282 } else {
1283 sctx->flags &= ~SI_CONTEXT_START_PIPELINE_STATS;
1284 sctx->flags |= SI_CONTEXT_STOP_PIPELINE_STATS;
1285 }
1286
1287 /* Occlusion queries. */
1288 if (sctx->occlusion_queries_disabled != !enable) {
1289 sctx->occlusion_queries_disabled = !enable;
1290 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1291 }
1292 }
1293
1294 void si_set_occlusion_query_state(struct si_context *sctx, bool old_perfect_enable)
1295 {
1296 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1297
1298 bool perfect_enable = sctx->num_perfect_occlusion_queries != 0;
1299
1300 if (perfect_enable != old_perfect_enable)
1301 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
1302 }
1303
1304 void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
1305 {
1306 st->saved_compute = sctx->cs_shader_state.program;
1307
1308 si_get_pipe_constant_buffer(sctx, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
1309 si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo);
1310
1311 st->saved_ssbo_writable_mask = 0;
1312
1313 for (unsigned i = 0; i < 3; i++) {
1314 if (sctx->const_and_shader_buffers[PIPE_SHADER_COMPUTE].writable_mask &
1315 (1u << si_get_shaderbuf_slot(i)))
1316 st->saved_ssbo_writable_mask |= 1 << i;
1317 }
1318 }
1319
1320 void si_restore_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
1321 {
1322 sctx->b.bind_compute_state(&sctx->b, st->saved_compute);
1323
1324 sctx->b.set_constant_buffer(&sctx->b, PIPE_SHADER_COMPUTE, 0, &st->saved_const0);
1325 pipe_resource_reference(&st->saved_const0.buffer, NULL);
1326
1327 sctx->b.set_shader_buffers(&sctx->b, PIPE_SHADER_COMPUTE, 0, 3, st->saved_ssbo,
1328 st->saved_ssbo_writable_mask);
1329 for (unsigned i = 0; i < 3; ++i)
1330 pipe_resource_reference(&st->saved_ssbo[i].buffer, NULL);
1331 }
1332
1333 static void si_emit_db_render_state(struct si_context *sctx)
1334 {
1335 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
1336 unsigned db_shader_control, db_render_control, db_count_control;
1337 unsigned initial_cdw = sctx->gfx_cs->current.cdw;
1338
1339 /* DB_RENDER_CONTROL */
1340 if (sctx->dbcb_depth_copy_enabled || sctx->dbcb_stencil_copy_enabled) {
1341 db_render_control = S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
1342 S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
1343 S_028000_COPY_CENTROID(1) | S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
1344 } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
1345 db_render_control = S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
1346 S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
1347 } else {
1348 db_render_control = S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
1349 S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
1350 }
1351
1352 /* DB_COUNT_CONTROL (occlusion queries) */
1353 if (sctx->num_occlusion_queries > 0 && !sctx->occlusion_queries_disabled) {
1354 bool perfect = sctx->num_perfect_occlusion_queries > 0;
1355 bool gfx10_perfect = sctx->chip_class >= GFX10 && perfect;
1356
1357 if (sctx->chip_class >= GFX7) {
1358 unsigned log_sample_rate = sctx->framebuffer.log_samples;
1359
1360 /* Stoney doesn't increment occlusion query counters
1361 * if the sample rate is 16x. Use 8x sample rate instead.
1362 */
1363 if (sctx->family == CHIP_STONEY)
1364 log_sample_rate = MIN2(log_sample_rate, 3);
1365
1366 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
1367 S_028004_DISABLE_CONSERVATIVE_ZPASS_COUNTS(gfx10_perfect) |
1368 S_028004_SAMPLE_RATE(log_sample_rate) | S_028004_ZPASS_ENABLE(1) |
1369 S_028004_SLICE_EVEN_ENABLE(1) | S_028004_SLICE_ODD_ENABLE(1);
1370 } else {
1371 db_count_control = S_028004_PERFECT_ZPASS_COUNTS(perfect) |
1372 S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
1373 }
1374 } else {
1375 /* Disable occlusion queries. */
1376 if (sctx->chip_class >= GFX7) {
1377 db_count_control = 0;
1378 } else {
1379 db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
1380 }
1381 }
1382
1383 radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL, SI_TRACKED_DB_RENDER_CONTROL,
1384 db_render_control, db_count_control);
1385
1386 /* DB_RENDER_OVERRIDE2 */
1387 radeon_opt_set_context_reg(
1388 sctx, R_028010_DB_RENDER_OVERRIDE2, SI_TRACKED_DB_RENDER_OVERRIDE2,
1389 S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
1390 S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
1391 S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
1392
1393 db_shader_control = sctx->ps_db_shader_control;
1394
1395 /* Bug workaround for smoothing (overrasterization) on GFX6. */
1396 if (sctx->chip_class == GFX6 && sctx->smoothing_enabled) {
1397 db_shader_control &= C_02880C_Z_ORDER;
1398 db_shader_control |= S_02880C_Z_ORDER(V_02880C_LATE_Z);
1399 }
1400
1401 /* Disable the gl_SampleMask fragment shader output if MSAA is disabled. */
1402 if (!rs->multisample_enable)
1403 db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
1404
1405 if (sctx->screen->info.has_rbplus && !sctx->screen->info.rbplus_allowed)
1406 db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
1407
1408 radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL, SI_TRACKED_DB_SHADER_CONTROL,
1409 db_shader_control);
1410
1411 if (initial_cdw != sctx->gfx_cs->current.cdw)
1412 sctx->context_roll = true;
1413 }
1414
1415 /*
1416 * format translation
1417 */
1418 static uint32_t si_translate_colorformat(enum pipe_format format)
1419 {
1420 const struct util_format_description *desc = util_format_description(format);
1421 if (!desc)
1422 return V_028C70_COLOR_INVALID;
1423
1424 #define HAS_SIZE(x, y, z, w) \
1425 (desc->channel[0].size == (x) && desc->channel[1].size == (y) && \
1426 desc->channel[2].size == (z) && desc->channel[3].size == (w))
1427
1428 if (format == PIPE_FORMAT_R11G11B10_FLOAT) /* isn't plain */
1429 return V_028C70_COLOR_10_11_11;
1430
1431 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
1432 return V_028C70_COLOR_INVALID;
1433
1434 /* hw cannot support mixed formats (except depth/stencil, since
1435 * stencil is not written to). */
1436 if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
1437 return V_028C70_COLOR_INVALID;
1438
1439 switch (desc->nr_channels) {
1440 case 1:
1441 switch (desc->channel[0].size) {
1442 case 8:
1443 return V_028C70_COLOR_8;
1444 case 16:
1445 return V_028C70_COLOR_16;
1446 case 32:
1447 return V_028C70_COLOR_32;
1448 }
1449 break;
1450 case 2:
1451 if (desc->channel[0].size == desc->channel[1].size) {
1452 switch (desc->channel[0].size) {
1453 case 8:
1454 return V_028C70_COLOR_8_8;
1455 case 16:
1456 return V_028C70_COLOR_16_16;
1457 case 32:
1458 return V_028C70_COLOR_32_32;
1459 }
1460 } else if (HAS_SIZE(8, 24, 0, 0)) {
1461 return V_028C70_COLOR_24_8;
1462 } else if (HAS_SIZE(24, 8, 0, 0)) {
1463 return V_028C70_COLOR_8_24;
1464 }
1465 break;
1466 case 3:
1467 if (HAS_SIZE(5, 6, 5, 0)) {
1468 return V_028C70_COLOR_5_6_5;
1469 } else if (HAS_SIZE(32, 8, 24, 0)) {
1470 return V_028C70_COLOR_X24_8_32_FLOAT;
1471 }
1472 break;
1473 case 4:
1474 if (desc->channel[0].size == desc->channel[1].size &&
1475 desc->channel[0].size == desc->channel[2].size &&
1476 desc->channel[0].size == desc->channel[3].size) {
1477 switch (desc->channel[0].size) {
1478 case 4:
1479 return V_028C70_COLOR_4_4_4_4;
1480 case 8:
1481 return V_028C70_COLOR_8_8_8_8;
1482 case 16:
1483 return V_028C70_COLOR_16_16_16_16;
1484 case 32:
1485 return V_028C70_COLOR_32_32_32_32;
1486 }
1487 } else if (HAS_SIZE(5, 5, 5, 1)) {
1488 return V_028C70_COLOR_1_5_5_5;
1489 } else if (HAS_SIZE(1, 5, 5, 5)) {
1490 return V_028C70_COLOR_5_5_5_1;
1491 } else if (HAS_SIZE(10, 10, 10, 2)) {
1492 return V_028C70_COLOR_2_10_10_10;
1493 }
1494 break;
1495 }
1496 return V_028C70_COLOR_INVALID;
1497 }
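
/* Compiled-out examples (not driver code) of how common gallium formats fall
 * through the rules above: R11G11B10_FLOAT is matched before the plain-layout
 * check, B8G8R8A8_UNORM hits the uniform 4x8-bit case (channel order is
 * handled by COMP_SWAP, not here), and 32_32_32 has no CB hw format at all.
 */
#if 0
static void example_colorformat(void)
{
   assert(si_translate_colorformat(PIPE_FORMAT_R11G11B10_FLOAT) ==
          V_028C70_COLOR_10_11_11);
   assert(si_translate_colorformat(PIPE_FORMAT_B8G8R8A8_UNORM) ==
          V_028C70_COLOR_8_8_8_8);
   assert(si_translate_colorformat(PIPE_FORMAT_R32G32B32_FLOAT) ==
          V_028C70_COLOR_INVALID);
}
#endif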
1498
1499 static uint32_t si_colorformat_endian_swap(uint32_t colorformat)
1500 {
1501 if (SI_BIG_ENDIAN) {
1502 switch (colorformat) {
1503 /* 8-bit buffers. */
1504 case V_028C70_COLOR_8:
1505 return V_028C70_ENDIAN_NONE;
1506
1507 /* 16-bit buffers. */
1508 case V_028C70_COLOR_5_6_5:
1509 case V_028C70_COLOR_1_5_5_5:
1510 case V_028C70_COLOR_4_4_4_4:
1511 case V_028C70_COLOR_16:
1512 case V_028C70_COLOR_8_8:
1513 return V_028C70_ENDIAN_8IN16;
1514
1515 /* 32-bit buffers. */
1516 case V_028C70_COLOR_8_8_8_8:
1517 case V_028C70_COLOR_2_10_10_10:
1518 case V_028C70_COLOR_8_24:
1519 case V_028C70_COLOR_24_8:
1520 case V_028C70_COLOR_16_16:
1521 return V_028C70_ENDIAN_8IN32;
1522
1523 /* 64-bit buffers. */
1524 case V_028C70_COLOR_16_16_16_16:
1525 return V_028C70_ENDIAN_8IN16;
1526
1527 case V_028C70_COLOR_32_32:
1528 return V_028C70_ENDIAN_8IN32;
1529
1530 /* 128-bit buffers. */
1531 case V_028C70_COLOR_32_32_32_32:
1532 return V_028C70_ENDIAN_8IN32;
1533 default:
1534 return V_028C70_ENDIAN_NONE; /* Unsupported. */
1535 }
1536 } else {
1537 return V_028C70_ENDIAN_NONE;
1538 }
1539 }
1540
1541 static uint32_t si_translate_dbformat(enum pipe_format format)
1542 {
1543 switch (format) {
1544 case PIPE_FORMAT_Z16_UNORM:
1545 return V_028040_Z_16;
1546 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
1547 case PIPE_FORMAT_X8Z24_UNORM:
1548 case PIPE_FORMAT_Z24X8_UNORM:
1549 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1550 return V_028040_Z_24; /* deprecated on AMD GCN */
1551 case PIPE_FORMAT_Z32_FLOAT:
1552 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1553 return V_028040_Z_32_FLOAT;
1554 default:
1555 return V_028040_Z_INVALID;
1556 }
1557 }
1558
1559 /*
1560 * Texture translation
1561 */
1562
1563 static uint32_t si_translate_texformat(struct pipe_screen *screen, enum pipe_format format,
1564 const struct util_format_description *desc,
1565 int first_non_void)
1566 {
1567 struct si_screen *sscreen = (struct si_screen *)screen;
1568 bool uniform = true;
1569 int i;
1570
1571 assert(sscreen->info.chip_class <= GFX9);
1572
1573 /* Colorspace (return non-RGB formats directly). */
1574 switch (desc->colorspace) {
1575 /* Depth stencil formats */
1576 case UTIL_FORMAT_COLORSPACE_ZS:
1577 switch (format) {
1578 case PIPE_FORMAT_Z16_UNORM:
1579 return V_008F14_IMG_DATA_FORMAT_16;
1580 case PIPE_FORMAT_X24S8_UINT:
1581 case PIPE_FORMAT_S8X24_UINT:
1582 /*
1583 * Implemented as an 8_8_8_8 data format to fix texture
1584 * gathers in stencil sampling. This affects at least
1585 * GL45-CTS.texture_cube_map_array.sampling on GFX8.
1586 */
1587 if (sscreen->info.chip_class <= GFX8)
1588 return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
1589
1590 if (format == PIPE_FORMAT_X24S8_UINT)
1591 return V_008F14_IMG_DATA_FORMAT_8_24;
1592 else
1593 return V_008F14_IMG_DATA_FORMAT_24_8;
1594 case PIPE_FORMAT_Z24X8_UNORM:
1595 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1596 return V_008F14_IMG_DATA_FORMAT_8_24;
1597 case PIPE_FORMAT_X8Z24_UNORM:
1598 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
1599 return V_008F14_IMG_DATA_FORMAT_24_8;
1600 case PIPE_FORMAT_S8_UINT:
1601 return V_008F14_IMG_DATA_FORMAT_8;
1602 case PIPE_FORMAT_Z32_FLOAT:
1603 return V_008F14_IMG_DATA_FORMAT_32;
1604 case PIPE_FORMAT_X32_S8X24_UINT:
1605 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1606 return V_008F14_IMG_DATA_FORMAT_X24_8_32;
1607 default:
1608 goto out_unknown;
1609 }
1610
1611 case UTIL_FORMAT_COLORSPACE_YUV:
1612 goto out_unknown; /* TODO */
1613
1614 case UTIL_FORMAT_COLORSPACE_SRGB:
1615 if (desc->nr_channels != 4 && desc->nr_channels != 1)
1616 goto out_unknown;
1617 break;
1618
1619 default:
1620 break;
1621 }
1622
1623 if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
1624 if (!sscreen->info.has_format_bc1_through_bc7)
1625 goto out_unknown;
1626
1627 switch (format) {
1628 case PIPE_FORMAT_RGTC1_SNORM:
1629 case PIPE_FORMAT_LATC1_SNORM:
1630 case PIPE_FORMAT_RGTC1_UNORM:
1631 case PIPE_FORMAT_LATC1_UNORM:
1632 return V_008F14_IMG_DATA_FORMAT_BC4;
1633 case PIPE_FORMAT_RGTC2_SNORM:
1634 case PIPE_FORMAT_LATC2_SNORM:
1635 case PIPE_FORMAT_RGTC2_UNORM:
1636 case PIPE_FORMAT_LATC2_UNORM:
1637 return V_008F14_IMG_DATA_FORMAT_BC5;
1638 default:
1639 goto out_unknown;
1640 }
1641 }
1642
1643 if (desc->layout == UTIL_FORMAT_LAYOUT_ETC &&
1644 (sscreen->info.family == CHIP_STONEY || sscreen->info.family == CHIP_VEGA10 ||
1645 sscreen->info.family == CHIP_RAVEN)) {
1646 switch (format) {
1647 case PIPE_FORMAT_ETC1_RGB8:
1648 case PIPE_FORMAT_ETC2_RGB8:
1649 case PIPE_FORMAT_ETC2_SRGB8:
1650 return V_008F14_IMG_DATA_FORMAT_ETC2_RGB;
1651 case PIPE_FORMAT_ETC2_RGB8A1:
1652 case PIPE_FORMAT_ETC2_SRGB8A1:
1653 return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA1;
1654 case PIPE_FORMAT_ETC2_RGBA8:
1655 case PIPE_FORMAT_ETC2_SRGBA8:
1656 return V_008F14_IMG_DATA_FORMAT_ETC2_RGBA;
1657 case PIPE_FORMAT_ETC2_R11_UNORM:
1658 case PIPE_FORMAT_ETC2_R11_SNORM:
1659 return V_008F14_IMG_DATA_FORMAT_ETC2_R;
1660 case PIPE_FORMAT_ETC2_RG11_UNORM:
1661 case PIPE_FORMAT_ETC2_RG11_SNORM:
1662 return V_008F14_IMG_DATA_FORMAT_ETC2_RG;
1663 default:
1664 goto out_unknown;
1665 }
1666 }
1667
1668 if (desc->layout == UTIL_FORMAT_LAYOUT_BPTC) {
1669 if (!sscreen->info.has_format_bc1_through_bc7)
1670 goto out_unknown;
1671
1672 switch (format) {
1673 case PIPE_FORMAT_BPTC_RGBA_UNORM:
1674 case PIPE_FORMAT_BPTC_SRGBA:
1675 return V_008F14_IMG_DATA_FORMAT_BC7;
1676 case PIPE_FORMAT_BPTC_RGB_FLOAT:
1677 case PIPE_FORMAT_BPTC_RGB_UFLOAT:
1678 return V_008F14_IMG_DATA_FORMAT_BC6;
1679 default:
1680 goto out_unknown;
1681 }
1682 }
1683
1684 if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
1685 switch (format) {
1686 case PIPE_FORMAT_R8G8_B8G8_UNORM:
1687 case PIPE_FORMAT_G8R8_B8R8_UNORM:
1688 return V_008F14_IMG_DATA_FORMAT_GB_GR;
1689 case PIPE_FORMAT_G8R8_G8B8_UNORM:
1690 case PIPE_FORMAT_R8G8_R8B8_UNORM:
1691 return V_008F14_IMG_DATA_FORMAT_BG_RG;
1692 default:
1693 goto out_unknown;
1694 }
1695 }
1696
1697 if (desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
1698 if (!sscreen->info.has_format_bc1_through_bc7)
1699 goto out_unknown;
1700
1701 switch (format) {
1702 case PIPE_FORMAT_DXT1_RGB:
1703 case PIPE_FORMAT_DXT1_RGBA:
1704 case PIPE_FORMAT_DXT1_SRGB:
1705 case PIPE_FORMAT_DXT1_SRGBA:
1706 return V_008F14_IMG_DATA_FORMAT_BC1;
1707 case PIPE_FORMAT_DXT3_RGBA:
1708 case PIPE_FORMAT_DXT3_SRGBA:
1709 return V_008F14_IMG_DATA_FORMAT_BC2;
1710 case PIPE_FORMAT_DXT5_RGBA:
1711 case PIPE_FORMAT_DXT5_SRGBA:
1712 return V_008F14_IMG_DATA_FORMAT_BC3;
1713 default:
1714 goto out_unknown;
1715 }
1716 }
1717
1718 if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
1719 return V_008F14_IMG_DATA_FORMAT_5_9_9_9;
1720 } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
1721 return V_008F14_IMG_DATA_FORMAT_10_11_11;
1722 }
1723
1724 /* R8G8Bx_SNORM - TODO CxV8U8 */
1725
1726 /* hw cannot support mixed formats (except depth/stencil, since only
1727 * depth is read). */
1728 if (desc->is_mixed && desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
1729 goto out_unknown;
1730
1731 /* See whether the components are of the same size. */
1732 for (i = 1; i < desc->nr_channels; i++) {
1733 uniform = uniform && desc->channel[0].size == desc->channel[i].size;
1734 }
1735
1736 /* Non-uniform formats. */
1737 if (!uniform) {
1738 switch (desc->nr_channels) {
1739 case 3:
1740 if (desc->channel[0].size == 5 && desc->channel[1].size == 6 &&
1741 desc->channel[2].size == 5) {
1742 return V_008F14_IMG_DATA_FORMAT_5_6_5;
1743 }
1744 goto out_unknown;
1745 case 4:
1746 if (desc->channel[0].size == 5 && desc->channel[1].size == 5 &&
1747 desc->channel[2].size == 5 && desc->channel[3].size == 1) {
1748 return V_008F14_IMG_DATA_FORMAT_1_5_5_5;
1749 }
1750 if (desc->channel[0].size == 1 && desc->channel[1].size == 5 &&
1751 desc->channel[2].size == 5 && desc->channel[3].size == 5) {
1752 return V_008F14_IMG_DATA_FORMAT_5_5_5_1;
1753 }
1754 if (desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
1755 desc->channel[2].size == 10 && desc->channel[3].size == 2) {
1756 return V_008F14_IMG_DATA_FORMAT_2_10_10_10;
1757 }
1758 goto out_unknown;
1759 }
1760 goto out_unknown;
1761 }
1762
1763 if (first_non_void < 0 || first_non_void > 3)
1764 goto out_unknown;
1765
1766 /* uniform formats */
1767 switch (desc->channel[first_non_void].size) {
1768 case 4:
1769 switch (desc->nr_channels) {
1770 #if 0 /* Not supported for render targets */
1771 case 2:
1772 return V_008F14_IMG_DATA_FORMAT_4_4;
1773 #endif
1774 case 4:
1775 return V_008F14_IMG_DATA_FORMAT_4_4_4_4;
1776 }
1777 break;
1778 case 8:
1779 switch (desc->nr_channels) {
1780 case 1:
1781 return V_008F14_IMG_DATA_FORMAT_8;
1782 case 2:
1783 return V_008F14_IMG_DATA_FORMAT_8_8;
1784 case 4:
1785 return V_008F14_IMG_DATA_FORMAT_8_8_8_8;
1786 }
1787 break;
1788 case 16:
1789 switch (desc->nr_channels) {
1790 case 1:
1791 return V_008F14_IMG_DATA_FORMAT_16;
1792 case 2:
1793 return V_008F14_IMG_DATA_FORMAT_16_16;
1794 case 4:
1795 return V_008F14_IMG_DATA_FORMAT_16_16_16_16;
1796 }
1797 break;
1798 case 32:
1799 switch (desc->nr_channels) {
1800 case 1:
1801 return V_008F14_IMG_DATA_FORMAT_32;
1802 case 2:
1803 return V_008F14_IMG_DATA_FORMAT_32_32;
1804 #if 0 /* Not supported for render targets */
1805 case 3:
1806 return V_008F14_IMG_DATA_FORMAT_32_32_32;
1807 #endif
1808 case 4:
1809 return V_008F14_IMG_DATA_FORMAT_32_32_32_32;
1810 }
1811 }
1812
1813 out_unknown:
1814 return ~0;
1815 }
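
/* Compiled-out sketch (pre-GFX10 only, since the function asserts
 * chip_class <= GFX9): stencil-only formats are still reported as supported
 * because of the 8_8_8_8 / 8_24 mappings above, and unsupported formats
 * return the ~0 sentinel that si_is_sampler_format_supported() tests for.
 */
#if 0
static void example_texformat(struct pipe_screen *screen)
{
   enum pipe_format format = PIPE_FORMAT_X24S8_UINT;
   const struct util_format_description *desc = util_format_description(format);
   int first_non_void = util_format_get_first_non_void_channel(format);

   assert(si_translate_texformat(screen, format, desc, first_non_void) != ~0U);
}
#endif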
1816
1817 static unsigned si_tex_wrap(unsigned wrap)
1818 {
1819 switch (wrap) {
1820 default:
1821 case PIPE_TEX_WRAP_REPEAT:
1822 return V_008F30_SQ_TEX_WRAP;
1823 case PIPE_TEX_WRAP_CLAMP:
1824 return V_008F30_SQ_TEX_CLAMP_HALF_BORDER;
1825 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
1826 return V_008F30_SQ_TEX_CLAMP_LAST_TEXEL;
1827 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
1828 return V_008F30_SQ_TEX_CLAMP_BORDER;
1829 case PIPE_TEX_WRAP_MIRROR_REPEAT:
1830 return V_008F30_SQ_TEX_MIRROR;
1831 case PIPE_TEX_WRAP_MIRROR_CLAMP:
1832 return V_008F30_SQ_TEX_MIRROR_ONCE_HALF_BORDER;
1833 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
1834 return V_008F30_SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
1835 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
1836 return V_008F30_SQ_TEX_MIRROR_ONCE_BORDER;
1837 }
1838 }
1839
1840 static unsigned si_tex_mipfilter(unsigned filter)
1841 {
1842 switch (filter) {
1843 case PIPE_TEX_MIPFILTER_NEAREST:
1844 return V_008F38_SQ_TEX_Z_FILTER_POINT;
1845 case PIPE_TEX_MIPFILTER_LINEAR:
1846 return V_008F38_SQ_TEX_Z_FILTER_LINEAR;
1847 default:
1848 case PIPE_TEX_MIPFILTER_NONE:
1849 return V_008F38_SQ_TEX_Z_FILTER_NONE;
1850 }
1851 }
1852
1853 static unsigned si_tex_compare(unsigned compare)
1854 {
1855 switch (compare) {
1856 default:
1857 case PIPE_FUNC_NEVER:
1858 return V_008F30_SQ_TEX_DEPTH_COMPARE_NEVER;
1859 case PIPE_FUNC_LESS:
1860 return V_008F30_SQ_TEX_DEPTH_COMPARE_LESS;
1861 case PIPE_FUNC_EQUAL:
1862 return V_008F30_SQ_TEX_DEPTH_COMPARE_EQUAL;
1863 case PIPE_FUNC_LEQUAL:
1864 return V_008F30_SQ_TEX_DEPTH_COMPARE_LESSEQUAL;
1865 case PIPE_FUNC_GREATER:
1866 return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATER;
1867 case PIPE_FUNC_NOTEQUAL:
1868 return V_008F30_SQ_TEX_DEPTH_COMPARE_NOTEQUAL;
1869 case PIPE_FUNC_GEQUAL:
1870 return V_008F30_SQ_TEX_DEPTH_COMPARE_GREATEREQUAL;
1871 case PIPE_FUNC_ALWAYS:
1872 return V_008F30_SQ_TEX_DEPTH_COMPARE_ALWAYS;
1873 }
1874 }
1875
1876 static unsigned si_tex_dim(struct si_screen *sscreen, struct si_texture *tex, unsigned view_target,
1877 unsigned nr_samples)
1878 {
1879 unsigned res_target = tex->buffer.b.b.target;
1880
1881 if (view_target == PIPE_TEXTURE_CUBE || view_target == PIPE_TEXTURE_CUBE_ARRAY)
1882 res_target = view_target;
1883 /* If interpreting cubemaps as something else, set 2D_ARRAY. */
1884 else if (res_target == PIPE_TEXTURE_CUBE || res_target == PIPE_TEXTURE_CUBE_ARRAY)
1885 res_target = PIPE_TEXTURE_2D_ARRAY;
1886
1887 /* GFX9 allocates 1D textures as 2D. */
1888 if ((res_target == PIPE_TEXTURE_1D || res_target == PIPE_TEXTURE_1D_ARRAY) &&
1889 sscreen->info.chip_class == GFX9 &&
1890 tex->surface.u.gfx9.resource_type == RADEON_RESOURCE_2D) {
1891 if (res_target == PIPE_TEXTURE_1D)
1892 res_target = PIPE_TEXTURE_2D;
1893 else
1894 res_target = PIPE_TEXTURE_2D_ARRAY;
1895 }
1896
1897 switch (res_target) {
1898 default:
1899 case PIPE_TEXTURE_1D:
1900 return V_008F1C_SQ_RSRC_IMG_1D;
1901 case PIPE_TEXTURE_1D_ARRAY:
1902 return V_008F1C_SQ_RSRC_IMG_1D_ARRAY;
1903 case PIPE_TEXTURE_2D:
1904 case PIPE_TEXTURE_RECT:
1905 return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA : V_008F1C_SQ_RSRC_IMG_2D;
1906 case PIPE_TEXTURE_2D_ARRAY:
1907 return nr_samples > 1 ? V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY : V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
1908 case PIPE_TEXTURE_3D:
1909 return V_008F1C_SQ_RSRC_IMG_3D;
1910 case PIPE_TEXTURE_CUBE:
1911 case PIPE_TEXTURE_CUBE_ARRAY:
1912 return V_008F1C_SQ_RSRC_IMG_CUBE;
1913 }
1914 }
1915
1916 /*
1917 * Format support testing
1918 */
1919
1920 static bool si_is_sampler_format_supported(struct pipe_screen *screen, enum pipe_format format)
1921 {
1922 struct si_screen *sscreen = (struct si_screen *)screen;
1923
1924 if (sscreen->info.chip_class >= GFX10) {
1925 const struct gfx10_format *fmt = &gfx10_format_table[format];
1926 if (!fmt->img_format || fmt->buffers_only)
1927 return false;
1928 return true;
1929 }
1930
1931 const struct util_format_description *desc = util_format_description(format);
1932 if (!desc)
1933 return false;
1934
1935 return si_translate_texformat(screen, format, desc,
1936 util_format_get_first_non_void_channel(format)) != ~0U;
1937 }
1938
1939 static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen,
1940 const struct util_format_description *desc,
1941 int first_non_void)
1942 {
1943 int i;
1944
1945 assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
1946
1947 if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
1948 return V_008F0C_BUF_DATA_FORMAT_10_11_11;
1949
1950 assert(first_non_void >= 0);
1951
1952 if (desc->nr_channels == 4 && desc->channel[0].size == 10 && desc->channel[1].size == 10 &&
1953 desc->channel[2].size == 10 && desc->channel[3].size == 2)
1954 return V_008F0C_BUF_DATA_FORMAT_2_10_10_10;
1955
1956 /* See whether the components are of the same size. */
1957 for (i = 0; i < desc->nr_channels; i++) {
1958 if (desc->channel[first_non_void].size != desc->channel[i].size)
1959 return V_008F0C_BUF_DATA_FORMAT_INVALID;
1960 }
1961
1962 switch (desc->channel[first_non_void].size) {
1963 case 8:
1964 switch (desc->nr_channels) {
1965 case 1:
1966 case 3: /* 3 loads */
1967 return V_008F0C_BUF_DATA_FORMAT_8;
1968 case 2:
1969 return V_008F0C_BUF_DATA_FORMAT_8_8;
1970 case 4:
1971 return V_008F0C_BUF_DATA_FORMAT_8_8_8_8;
1972 }
1973 break;
1974 case 16:
1975 switch (desc->nr_channels) {
1976 case 1:
1977 case 3: /* 3 loads */
1978 return V_008F0C_BUF_DATA_FORMAT_16;
1979 case 2:
1980 return V_008F0C_BUF_DATA_FORMAT_16_16;
1981 case 4:
1982 return V_008F0C_BUF_DATA_FORMAT_16_16_16_16;
1983 }
1984 break;
1985 case 32:
1986 switch (desc->nr_channels) {
1987 case 1:
1988 return V_008F0C_BUF_DATA_FORMAT_32;
1989 case 2:
1990 return V_008F0C_BUF_DATA_FORMAT_32_32;
1991 case 3:
1992 return V_008F0C_BUF_DATA_FORMAT_32_32_32;
1993 case 4:
1994 return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
1995 }
1996 break;
1997 case 64:
1998 /* Legacy double formats. */
1999 switch (desc->nr_channels) {
2000 case 1: /* 1 load */
2001 return V_008F0C_BUF_DATA_FORMAT_32_32;
2002 case 2: /* 1 load */
2003 return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
2004 case 3: /* 3 loads */
2005 return V_008F0C_BUF_DATA_FORMAT_32_32;
2006 case 4: /* 2 loads */
2007 return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
2008 }
2009 break;
2010 }
2011
2012 return V_008F0C_BUF_DATA_FORMAT_INVALID;
2013 }
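
/* Compiled-out sketch of the legacy double handling above (pre-GFX10 path):
 * there is no native 64-bit fetch, so double formats are expressed as 32_32
 * pairs and the shader recombines the halves; e.g. a dvec4 attribute is
 * fetched as two 32_32_32_32 loads, a dvec3 as three 32_32 loads.
 */
#if 0
static void example_double_fetch(struct pipe_screen *screen)
{
   const struct util_format_description *desc =
      util_format_description(PIPE_FORMAT_R64G64B64A64_FLOAT);

   assert(si_translate_buffer_dataformat(screen, desc, 0) ==
          V_008F0C_BUF_DATA_FORMAT_32_32_32_32);
}
#endif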
2014
2015 static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen,
2016 const struct util_format_description *desc,
2017 int first_non_void)
2018 {
2019 assert(((struct si_screen *)screen)->info.chip_class <= GFX9);
2020
2021 if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
2022 return V_008F0C_BUF_NUM_FORMAT_FLOAT;
2023
2024 assert(first_non_void >= 0);
2025
2026 switch (desc->channel[first_non_void].type) {
2027 case UTIL_FORMAT_TYPE_SIGNED:
2028 case UTIL_FORMAT_TYPE_FIXED:
2029 if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
2030 return V_008F0C_BUF_NUM_FORMAT_SINT;
2031 else if (desc->channel[first_non_void].normalized)
2032 return V_008F0C_BUF_NUM_FORMAT_SNORM;
2033 else
2034 return V_008F0C_BUF_NUM_FORMAT_SSCALED;
2035 break;
2036 case UTIL_FORMAT_TYPE_UNSIGNED:
2037 if (desc->channel[first_non_void].size >= 32 || desc->channel[first_non_void].pure_integer)
2038 return V_008F0C_BUF_NUM_FORMAT_UINT;
2039 else if (desc->channel[first_non_void].normalized)
2040 return V_008F0C_BUF_NUM_FORMAT_UNORM;
2041 else
2042 return V_008F0C_BUF_NUM_FORMAT_USCALED;
2043 break;
2044 case UTIL_FORMAT_TYPE_FLOAT:
2045 default:
2046 return V_008F0C_BUF_NUM_FORMAT_FLOAT;
2047 }
2048 }
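
/* Compiled-out examples of the number-format selection above: channels that
 * are 32-bit or pure-integer fetch as SINT/UINT, normalized ones as
 * SNORM/UNORM, remaining signed/unsigned ones as SSCALED/USCALED, and floats
 * (including 10_11_11) as FLOAT.
 */
#if 0
static void example_buffer_numformat(struct pipe_screen *screen)
{
   assert(si_translate_buffer_numformat(screen,
             util_format_description(PIPE_FORMAT_R16G16_SNORM), 0) ==
          V_008F0C_BUF_NUM_FORMAT_SNORM);
   assert(si_translate_buffer_numformat(screen,
             util_format_description(PIPE_FORMAT_R16G16_SSCALED), 0) ==
          V_008F0C_BUF_NUM_FORMAT_SSCALED);
   assert(si_translate_buffer_numformat(screen,
             util_format_description(PIPE_FORMAT_R32_SINT), 0) ==
          V_008F0C_BUF_NUM_FORMAT_SINT);
}
#endif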
2049
2050 static unsigned si_is_vertex_format_supported(struct pipe_screen *screen, enum pipe_format format,
2051 unsigned usage)
2052 {
2053 struct si_screen *sscreen = (struct si_screen *)screen;
2054 const struct util_format_description *desc;
2055 int first_non_void;
2056 unsigned data_format;
2057
2058 assert((usage & ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_VERTEX_BUFFER)) ==
2059 0);
2060
2061 desc = util_format_description(format);
2062 if (!desc)
2063 return 0;
2064
2065 /* There are no native 8_8_8 or 16_16_16 data formats, and we currently
2066 * select 8_8_8_8 and 16_16_16_16 instead. This works reasonably well
2067 * for read-only access (with caveats surrounding bounds checks), but
2068 * obviously fails for write access which we have to implement for
2069 * shader images. Luckily, OpenGL doesn't expect this to be supported
2070 * anyway, and so the only impact is on PBO uploads / downloads, which
2071 * shouldn't be expected to be fast for GL_RGB anyway.
2072 */
2073 if (desc->block.bits == 3 * 8 || desc->block.bits == 3 * 16) {
2074 if (usage & (PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW)) {
2075 usage &= ~(PIPE_BIND_SHADER_IMAGE | PIPE_BIND_SAMPLER_VIEW);
2076 if (!usage)
2077 return 0;
2078 }
2079 }
2080
2081 if (sscreen->info.chip_class >= GFX10) {
2082 const struct gfx10_format *fmt = &gfx10_format_table[format];
2083 if (!fmt->img_format || fmt->img_format >= 128)
2084 return 0;
2085 return usage;
2086 }
2087
2088 first_non_void = util_format_get_first_non_void_channel(format);
2089 data_format = si_translate_buffer_dataformat(screen, desc, first_non_void);
2090 if (data_format == V_008F0C_BUF_DATA_FORMAT_INVALID)
2091 return 0;
2092
2093 return usage;
2094 }
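
/* Compiled-out sketch of the 3x8-bit special case above: RGB8 stays usable as
 * a vertex buffer (fetched as three 8-bit loads on the pre-GFX10 path), but
 * the sampler-view and shader-image bits are stripped, so only
 * PIPE_BIND_VERTEX_BUFFER survives in the returned mask.
 */
#if 0
static void example_rgb8_usage(struct pipe_screen *screen)
{
   unsigned usage = PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_SAMPLER_VIEW;

   assert(si_is_vertex_format_supported(screen, PIPE_FORMAT_R8G8B8_UNORM,
                                        usage) == PIPE_BIND_VERTEX_BUFFER);
}
#endif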
2095
2096 static bool si_is_colorbuffer_format_supported(enum pipe_format format)
2097 {
2098 return si_translate_colorformat(format) != V_028C70_COLOR_INVALID &&
2099 si_translate_colorswap(format, false) != ~0U;
2100 }
2101
2102 static bool si_is_zs_format_supported(enum pipe_format format)
2103 {
2104 return si_translate_dbformat(format) != V_028040_Z_INVALID;
2105 }
2106
2107 static bool si_is_format_supported(struct pipe_screen *screen, enum pipe_format format,
2108 enum pipe_texture_target target, unsigned sample_count,
2109 unsigned storage_sample_count, unsigned usage)
2110 {
2111 struct si_screen *sscreen = (struct si_screen *)screen;
2112 unsigned retval = 0;
2113
2114 if (target >= PIPE_MAX_TEXTURE_TYPES) {
2115 PRINT_ERR("radeonsi: unsupported texture type %d\n", target);
2116 return false;
2117 }
2118
2119 if (MAX2(1, sample_count) < MAX2(1, storage_sample_count))
2120 return false;
2121
2122 if (sample_count > 1) {
2123 if (!screen->get_param(screen, PIPE_CAP_TEXTURE_MULTISAMPLE))
2124 return false;
2125
2126 /* Only power-of-two sample counts are supported. */
2127 if (!util_is_power_of_two_or_zero(sample_count) ||
2128 !util_is_power_of_two_or_zero(storage_sample_count))
2129 return false;
2130
2131 /* MSAA support without framebuffer attachments. */
2132 if (format == PIPE_FORMAT_NONE && sample_count <= 16)
2133 return true;
2134
2135 if (!sscreen->info.has_eqaa_surface_allocator || util_format_is_depth_or_stencil(format)) {
2136 /* Color without EQAA or depth/stencil. */
2137 if (sample_count > 8 || sample_count != storage_sample_count)
2138 return false;
2139 } else {
2140 /* Color with EQAA. */
2141 if (sample_count > 16 || storage_sample_count > 8)
2142 return false;
2143 }
2144 }
2145
2146 if (usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE)) {
2147 if (target == PIPE_BUFFER) {
2148 retval |= si_is_vertex_format_supported(
2149 screen, format, usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE));
2150 } else {
2151 if (si_is_sampler_format_supported(screen, format))
2152 retval |= usage & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_SHADER_IMAGE);
2153 }
2154 }
2155
2156 if ((usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
2157 PIPE_BIND_SHARED | PIPE_BIND_BLENDABLE)) &&
2158 si_is_colorbuffer_format_supported(format)) {
2159 retval |= usage & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT |
2160 PIPE_BIND_SHARED);
2161 if (!util_format_is_pure_integer(format) && !util_format_is_depth_or_stencil(format))
2162 retval |= usage & PIPE_BIND_BLENDABLE;
2163 }
2164
2165 if ((usage & PIPE_BIND_DEPTH_STENCIL) && si_is_zs_format_supported(format)) {
2166 retval |= PIPE_BIND_DEPTH_STENCIL;
2167 }
2168
2169 if (usage & PIPE_BIND_VERTEX_BUFFER) {
2170 retval |= si_is_vertex_format_supported(screen, format, PIPE_BIND_VERTEX_BUFFER);
2171 }
2172
2173 if ((usage & PIPE_BIND_LINEAR) && !util_format_is_compressed(format) &&
2174 !(usage & PIPE_BIND_DEPTH_STENCIL))
2175 retval |= PIPE_BIND_LINEAR;
2176
2177 return retval == usage;
2178 }
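
/* Compiled-out sketch of the "retval == usage" contract above: supported bits
 * are accumulated into retval and the query succeeds only if every requested
 * bit was granted. Asking for RENDER_TARGET | BLENDABLE on a pure-integer
 * format therefore fails even though RENDER_TARGET alone would succeed.
 */
#if 0
static bool example_format_query(struct pipe_screen *screen)
{
   return screen->is_format_supported(screen, PIPE_FORMAT_R32G32B32A32_SINT,
                                      PIPE_TEXTURE_2D, 0, 0,
                                      PIPE_BIND_RENDER_TARGET |
                                      PIPE_BIND_BLENDABLE); /* -> false */
}
#endif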
2179
2180 /*
2181 * framebuffer handling
2182 */
2183
2184 static void si_choose_spi_color_formats(struct si_surface *surf, unsigned format, unsigned swap,
2185 unsigned ntype, bool is_depth)
2186 {
2187 /* Alpha is needed for alpha-to-coverage.
2188 * Blending may be with or without alpha.
2189 */
2190 unsigned normal = 0; /* most optimal, may not support blending or export alpha */
2191 unsigned alpha = 0; /* exports alpha, but may not support blending */
2192 unsigned blend = 0; /* supports blending, but may not export alpha */
2193 unsigned blend_alpha = 0; /* least optimal, supports blending and exports alpha */
2194
2195 /* Choose the SPI color formats. These are required values for RB+.
2196 * Other chips have multiple choices, though they are not necessarily better.
2197 */
2198 switch (format) {
2199 case V_028C70_COLOR_5_6_5:
2200 case V_028C70_COLOR_1_5_5_5:
2201 case V_028C70_COLOR_5_5_5_1:
2202 case V_028C70_COLOR_4_4_4_4:
2203 case V_028C70_COLOR_10_11_11:
2204 case V_028C70_COLOR_11_11_10:
2205 case V_028C70_COLOR_8:
2206 case V_028C70_COLOR_8_8:
2207 case V_028C70_COLOR_8_8_8_8:
2208 case V_028C70_COLOR_10_10_10_2:
2209 case V_028C70_COLOR_2_10_10_10:
2210 if (ntype == V_028C70_NUMBER_UINT)
2211 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
2212 else if (ntype == V_028C70_NUMBER_SINT)
2213 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
2214 else
2215 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
2216 break;
2217
2218 case V_028C70_COLOR_16:
2219 case V_028C70_COLOR_16_16:
2220 case V_028C70_COLOR_16_16_16_16:
2221 if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM) {
2222 /* UNORM16 and SNORM16 don't support blending */
2223 if (ntype == V_028C70_NUMBER_UNORM)
2224 normal = alpha = V_028714_SPI_SHADER_UNORM16_ABGR;
2225 else
2226 normal = alpha = V_028714_SPI_SHADER_SNORM16_ABGR;
2227
2228 /* Use 32 bits per channel for blending. */
2229 if (format == V_028C70_COLOR_16) {
2230 if (swap == V_028C70_SWAP_STD) { /* R */
2231 blend = V_028714_SPI_SHADER_32_R;
2232 blend_alpha = V_028714_SPI_SHADER_32_AR;
2233 } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
2234 blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
2235 else
2236 assert(0);
2237 } else if (format == V_028C70_COLOR_16_16) {
2238 if (swap == V_028C70_SWAP_STD) { /* RG */
2239 blend = V_028714_SPI_SHADER_32_GR;
2240 blend_alpha = V_028714_SPI_SHADER_32_ABGR;
2241 } else if (swap == V_028C70_SWAP_ALT) /* RA */
2242 blend = blend_alpha = V_028714_SPI_SHADER_32_AR;
2243 else
2244 assert(0);
2245 } else /* 16_16_16_16 */
2246 blend = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
2247 } else if (ntype == V_028C70_NUMBER_UINT)
2248 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_UINT16_ABGR;
2249 else if (ntype == V_028C70_NUMBER_SINT)
2250 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_SINT16_ABGR;
2251 else if (ntype == V_028C70_NUMBER_FLOAT)
2252 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_FP16_ABGR;
2253 else
2254 assert(0);
2255 break;
2256
2257 case V_028C70_COLOR_32:
2258 if (swap == V_028C70_SWAP_STD) { /* R */
2259 blend = normal = V_028714_SPI_SHADER_32_R;
2260 alpha = blend_alpha = V_028714_SPI_SHADER_32_AR;
2261 } else if (swap == V_028C70_SWAP_ALT_REV) /* A */
2262 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
2263 else
2264 assert(0);
2265 break;
2266
2267 case V_028C70_COLOR_32_32:
2268 if (swap == V_028C70_SWAP_STD) { /* RG */
2269 blend = normal = V_028714_SPI_SHADER_32_GR;
2270 alpha = blend_alpha = V_028714_SPI_SHADER_32_ABGR;
2271 } else if (swap == V_028C70_SWAP_ALT) /* RA */
2272 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_AR;
2273 else
2274 assert(0);
2275 break;
2276
2277 case V_028C70_COLOR_32_32_32_32:
2278 case V_028C70_COLOR_8_24:
2279 case V_028C70_COLOR_24_8:
2280 case V_028C70_COLOR_X24_8_32_FLOAT:
2281 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
2282 break;
2283
2284 default:
2285 assert(0);
2286 return;
2287 }
2288
2289 /* The DB->CB copy needs 32_ABGR. */
2290 if (is_depth)
2291 alpha = blend = blend_alpha = normal = V_028714_SPI_SHADER_32_ABGR;
2292
2293 surf->spi_shader_col_format = normal;
2294 surf->spi_shader_col_format_alpha = alpha;
2295 surf->spi_shader_col_format_blend = blend;
2296 surf->spi_shader_col_format_blend_alpha = blend_alpha;
2297 }
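
/* Compiled-out example of the trade-off above for a plain R16G16_UNORM
 * target (format 16_16, swap STD): UNORM16 is exported when no blending is
 * needed, but blending falls back to full 32-bit exports. The draw-time
 * choice between the four variants is made elsewhere, based on the blend
 * state and alpha-to-coverage.
 */
#if 0
static void example_spi_formats(struct si_surface *surf)
{
   si_choose_spi_color_formats(surf, V_028C70_COLOR_16_16, V_028C70_SWAP_STD,
                               V_028C70_NUMBER_UNORM, false);
   assert(surf->spi_shader_col_format == V_028714_SPI_SHADER_UNORM16_ABGR);
   assert(surf->spi_shader_col_format_blend == V_028714_SPI_SHADER_32_GR);
   assert(surf->spi_shader_col_format_blend_alpha == V_028714_SPI_SHADER_32_ABGR);
}
#endif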
2298
2299 static void si_initialize_color_surface(struct si_context *sctx, struct si_surface *surf)
2300 {
2301 struct si_texture *tex = (struct si_texture *)surf->base.texture;
2302 unsigned color_info, color_attrib;
2303 unsigned format, swap, ntype, endian;
2304 const struct util_format_description *desc;
2305 int firstchan;
2306 unsigned blend_clamp = 0, blend_bypass = 0;
2307
2308 desc = util_format_description(surf->base.format);
2309 for (firstchan = 0; firstchan < 4; firstchan++) {
2310 if (desc->channel[firstchan].type != UTIL_FORMAT_TYPE_VOID) {
2311 break;
2312 }
2313 }
2314 if (firstchan == 4 || desc->channel[firstchan].type == UTIL_FORMAT_TYPE_FLOAT) {
2315 ntype = V_028C70_NUMBER_FLOAT;
2316 } else {
2317 ntype = V_028C70_NUMBER_UNORM;
2318 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB)
2319 ntype = V_028C70_NUMBER_SRGB;
2320 else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_SIGNED) {
2321 if (desc->channel[firstchan].pure_integer) {
2322 ntype = V_028C70_NUMBER_SINT;
2323 } else {
2324 assert(desc->channel[firstchan].normalized);
2325 ntype = V_028C70_NUMBER_SNORM;
2326 }
2327 } else if (desc->channel[firstchan].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2328 if (desc->channel[firstchan].pure_integer) {
2329 ntype = V_028C70_NUMBER_UINT;
2330 } else {
2331 assert(desc->channel[firstchan].normalized);
2332 ntype = V_028C70_NUMBER_UNORM;
2333 }
2334 }
2335 }
2336
2337 format = si_translate_colorformat(surf->base.format);
2338 if (format == V_028C70_COLOR_INVALID) {
2339 PRINT_ERR("Invalid CB format: %d, disabling CB.\n", surf->base.format);
2340 }
2341 assert(format != V_028C70_COLOR_INVALID);
2342 swap = si_translate_colorswap(surf->base.format, false);
2343 endian = si_colorformat_endian_swap(format);
2344
2345 /* blend clamp should be set for all NORM/SRGB types */
2346 if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM ||
2347 ntype == V_028C70_NUMBER_SRGB)
2348 blend_clamp = 1;
2349
2350 /* Set blend bypass according to docs if SINT/UINT or
2351 * 8/24 COLOR variants. */
2352 if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT ||
2353 format == V_028C70_COLOR_8_24 || format == V_028C70_COLOR_24_8 ||
2354 format == V_028C70_COLOR_X24_8_32_FLOAT) {
2355 blend_clamp = 0;
2356 blend_bypass = 1;
2357 }
2358
2359 if (ntype == V_028C70_NUMBER_UINT || ntype == V_028C70_NUMBER_SINT) {
2360 if (format == V_028C70_COLOR_8 || format == V_028C70_COLOR_8_8 ||
2361 format == V_028C70_COLOR_8_8_8_8)
2362 surf->color_is_int8 = true;
2363 else if (format == V_028C70_COLOR_10_10_10_2 || format == V_028C70_COLOR_2_10_10_10)
2364 surf->color_is_int10 = true;
2365 }
2366
2367 color_info =
2368 S_028C70_FORMAT(format) | S_028C70_COMP_SWAP(swap) | S_028C70_BLEND_CLAMP(blend_clamp) |
2369 S_028C70_BLEND_BYPASS(blend_bypass) | S_028C70_SIMPLE_FLOAT(1) |
2370 S_028C70_ROUND_MODE(ntype != V_028C70_NUMBER_UNORM && ntype != V_028C70_NUMBER_SNORM &&
2371 ntype != V_028C70_NUMBER_SRGB && format != V_028C70_COLOR_8_24 &&
2372 format != V_028C70_COLOR_24_8) |
2373 S_028C70_NUMBER_TYPE(ntype) | S_028C70_ENDIAN(endian);
2374
2375 /* Intensity is implemented as Red, so treat it that way. */
2376 color_attrib = S_028C74_FORCE_DST_ALPHA_1(desc->swizzle[3] == PIPE_SWIZZLE_1 ||
2377 util_format_is_intensity(surf->base.format));
2378
2379 if (tex->buffer.b.b.nr_samples > 1) {
2380 unsigned log_samples = util_logbase2(tex->buffer.b.b.nr_samples);
2381 unsigned log_fragments = util_logbase2(tex->buffer.b.b.nr_storage_samples);
2382
2383 color_attrib |= S_028C74_NUM_SAMPLES(log_samples) | S_028C74_NUM_FRAGMENTS(log_fragments);
2384
2385 if (tex->surface.fmask_offset) {
2386 color_info |= S_028C70_COMPRESSION(1);
2387 unsigned fmask_bankh = util_logbase2(tex->surface.u.legacy.fmask.bankh);
2388
2389 if (sctx->chip_class == GFX6) {
2390 /* due to a hw bug, FMASK_BANK_HEIGHT must be set on GFX6 too */
2391 color_attrib |= S_028C74_FMASK_BANK_HEIGHT(fmask_bankh);
2392 }
2393 }
2394 }
2395
2396 if (sctx->chip_class >= GFX10) {
2397 unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
2398
2399 /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
2400 * 64 for APU because all of our APUs to date use DIMMs which have
2401 * a request granularity size of 64B, while all other chips have a
2402 * 32B request size. */
2403 if (!sctx->screen->info.has_dedicated_vram)
2404 min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
2405
2406 surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
2407 S_028C78_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
2408 S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
2409 S_028C78_INDEPENDENT_64B_BLOCKS(0) |
2410 S_028C78_INDEPENDENT_128B_BLOCKS(1);
2411 } else if (sctx->chip_class >= GFX8) {
2412 unsigned max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_256B;
2413 unsigned min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_32B;
2414
2415 /* amdvlk: [min-compressed-block-size] should be set to 32 for dGPU and
2416 * 64 for APU because all of our APUs to date use DIMMs which have
2417 * a request granularity size of 64B, while all other chips have a
2418 * 32B request size. */
2419 if (!sctx->screen->info.has_dedicated_vram)
2420 min_compressed_block_size = V_028C78_MIN_BLOCK_SIZE_64B;
2421
2422 if (tex->buffer.b.b.nr_storage_samples > 1) {
2423 if (tex->surface.bpe == 1)
2424 max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_64B;
2425 else if (tex->surface.bpe == 2)
2426 max_uncompressed_block_size = V_028C78_MAX_BLOCK_SIZE_128B;
2427 }
2428
2429 surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) |
2430 S_028C78_MIN_COMPRESSED_BLOCK_SIZE(min_compressed_block_size) |
2431 S_028C78_INDEPENDENT_64B_BLOCKS(1);
2432 }
2433
2434 /* This must be set for fast clear to work without FMASK. */
2435 if (!tex->surface.fmask_size && sctx->chip_class == GFX6) {
2436 unsigned bankh = util_logbase2(tex->surface.u.legacy.bankh);
2437 color_attrib |= S_028C74_FMASK_BANK_HEIGHT(bankh);
2438 }
2439
2440 /* GFX10 field has the same base shift as the GFX6 field */
2441 unsigned color_view = S_028C6C_SLICE_START(surf->base.u.tex.first_layer) |
2442 S_028C6C_SLICE_MAX_GFX10(surf->base.u.tex.last_layer);
2443 unsigned mip0_depth = util_max_layer(&tex->buffer.b.b, 0);
2444
2445 if (sctx->chip_class >= GFX10) {
2446 color_view |= S_028C6C_MIP_LEVEL_GFX10(surf->base.u.tex.level);
2447
2448 surf->cb_color_attrib3 = S_028EE0_MIP0_DEPTH(mip0_depth) |
2449 S_028EE0_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type) |
2450 S_028EE0_RESOURCE_LEVEL(1);
2451 } else if (sctx->chip_class == GFX9) {
2452 color_view |= S_028C6C_MIP_LEVEL_GFX9(surf->base.u.tex.level);
2453 color_attrib |= S_028C74_MIP0_DEPTH(mip0_depth) |
2454 S_028C74_RESOURCE_TYPE(tex->surface.u.gfx9.resource_type);
2455 }
2456
2457 if (sctx->chip_class >= GFX9) {
2458 surf->cb_color_attrib2 = S_028C68_MIP0_WIDTH(surf->width0 - 1) |
2459 S_028C68_MIP0_HEIGHT(surf->height0 - 1) |
2460 S_028C68_MAX_MIP(tex->buffer.b.b.last_level);
2461 }
2462
2463 surf->cb_color_view = color_view;
2464 surf->cb_color_info = color_info;
2465 surf->cb_color_attrib = color_attrib;
2466
2467 /* Determine pixel shader export format */
2468 si_choose_spi_color_formats(surf, format, swap, ntype, tex->is_depth);
2469
2470 surf->color_initialized = true;
2471 }
2472
2473 static void si_init_depth_surface(struct si_context *sctx, struct si_surface *surf)
2474 {
2475 struct si_texture *tex = (struct si_texture *)surf->base.texture;
2476 unsigned level = surf->base.u.tex.level;
2477 unsigned format, stencil_format;
2478 uint32_t z_info, s_info;
2479
2480 format = si_translate_dbformat(tex->db_render_format);
2481 stencil_format = tex->surface.has_stencil ? V_028044_STENCIL_8 : V_028044_STENCIL_INVALID;
2482
2483 assert(format != V_028040_Z_INVALID);
2484 if (format == V_028040_Z_INVALID)
2485 PRINT_ERR("Invalid DB format: %d, disabling DB.\n", tex->buffer.b.b.format);
2486
2487 surf->db_depth_view = S_028008_SLICE_START(surf->base.u.tex.first_layer) |
2488 S_028008_SLICE_MAX(surf->base.u.tex.last_layer);
2489 surf->db_htile_data_base = 0;
2490 surf->db_htile_surface = 0;
2491
2492 if (sctx->chip_class >= GFX10) {
2493 surf->db_depth_view |= S_028008_SLICE_START_HI(surf->base.u.tex.first_layer >> 11) |
2494 S_028008_SLICE_MAX_HI(surf->base.u.tex.last_layer >> 11);
2495 }
2496
2497 if (sctx->chip_class >= GFX9) {
2498 assert(tex->surface.u.gfx9.surf_offset == 0);
2499 surf->db_depth_base = tex->buffer.gpu_address >> 8;
2500 surf->db_stencil_base = (tex->buffer.gpu_address + tex->surface.u.gfx9.stencil_offset) >> 8;
2501 z_info = S_028038_FORMAT(format) |
2502 S_028038_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples)) |
2503 S_028038_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
2504 S_028038_MAXMIP(tex->buffer.b.b.last_level);
2505 s_info = S_02803C_FORMAT(stencil_format) |
2506 S_02803C_SW_MODE(tex->surface.u.gfx9.stencil.swizzle_mode);
2507
2508 if (sctx->chip_class == GFX9) {
2509 surf->db_z_info2 = S_028068_EPITCH(tex->surface.u.gfx9.surf.epitch);
2510 surf->db_stencil_info2 = S_02806C_EPITCH(tex->surface.u.gfx9.stencil.epitch);
2511 }
2512 surf->db_depth_view |= S_028008_MIPID(level);
2513 surf->db_depth_size =
2514 S_02801C_X_MAX(tex->buffer.b.b.width0 - 1) | S_02801C_Y_MAX(tex->buffer.b.b.height0 - 1);
2515
2516 if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
2517 z_info |= S_028038_TILE_SURFACE_ENABLE(1) | S_028038_ALLOW_EXPCLEAR(1);
2518
2519 if (tex->tc_compatible_htile) {
2520 unsigned max_zplanes = 4;
2521
2522 if (tex->db_render_format == PIPE_FORMAT_Z16_UNORM && tex->buffer.b.b.nr_samples > 1)
2523 max_zplanes = 2;
2524
2525 z_info |= S_028038_DECOMPRESS_ON_N_ZPLANES(max_zplanes + 1);
2526
2527 if (sctx->chip_class >= GFX10) {
2528 z_info |= S_028040_ITERATE_FLUSH(1);
2529 s_info |= S_028044_ITERATE_FLUSH(!tex->htile_stencil_disabled);
2530 } else {
2531 z_info |= S_028038_ITERATE_FLUSH(1);
2532 s_info |= S_02803C_ITERATE_FLUSH(1);
2533 }
2534 }
2535
2536 if (tex->surface.has_stencil && !tex->htile_stencil_disabled) {
2537 /* Stencil buffer workaround ported from the GFX6-GFX8 code.
2538 * See there for the explanation.
2539 */
2540 s_info |= S_02803C_ALLOW_EXPCLEAR(tex->buffer.b.b.nr_samples <= 1);
2541 } else {
2542 /* Use all HTILE for depth if there's no stencil. */
2543 s_info |= S_02803C_TILE_STENCIL_DISABLE(1);
2544 }
2545
2546 surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
2547 surf->db_htile_surface =
2548 S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(tex->surface.u.gfx9.htile.pipe_aligned);
2549 if (sctx->chip_class == GFX9) {
2550 surf->db_htile_surface |= S_028ABC_RB_ALIGNED(tex->surface.u.gfx9.htile.rb_aligned);
2551 }
2552 }
2553 } else {
2554 /* GFX6-GFX8 */
2555 struct legacy_surf_level *levelinfo = &tex->surface.u.legacy.level[level];
2556
2557 assert(levelinfo->nblk_x % 8 == 0 && levelinfo->nblk_y % 8 == 0);
2558
2559 surf->db_depth_base =
2560 (tex->buffer.gpu_address + tex->surface.u.legacy.level[level].offset) >> 8;
2561 surf->db_stencil_base =
2562 (tex->buffer.gpu_address + tex->surface.u.legacy.stencil_level[level].offset) >> 8;
2563
2564 z_info =
2565 S_028040_FORMAT(format) | S_028040_NUM_SAMPLES(util_logbase2(tex->buffer.b.b.nr_samples));
2566 s_info = S_028044_FORMAT(stencil_format);
2567 surf->db_depth_info = S_02803C_ADDR5_SWIZZLE_MASK(!tex->tc_compatible_htile);
2568
2569 if (sctx->chip_class >= GFX7) {
2570 struct radeon_info *info = &sctx->screen->info;
2571 unsigned index = tex->surface.u.legacy.tiling_index[level];
2572 unsigned stencil_index = tex->surface.u.legacy.stencil_tiling_index[level];
2573 unsigned macro_index = tex->surface.u.legacy.macro_tile_index;
2574 unsigned tile_mode = info->si_tile_mode_array[index];
2575 unsigned stencil_tile_mode = info->si_tile_mode_array[stencil_index];
2576 unsigned macro_mode = info->cik_macrotile_mode_array[macro_index];
2577
2578 surf->db_depth_info |= S_02803C_ARRAY_MODE(G_009910_ARRAY_MODE(tile_mode)) |
2579 S_02803C_PIPE_CONFIG(G_009910_PIPE_CONFIG(tile_mode)) |
2580 S_02803C_BANK_WIDTH(G_009990_BANK_WIDTH(macro_mode)) |
2581 S_02803C_BANK_HEIGHT(G_009990_BANK_HEIGHT(macro_mode)) |
2582 S_02803C_MACRO_TILE_ASPECT(G_009990_MACRO_TILE_ASPECT(macro_mode)) |
2583 S_02803C_NUM_BANKS(G_009990_NUM_BANKS(macro_mode));
2584 z_info |= S_028040_TILE_SPLIT(G_009910_TILE_SPLIT(tile_mode));
2585 s_info |= S_028044_TILE_SPLIT(G_009910_TILE_SPLIT(stencil_tile_mode));
2586 } else {
2587 unsigned tile_mode_index = si_tile_mode_index(tex, level, false);
2588 z_info |= S_028040_TILE_MODE_INDEX(tile_mode_index);
2589 tile_mode_index = si_tile_mode_index(tex, level, true);
2590 s_info |= S_028044_TILE_MODE_INDEX(tile_mode_index);
2591 }
2592
2593 surf->db_depth_size = S_028058_PITCH_TILE_MAX((levelinfo->nblk_x / 8) - 1) |
2594 S_028058_HEIGHT_TILE_MAX((levelinfo->nblk_y / 8) - 1);
2595 surf->db_depth_slice =
2596 S_02805C_SLICE_TILE_MAX((levelinfo->nblk_x * levelinfo->nblk_y) / 64 - 1);
2597
2598 if (si_htile_enabled(tex, level, PIPE_MASK_ZS)) {
2599 z_info |= S_028040_TILE_SURFACE_ENABLE(1) | S_028040_ALLOW_EXPCLEAR(1);
2600
2601 if (tex->surface.has_stencil) {
2602 /* Workaround: For a not yet understood reason, the
2603 * combination of MSAA, fast stencil clear and stencil
2604 * decompress messes with subsequent stencil buffer
2605 * uses. Problem was reproduced on Verde, Bonaire,
2606 * Tonga, and Carrizo.
2607 *
2608 * Disabling EXPCLEAR works around the problem.
2609 *
2610 * Check piglit's arb_texture_multisample-stencil-clear
2611 * test if you want to try changing this.
2612 */
2613 if (tex->buffer.b.b.nr_samples <= 1)
2614 s_info |= S_028044_ALLOW_EXPCLEAR(1);
2615 } else if (!tex->tc_compatible_htile) {
2616 /* Use all of the htile_buffer for depth if there's no stencil.
2617 * This must not be set when TC-compatible HTILE is enabled
2618 * due to a hw bug.
2619 */
2620 s_info |= S_028044_TILE_STENCIL_DISABLE(1);
2621 }
2622
2623 surf->db_htile_data_base = (tex->buffer.gpu_address + tex->surface.htile_offset) >> 8;
2624 surf->db_htile_surface = S_028ABC_FULL_CACHE(1);
2625
2626 if (tex->tc_compatible_htile) {
2627 surf->db_htile_surface |= S_028ABC_TC_COMPATIBLE(1);
2628
2629 /* 0 = full compression. N = only compress up to N-1 Z planes. */
2630 if (tex->buffer.b.b.nr_samples <= 1)
2631 z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(5);
2632 else if (tex->buffer.b.b.nr_samples <= 4)
2633 z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(3);
2634 else
2635 z_info |= S_028040_DECOMPRESS_ON_N_ZPLANES(2);
2636 }
2637 }
2638 }
2639
2640 surf->db_z_info = z_info;
2641 surf->db_stencil_info = s_info;
2642
2643 surf->depth_initialized = true;
2644 }
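
/* Compiled-out restatement of the DECOMPRESS_ON_N_ZPLANES values above:
 * 0 means full compression and N means "compress at most N-1 Z planes",
 * so TC-compatible HTILE on GFX6-GFX8 keeps up to 4/2/1 planes compressed
 * depending on the sample count.
 */
#if 0
static unsigned example_decompress_on_n_zplanes(unsigned nr_samples)
{
   if (nr_samples <= 1)
      return 5; /* up to 4 compressed Z planes */
   else if (nr_samples <= 4)
      return 3; /* up to 2 */
   else
      return 2; /* up to 1 */
}
#endif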
2645
2646 void si_update_fb_dirtiness_after_rendering(struct si_context *sctx)
2647 {
2648 if (sctx->decompression_enabled)
2649 return;
2650
2651 if (sctx->framebuffer.state.zsbuf) {
2652 struct pipe_surface *surf = sctx->framebuffer.state.zsbuf;
2653 struct si_texture *tex = (struct si_texture *)surf->texture;
2654
2655 tex->dirty_level_mask |= 1 << surf->u.tex.level;
2656
2657 if (tex->surface.has_stencil)
2658 tex->stencil_dirty_level_mask |= 1 << surf->u.tex.level;
2659 }
2660
2661 unsigned compressed_cb_mask = sctx->framebuffer.compressed_cb_mask;
2662 while (compressed_cb_mask) {
2663 unsigned i = u_bit_scan(&compressed_cb_mask);
2664 struct pipe_surface *surf = sctx->framebuffer.state.cbufs[i];
2665 struct si_texture *tex = (struct si_texture *)surf->texture;
2666
2667 if (tex->surface.fmask_offset) {
2668 tex->dirty_level_mask |= 1 << surf->u.tex.level;
2669 tex->fmask_is_identity = false;
2670 }
2671 if (tex->dcc_gather_statistics)
2672 tex->separate_dcc_dirty = true;
2673 }
2674 }
2675
2676 static void si_dec_framebuffer_counters(const struct pipe_framebuffer_state *state)
2677 {
2678 for (int i = 0; i < state->nr_cbufs; ++i) {
2679 struct si_surface *surf = NULL;
2680 struct si_texture *tex;
2681
2682 if (!state->cbufs[i])
2683 continue;
2684 surf = (struct si_surface *)state->cbufs[i];
2685 tex = (struct si_texture *)surf->base.texture;
2686
2687 p_atomic_dec(&tex->framebuffers_bound);
2688 }
2689 }
2690
2691 static void si_set_framebuffer_state(struct pipe_context *ctx,
2692 const struct pipe_framebuffer_state *state)
2693 {
2694 struct si_context *sctx = (struct si_context *)ctx;
2695 struct si_surface *surf = NULL;
2696 struct si_texture *tex;
2697 bool old_any_dst_linear = sctx->framebuffer.any_dst_linear;
2698 unsigned old_nr_samples = sctx->framebuffer.nr_samples;
2699 unsigned old_colorbuf_enabled_4bit = sctx->framebuffer.colorbuf_enabled_4bit;
2700 bool old_has_zsbuf = !!sctx->framebuffer.state.zsbuf;
2701 bool old_has_stencil =
2702 old_has_zsbuf &&
2703 ((struct si_texture *)sctx->framebuffer.state.zsbuf->texture)->surface.has_stencil;
2704 bool unbound = false;
2705 int i;
2706
2707 /* Reject zero-sized framebuffers due to a hw bug on GFX6 that occurs
2708 * when PA_SU_HARDWARE_SCREEN_OFFSET != 0 and any_scissor.BR_X/Y <= 0.
2709 * We could implement the full workaround here, but it's a useless case.
2710 */
2711 if ((!state->width || !state->height) && (state->nr_cbufs || state->zsbuf)) {
2712 unreachable("the framebuffer shouldn't have zero area");
2713 return;
2714 }
2715
2716 si_update_fb_dirtiness_after_rendering(sctx);
2717
2718 for (i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
2719 if (!sctx->framebuffer.state.cbufs[i])
2720 continue;
2721
2722 tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
2723 if (tex->dcc_gather_statistics)
2724 vi_separate_dcc_stop_query(sctx, tex);
2725 }
2726
2727 /* Disable DCC if the formats are incompatible. */
2728 for (i = 0; i < state->nr_cbufs; i++) {
2729 if (!state->cbufs[i])
2730 continue;
2731
2732 surf = (struct si_surface *)state->cbufs[i];
2733 tex = (struct si_texture *)surf->base.texture;
2734
2735 if (!surf->dcc_incompatible)
2736 continue;
2737
2738 /* Since the DCC decompression calls back into set_framebuffer_state,
2739 * we need to unbind the framebuffer, so that
2740 * vi_separate_dcc_stop_query isn't called twice with the same
2741 * color buffer.
2742 */
2743 if (!unbound) {
2744 util_copy_framebuffer_state(&sctx->framebuffer.state, NULL);
2745 unbound = true;
2746 }
2747
2748 if (vi_dcc_enabled(tex, surf->base.u.tex.level))
2749 if (!si_texture_disable_dcc(sctx, tex))
2750 si_decompress_dcc(sctx, tex);
2751
2752 surf->dcc_incompatible = false;
2753 }
2754
2755 /* Only flush TC when changing the framebuffer state, because
2756 * the only client not using TC that can change textures is
2757 * the framebuffer.
2758 *
2759 * Wait for compute shaders because of possible transitions:
2760 * - FB write -> shader read
2761 * - shader write -> FB read
2762 *
2763 * DB caches are flushed on demand (using si_decompress_textures).
2764 *
2765 * When MSAA is enabled, CB and TC caches are flushed on demand
2766 * (after FMASK decompression). Shader write -> FB read transitions
2767 * cannot happen for MSAA textures, because MSAA shader images are
2768 * not supported.
2769 *
2770 * Only flush and wait for CB if there is actually a bound color buffer.
2771 */
2772 if (sctx->framebuffer.uncompressed_cb_mask) {
2773 si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
2774 sctx->framebuffer.CB_has_shader_readable_metadata,
2775 sctx->framebuffer.all_DCC_pipe_aligned);
2776 }
2777
2778 sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH;
2779
2780 /* u_blitter doesn't invoke depth decompression when it does multiple
2781 * blits in a row, but the only case when it matters for DB is when
2782 * doing generate_mipmap. So here we flush DB manually between
2783 * individual generate_mipmap blits.
2784 * Note that lower mipmap levels aren't compressed.
2785 */
2786 if (sctx->generate_mipmap_for_depth) {
2787 si_make_DB_shader_coherent(sctx, 1, false, sctx->framebuffer.DB_has_shader_readable_metadata);
2788 } else if (sctx->chip_class == GFX9) {
2789 /* It appears that DB metadata "leaks" in a sequence of:
2790 * - depth clear
2791 * - DCC decompress for shader image writes (with DB disabled)
2792 * - render with DEPTH_BEFORE_SHADER=1
2793 * Flushing DB metadata works around the problem.
2794 */
2795 sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_DB_META;
2796 }
2797
2798 /* Take the maximum of the old and new count. If the new count is lower,
2799 * dirtying is needed to disable the unbound colorbuffers.
2800 */
2801 sctx->framebuffer.dirty_cbufs |=
2802 (1 << MAX2(sctx->framebuffer.state.nr_cbufs, state->nr_cbufs)) - 1;
2803 sctx->framebuffer.dirty_zsbuf |= sctx->framebuffer.state.zsbuf != state->zsbuf;
2804
2805 si_dec_framebuffer_counters(&sctx->framebuffer.state);
2806 util_copy_framebuffer_state(&sctx->framebuffer.state, state);
2807
2808 sctx->framebuffer.colorbuf_enabled_4bit = 0;
2809 sctx->framebuffer.spi_shader_col_format = 0;
2810 sctx->framebuffer.spi_shader_col_format_alpha = 0;
2811 sctx->framebuffer.spi_shader_col_format_blend = 0;
2812 sctx->framebuffer.spi_shader_col_format_blend_alpha = 0;
2813 sctx->framebuffer.color_is_int8 = 0;
2814 sctx->framebuffer.color_is_int10 = 0;
2815
2816 sctx->framebuffer.compressed_cb_mask = 0;
2817 sctx->framebuffer.uncompressed_cb_mask = 0;
2818 sctx->framebuffer.displayable_dcc_cb_mask = 0;
2819 sctx->framebuffer.nr_samples = util_framebuffer_get_num_samples(state);
2820 sctx->framebuffer.nr_color_samples = sctx->framebuffer.nr_samples;
2821 sctx->framebuffer.log_samples = util_logbase2(sctx->framebuffer.nr_samples);
2822 sctx->framebuffer.any_dst_linear = false;
2823 sctx->framebuffer.CB_has_shader_readable_metadata = false;
2824 sctx->framebuffer.DB_has_shader_readable_metadata = false;
2825 sctx->framebuffer.all_DCC_pipe_aligned = true;
2826 sctx->framebuffer.min_bytes_per_pixel = 0;
2827
2828 for (i = 0; i < state->nr_cbufs; i++) {
2829 if (!state->cbufs[i])
2830 continue;
2831
2832 surf = (struct si_surface *)state->cbufs[i];
2833 tex = (struct si_texture *)surf->base.texture;
2834
2835 if (!surf->color_initialized) {
2836 si_initialize_color_surface(sctx, surf);
2837 }
2838
2839 sctx->framebuffer.colorbuf_enabled_4bit |= 0xf << (i * 4);
2840 sctx->framebuffer.spi_shader_col_format |= surf->spi_shader_col_format << (i * 4);
2841 sctx->framebuffer.spi_shader_col_format_alpha |= surf->spi_shader_col_format_alpha << (i * 4);
2842 sctx->framebuffer.spi_shader_col_format_blend |= surf->spi_shader_col_format_blend << (i * 4);
2843 sctx->framebuffer.spi_shader_col_format_blend_alpha |= surf->spi_shader_col_format_blend_alpha
2844 << (i * 4);
2845
2846 if (surf->color_is_int8)
2847 sctx->framebuffer.color_is_int8 |= 1 << i;
2848 if (surf->color_is_int10)
2849 sctx->framebuffer.color_is_int10 |= 1 << i;
2850
2851 if (tex->surface.fmask_offset)
2852 sctx->framebuffer.compressed_cb_mask |= 1 << i;
2853 else
2854 sctx->framebuffer.uncompressed_cb_mask |= 1 << i;
2855
2856 if (tex->surface.dcc_offset)
2857 sctx->framebuffer.displayable_dcc_cb_mask |= 1 << i;
2858
2859 /* Don't update nr_color_samples for non-AA buffers.
2860 * (e.g. destination of MSAA resolve)
2861 */
2862 if (tex->buffer.b.b.nr_samples >= 2 &&
2863 tex->buffer.b.b.nr_storage_samples < tex->buffer.b.b.nr_samples) {
2864 sctx->framebuffer.nr_color_samples =
2865 MIN2(sctx->framebuffer.nr_color_samples, tex->buffer.b.b.nr_storage_samples);
2866 sctx->framebuffer.nr_color_samples = MAX2(1, sctx->framebuffer.nr_color_samples);
2867 }
2868
2869 if (tex->surface.is_linear)
2870 sctx->framebuffer.any_dst_linear = true;
2871
2872 if (vi_dcc_enabled(tex, surf->base.u.tex.level)) {
2873 sctx->framebuffer.CB_has_shader_readable_metadata = true;
2874
2875 if (sctx->chip_class >= GFX9 && !tex->surface.u.gfx9.dcc.pipe_aligned)
2876 sctx->framebuffer.all_DCC_pipe_aligned = false;
2877 }
2878
2879 si_context_add_resource_size(sctx, surf->base.texture);
2880
2881 p_atomic_inc(&tex->framebuffers_bound);
2882
2883 if (tex->dcc_gather_statistics) {
2884 /* Dirty tracking must be enabled for DCC usage analysis. */
2885 sctx->framebuffer.compressed_cb_mask |= 1 << i;
2886 vi_separate_dcc_start_query(sctx, tex);
2887 }
2888
2889 /* Update the minimum but don't keep 0. */
2890 if (!sctx->framebuffer.min_bytes_per_pixel ||
2891 tex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
2892 sctx->framebuffer.min_bytes_per_pixel = tex->surface.bpe;
2893 }
2894
2895 /* For optimal DCC performance. */
2896 if (sctx->chip_class >= GFX10)
2897 sctx->framebuffer.dcc_overwrite_combiner_watermark = 6;
2898 else
2899 sctx->framebuffer.dcc_overwrite_combiner_watermark = 4;
2900
2901 struct si_texture *zstex = NULL;
2902
2903 if (state->zsbuf) {
2904 surf = (struct si_surface *)state->zsbuf;
2905 zstex = (struct si_texture *)surf->base.texture;
2906
2907 if (!surf->depth_initialized) {
2908 si_init_depth_surface(sctx, surf);
2909 }
2910
2911 if (vi_tc_compat_htile_enabled(zstex, surf->base.u.tex.level, PIPE_MASK_ZS))
2912 sctx->framebuffer.DB_has_shader_readable_metadata = true;
2913
2914 si_context_add_resource_size(sctx, surf->base.texture);
2915
2916 /* Update the minimum but don't keep 0. */
2917 if (!sctx->framebuffer.min_bytes_per_pixel ||
2918 zstex->surface.bpe < sctx->framebuffer.min_bytes_per_pixel)
2919 sctx->framebuffer.min_bytes_per_pixel = zstex->surface.bpe;
2920 }
2921
2922 si_update_ps_colorbuf0_slot(sctx);
2923 si_update_poly_offset_state(sctx);
2924 si_update_ngg_small_prim_precision(sctx);
2925 si_mark_atom_dirty(sctx, &sctx->atoms.s.cb_render_state);
2926 si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
2927
2928 if (sctx->screen->dpbb_allowed)
2929 si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
2930
2931 if (sctx->framebuffer.any_dst_linear != old_any_dst_linear)
2932 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
2933
2934 if (sctx->screen->has_out_of_order_rast &&
2935 (sctx->framebuffer.colorbuf_enabled_4bit != old_colorbuf_enabled_4bit ||
2936 !!sctx->framebuffer.state.zsbuf != old_has_zsbuf ||
2937 (zstex && zstex->surface.has_stencil != old_has_stencil)))
2938 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
2939
2940 if (sctx->framebuffer.nr_samples != old_nr_samples) {
2941 struct pipe_constant_buffer constbuf = {0};
2942
2943 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
2944 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
2945
2946 constbuf.buffer = sctx->sample_pos_buffer;
2947
2948 /* Set sample locations as fragment shader constants. */
2949 switch (sctx->framebuffer.nr_samples) {
2950 case 1:
2951 constbuf.buffer_offset = 0;
2952 break;
2953 case 2:
2954 constbuf.buffer_offset =
2955 (ubyte *)sctx->sample_positions.x2 - (ubyte *)sctx->sample_positions.x1;
2956 break;
2957 case 4:
2958 constbuf.buffer_offset =
2959 (ubyte *)sctx->sample_positions.x4 - (ubyte *)sctx->sample_positions.x1;
2960 break;
2961 case 8:
2962 constbuf.buffer_offset =
2963 (ubyte *)sctx->sample_positions.x8 - (ubyte *)sctx->sample_positions.x1;
2964 break;
2965 case 16:
2966 constbuf.buffer_offset =
2967 (ubyte *)sctx->sample_positions.x16 - (ubyte *)sctx->sample_positions.x1;
2968 break;
2969 default:
2970 PRINT_ERR("Requested an invalid number of samples %i.\n", sctx->framebuffer.nr_samples);
2971 assert(0);
2972 }
2973 constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
2974 si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
2975
2976 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_sample_locs);
2977 }
2978
2979 sctx->do_update_shaders = true;
2980
2981 if (!sctx->decompression_enabled) {
2982 /* Prevent texture decompression when the framebuffer state
2983 * changes come from the decompression passes themselves.
2984 */
2985 sctx->need_check_render_feedback = true;
2986 }
2987 }
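
/* Compiled-out sketch of the sample-position upload above: the constant
 * buffer holds two floats (x, y) per sample, so buffer_size is
 * nr_samples * 2 * 4 bytes, and the buffer offsets are just the distances of
 * the x2/x4/x8/x16 arrays from x1 inside the preallocated sample positions.
 */
#if 0
static unsigned example_sample_pos_cb_size(unsigned nr_samples)
{
   return nr_samples * 2 * sizeof(float);
}
#endif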
2988
2989 static void si_emit_framebuffer_state(struct si_context *sctx)
2990 {
2991 struct radeon_cmdbuf *cs = sctx->gfx_cs;
2992 struct pipe_framebuffer_state *state = &sctx->framebuffer.state;
2993 unsigned i, nr_cbufs = state->nr_cbufs;
2994 struct si_texture *tex = NULL;
2995 struct si_surface *cb = NULL;
2996 unsigned cb_color_info = 0;
2997
2998 /* Colorbuffers. */
2999 for (i = 0; i < nr_cbufs; i++) {
3000 uint64_t cb_color_base, cb_color_fmask, cb_color_cmask, cb_dcc_base;
3001 unsigned cb_color_attrib;
3002
3003 if (!(sctx->framebuffer.dirty_cbufs & (1 << i)))
3004 continue;
3005
3006 cb = (struct si_surface *)state->cbufs[i];
3007 if (!cb) {
3008 radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C,
3009 S_028C70_FORMAT(V_028C70_COLOR_INVALID));
3010 continue;
3011 }
3012
3013 tex = (struct si_texture *)cb->base.texture;
3014 radeon_add_to_buffer_list(
3015 sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
3016 tex->buffer.b.b.nr_samples > 1 ? RADEON_PRIO_COLOR_BUFFER_MSAA : RADEON_PRIO_COLOR_BUFFER);
3017
3018 if (tex->cmask_buffer && tex->cmask_buffer != &tex->buffer) {
3019 radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->cmask_buffer, RADEON_USAGE_READWRITE,
3020 RADEON_PRIO_SEPARATE_META);
3021 }
3022
3023 if (tex->dcc_separate_buffer)
3024 radeon_add_to_buffer_list(sctx, sctx->gfx_cs, tex->dcc_separate_buffer,
3025 RADEON_USAGE_READWRITE, RADEON_PRIO_SEPARATE_META);
3026
3027 /* Compute mutable surface parameters. */
3028 cb_color_base = tex->buffer.gpu_address >> 8;
3029 cb_color_fmask = 0;
3030 cb_color_cmask = tex->cmask_base_address_reg;
3031 cb_dcc_base = 0;
3032 cb_color_info = cb->cb_color_info | tex->cb_color_info;
3033 cb_color_attrib = cb->cb_color_attrib;
3034
3035 if (cb->base.u.tex.level > 0)
3036 cb_color_info &= C_028C70_FAST_CLEAR;
3037
3038 if (tex->surface.fmask_offset) {
3039 cb_color_fmask = (tex->buffer.gpu_address + tex->surface.fmask_offset) >> 8;
3040 cb_color_fmask |= tex->surface.fmask_tile_swizzle;
3041 }
3042
3043 /* Set up DCC. */
3044 if (vi_dcc_enabled(tex, cb->base.u.tex.level)) {
3045 bool is_msaa_resolve_dst = state->cbufs[0] && state->cbufs[0]->texture->nr_samples > 1 &&
3046 state->cbufs[1] == &cb->base &&
3047 state->cbufs[1]->texture->nr_samples <= 1;
3048
3049 if (!is_msaa_resolve_dst)
3050 cb_color_info |= S_028C70_DCC_ENABLE(1);
3051
3052 cb_dcc_base =
3053 ((!tex->dcc_separate_buffer ? tex->buffer.gpu_address : 0) + tex->surface.dcc_offset) >>
3054 8;
3055
3056 unsigned dcc_tile_swizzle = tex->surface.tile_swizzle;
3057 dcc_tile_swizzle &= (tex->surface.dcc_alignment - 1) >> 8;
3058 cb_dcc_base |= dcc_tile_swizzle;
3059 }
3060
3061 if (sctx->chip_class >= GFX10) {
3062 unsigned cb_color_attrib3;
3063
3064 /* Set mutable surface parameters. */
3065 cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
3066 cb_color_base |= tex->surface.tile_swizzle;
3067 if (!tex->surface.fmask_offset)
3068 cb_color_fmask = cb_color_base;
3069 if (cb->base.u.tex.level > 0)
3070 cb_color_cmask = cb_color_base;
3071
3072 cb_color_attrib3 = cb->cb_color_attrib3 |
3073 S_028EE0_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
3074 S_028EE0_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
3075 S_028EE0_CMASK_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
3076 S_028EE0_DCC_PIPE_ALIGNED(tex->surface.u.gfx9.dcc.pipe_aligned);
3077
3078 radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 14);
3079 radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
3080 radeon_emit(cs, 0); /* hole */
3081 radeon_emit(cs, 0); /* hole */
3082 radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
3083 radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
3084 radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
3085 radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
3086 radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
3087 radeon_emit(cs, 0); /* hole */
3088 radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
3089 radeon_emit(cs, 0); /* hole */
3090 radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
3091 radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
3092 radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
3093
3094 radeon_set_context_reg(cs, R_028E40_CB_COLOR0_BASE_EXT + i * 4, cb_color_base >> 32);
3095 radeon_set_context_reg(cs, R_028E60_CB_COLOR0_CMASK_BASE_EXT + i * 4,
3096 cb_color_cmask >> 32);
3097 radeon_set_context_reg(cs, R_028E80_CB_COLOR0_FMASK_BASE_EXT + i * 4,
3098 cb_color_fmask >> 32);
3099 radeon_set_context_reg(cs, R_028EA0_CB_COLOR0_DCC_BASE_EXT + i * 4, cb_dcc_base >> 32);
3100 radeon_set_context_reg(cs, R_028EC0_CB_COLOR0_ATTRIB2 + i * 4, cb->cb_color_attrib2);
3101 radeon_set_context_reg(cs, R_028EE0_CB_COLOR0_ATTRIB3 + i * 4, cb_color_attrib3);
3102 } else if (sctx->chip_class == GFX9) {
3103 struct gfx9_surf_meta_flags meta;
3104
3105 if (tex->surface.dcc_offset)
3106 meta = tex->surface.u.gfx9.dcc;
3107 else
3108 meta = tex->surface.u.gfx9.cmask;
3109
3110 /* Set mutable surface parameters. */
3111 cb_color_base += tex->surface.u.gfx9.surf_offset >> 8;
3112 cb_color_base |= tex->surface.tile_swizzle;
3113 if (!tex->surface.fmask_offset)
3114 cb_color_fmask = cb_color_base;
3115 if (cb->base.u.tex.level > 0)
3116 cb_color_cmask = cb_color_base;
3117 cb_color_attrib |= S_028C74_COLOR_SW_MODE(tex->surface.u.gfx9.surf.swizzle_mode) |
3118 S_028C74_FMASK_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
3119 S_028C74_RB_ALIGNED(meta.rb_aligned) |
3120 S_028C74_PIPE_ALIGNED(meta.pipe_aligned);
3121
3122 radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, 15);
3123 radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
3124 radeon_emit(cs, S_028C64_BASE_256B(cb_color_base >> 32)); /* CB_COLOR0_BASE_EXT */
3125 radeon_emit(cs, cb->cb_color_attrib2); /* CB_COLOR0_ATTRIB2 */
3126 radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
3127 radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
3128 radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
3129 radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
3130 radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
3131 radeon_emit(cs, S_028C80_BASE_256B(cb_color_cmask >> 32)); /* CB_COLOR0_CMASK_BASE_EXT */
3132 radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
3133 radeon_emit(cs, S_028C88_BASE_256B(cb_color_fmask >> 32)); /* CB_COLOR0_FMASK_BASE_EXT */
3134 radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
3135 radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
3136 radeon_emit(cs, cb_dcc_base); /* CB_COLOR0_DCC_BASE */
3137 radeon_emit(cs, S_028C98_BASE_256B(cb_dcc_base >> 32)); /* CB_COLOR0_DCC_BASE_EXT */
3138
3139 radeon_set_context_reg(cs, R_0287A0_CB_MRT0_EPITCH + i * 4,
3140 S_0287A0_EPITCH(tex->surface.u.gfx9.surf.epitch));
3141 } else {
3142 /* Compute mutable surface parameters (GFX6-GFX8). */
3143 const struct legacy_surf_level *level_info =
3144 &tex->surface.u.legacy.level[cb->base.u.tex.level];
3145 unsigned pitch_tile_max, slice_tile_max, tile_mode_index;
3146 unsigned cb_color_pitch, cb_color_slice, cb_color_fmask_slice;
3147
3148 cb_color_base += level_info->offset >> 8;
3149 /* Only macrotiled modes can set tile swizzle. */
3150 if (level_info->mode == RADEON_SURF_MODE_2D)
3151 cb_color_base |= tex->surface.tile_swizzle;
3152
3153 if (!tex->surface.fmask_offset)
3154 cb_color_fmask = cb_color_base;
3155 if (cb->base.u.tex.level > 0)
3156 cb_color_cmask = cb_color_base;
3157 if (cb_dcc_base)
3158 cb_dcc_base += level_info->dcc_offset >> 8;
3159
3160 pitch_tile_max = level_info->nblk_x / 8 - 1;
3161 slice_tile_max = level_info->nblk_x * level_info->nblk_y / 64 - 1;
3162 tile_mode_index = si_tile_mode_index(tex, cb->base.u.tex.level, false);
3163
3164 cb_color_attrib |= S_028C74_TILE_MODE_INDEX(tile_mode_index);
3165 cb_color_pitch = S_028C64_TILE_MAX(pitch_tile_max);
3166 cb_color_slice = S_028C68_TILE_MAX(slice_tile_max);
3167
3168 if (tex->surface.fmask_offset) {
3169 if (sctx->chip_class >= GFX7)
3170 cb_color_pitch |=
3171 S_028C64_FMASK_TILE_MAX(tex->surface.u.legacy.fmask.pitch_in_pixels / 8 - 1);
3172 cb_color_attrib |=
3173 S_028C74_FMASK_TILE_MODE_INDEX(tex->surface.u.legacy.fmask.tiling_index);
3174 cb_color_fmask_slice = S_028C88_TILE_MAX(tex->surface.u.legacy.fmask.slice_tile_max);
3175 } else {
3176 /* This must be set for fast clear to work without FMASK. */
3177 if (sctx->chip_class >= GFX7)
3178 cb_color_pitch |= S_028C64_FMASK_TILE_MAX(pitch_tile_max);
3179 cb_color_attrib |= S_028C74_FMASK_TILE_MODE_INDEX(tile_mode_index);
3180 cb_color_fmask_slice = S_028C88_TILE_MAX(slice_tile_max);
3181 }
3182
3183 radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C,
3184 sctx->chip_class >= GFX8 ? 14 : 13);
3185 radeon_emit(cs, cb_color_base); /* CB_COLOR0_BASE */
3186 radeon_emit(cs, cb_color_pitch); /* CB_COLOR0_PITCH */
3187 radeon_emit(cs, cb_color_slice); /* CB_COLOR0_SLICE */
3188 radeon_emit(cs, cb->cb_color_view); /* CB_COLOR0_VIEW */
3189 radeon_emit(cs, cb_color_info); /* CB_COLOR0_INFO */
3190 radeon_emit(cs, cb_color_attrib); /* CB_COLOR0_ATTRIB */
3191 radeon_emit(cs, cb->cb_dcc_control); /* CB_COLOR0_DCC_CONTROL */
3192 radeon_emit(cs, cb_color_cmask); /* CB_COLOR0_CMASK */
3193 radeon_emit(cs, tex->surface.u.legacy.cmask_slice_tile_max); /* CB_COLOR0_CMASK_SLICE */
3194 radeon_emit(cs, cb_color_fmask); /* CB_COLOR0_FMASK */
3195 radeon_emit(cs, cb_color_fmask_slice); /* CB_COLOR0_FMASK_SLICE */
3196 radeon_emit(cs, tex->color_clear_value[0]); /* CB_COLOR0_CLEAR_WORD0 */
3197 radeon_emit(cs, tex->color_clear_value[1]); /* CB_COLOR0_CLEAR_WORD1 */
3198
3199 if (sctx->chip_class >= GFX8) /* R_028C94_CB_COLOR0_DCC_BASE */
3200 radeon_emit(cs, cb_dcc_base);
3201 }
3202 }
3203 for (; i < 8; i++)
3204 if (sctx->framebuffer.dirty_cbufs & (1 << i))
3205 radeon_set_context_reg(cs, R_028C70_CB_COLOR0_INFO + i * 0x3C, 0);
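/* Layout note, derived from the offsets used above: each colorbuffer owns
 * a 0x3C-byte (15-dword) block of CB context registers, so CB1 starts at
 * R_028C60 + 0x3C and so on; that is why both the per-CB register bursts
 * and the disable writes scale their offset by i * 0x3C.
 */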
3206
3207 /* ZS buffer. */
3208 if (state->zsbuf && sctx->framebuffer.dirty_zsbuf) {
3209 struct si_surface *zb = (struct si_surface *)state->zsbuf;
3210 struct si_texture *tex = (struct si_texture *)zb->base.texture;
3211
3212 radeon_add_to_buffer_list(sctx, sctx->gfx_cs, &tex->buffer, RADEON_USAGE_READWRITE,
3213 zb->base.texture->nr_samples > 1 ? RADEON_PRIO_DEPTH_BUFFER_MSAA
3214 : RADEON_PRIO_DEPTH_BUFFER);
3215
3216 if (sctx->chip_class >= GFX10) {
3217 radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
3218 radeon_set_context_reg(cs, R_02801C_DB_DEPTH_SIZE_XY, zb->db_depth_size);
3219
3220 radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 7);
3221 radeon_emit(cs, S_02803C_RESOURCE_LEVEL(1)); /* DB_DEPTH_INFO */
3222 radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
3223 S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
3224 radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
3225 radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
3226 radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
3227 radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
3228 radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
3229
3230 radeon_set_context_reg_seq(cs, R_028068_DB_Z_READ_BASE_HI, 5);
3231 radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_READ_BASE_HI */
3232 radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_READ_BASE_HI */
3233 radeon_emit(cs, zb->db_depth_base >> 32); /* DB_Z_WRITE_BASE_HI */
3234 radeon_emit(cs, zb->db_stencil_base >> 32); /* DB_STENCIL_WRITE_BASE_HI */
3235 radeon_emit(cs, zb->db_htile_data_base >> 32); /* DB_HTILE_DATA_BASE_HI */
3236 } else if (sctx->chip_class == GFX9) {
3237 radeon_set_context_reg_seq(cs, R_028014_DB_HTILE_DATA_BASE, 3);
3238 radeon_emit(cs, zb->db_htile_data_base); /* DB_HTILE_DATA_BASE */
3239 radeon_emit(cs,
3240 S_028018_BASE_HI(zb->db_htile_data_base >> 32)); /* DB_HTILE_DATA_BASE_HI */
3241 radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
3242
3243 radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 10);
3244 radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
3245 S_028038_ZRANGE_PRECISION(tex->depth_clear_value != 0));
3246 radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
3247 radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
3248 radeon_emit(cs, S_028044_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_READ_BASE_HI */
3249 radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
3250 radeon_emit(cs, S_02804C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_READ_BASE_HI */
3251 radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
3252 radeon_emit(cs, S_028054_BASE_HI(zb->db_depth_base >> 32)); /* DB_Z_WRITE_BASE_HI */
3253 radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
3254 radeon_emit(cs,
3255 S_02805C_BASE_HI(zb->db_stencil_base >> 32)); /* DB_STENCIL_WRITE_BASE_HI */
3256
3257 radeon_set_context_reg_seq(cs, R_028068_DB_Z_INFO2, 2);
3258 radeon_emit(cs, zb->db_z_info2); /* DB_Z_INFO2 */
3259 radeon_emit(cs, zb->db_stencil_info2); /* DB_STENCIL_INFO2 */
3260 } else {
3261 radeon_set_context_reg(cs, R_028014_DB_HTILE_DATA_BASE, zb->db_htile_data_base);
3262
3263 radeon_set_context_reg_seq(cs, R_02803C_DB_DEPTH_INFO, 9);
3264 radeon_emit(cs, zb->db_depth_info); /* DB_DEPTH_INFO */
3265 radeon_emit(cs, zb->db_z_info | /* DB_Z_INFO */
3266 S_028040_ZRANGE_PRECISION(tex->depth_clear_value != 0));
3267 radeon_emit(cs, zb->db_stencil_info); /* DB_STENCIL_INFO */
3268 radeon_emit(cs, zb->db_depth_base); /* DB_Z_READ_BASE */
3269 radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_READ_BASE */
3270 radeon_emit(cs, zb->db_depth_base); /* DB_Z_WRITE_BASE */
3271 radeon_emit(cs, zb->db_stencil_base); /* DB_STENCIL_WRITE_BASE */
3272 radeon_emit(cs, zb->db_depth_size); /* DB_DEPTH_SIZE */
3273 radeon_emit(cs, zb->db_depth_slice); /* DB_DEPTH_SLICE */
3274 }
3275
3276 radeon_set_context_reg_seq(cs, R_028028_DB_STENCIL_CLEAR, 2);
3277 radeon_emit(cs, tex->stencil_clear_value); /* R_028028_DB_STENCIL_CLEAR */
3278 radeon_emit(cs, fui(tex->depth_clear_value)); /* R_02802C_DB_DEPTH_CLEAR */
3279
3280 radeon_set_context_reg(cs, R_028008_DB_DEPTH_VIEW, zb->db_depth_view);
3281 radeon_set_context_reg(cs, R_028ABC_DB_HTILE_SURFACE, zb->db_htile_surface);
3282 } else if (sctx->framebuffer.dirty_zsbuf) {
3283 if (sctx->chip_class == GFX9)
3284 radeon_set_context_reg_seq(cs, R_028038_DB_Z_INFO, 2);
3285 else
3286 radeon_set_context_reg_seq(cs, R_028040_DB_Z_INFO, 2);
3287
3288 radeon_emit(cs, S_028040_FORMAT(V_028040_Z_INVALID)); /* DB_Z_INFO */
3289 radeon_emit(cs, S_028044_FORMAT(V_028044_STENCIL_INVALID)); /* DB_STENCIL_INFO */
3290 }
3291
3292 /* Framebuffer dimensions. */
3293 /* PA_SC_WINDOW_SCISSOR_TL is set in si_init_config() */
3294 radeon_set_context_reg(cs, R_028208_PA_SC_WINDOW_SCISSOR_BR,
3295 S_028208_BR_X(state->width) | S_028208_BR_Y(state->height));
3296
3297 if (sctx->screen->dfsm_allowed) {
3298 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
3299 radeon_emit(cs, EVENT_TYPE(V_028A90_BREAK_BATCH) | EVENT_INDEX(0));
3300 }
3301
3302 sctx->framebuffer.dirty_cbufs = 0;
3303 sctx->framebuffer.dirty_zsbuf = false;
3304 }
3305
3306 static void si_emit_msaa_sample_locs(struct si_context *sctx)
3307 {
3308 struct radeon_cmdbuf *cs = sctx->gfx_cs;
3309 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
3310 unsigned nr_samples = sctx->framebuffer.nr_samples;
3311 bool has_msaa_sample_loc_bug = sctx->screen->info.has_msaa_sample_loc_bug;
3312
3313 /* Smoothing (only possible with nr_samples == 1) uses the same
3314 * sample locations as the MSAA it simulates.
3315 */
3316 if (nr_samples <= 1 && sctx->smoothing_enabled)
3317 nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
3318
3319 /* On Polaris, the small primitive filter uses the sample locations
3320 * even when MSAA is off, so we need to make sure they're set to 0.
3321 *
3322 * GFX10 uses sample locations unconditionally, so they always need
3323 * to be set up.
3324 */
3325 if ((nr_samples >= 2 || has_msaa_sample_loc_bug || sctx->chip_class >= GFX10) &&
3326 nr_samples != sctx->sample_locs_num_samples) {
3327 sctx->sample_locs_num_samples = nr_samples;
3328 si_emit_sample_locations(cs, nr_samples);
3329 }
3330
3331 if (sctx->family >= CHIP_POLARIS10) {
3332 unsigned small_prim_filter_cntl =
3333 S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
3334 /* line bug */
3335 S_028830_LINE_FILTER_DISABLE(sctx->family <= CHIP_POLARIS12);
3336
3337 /* The alternative of setting sample locations to 0 would
3338 * require a DB flush to avoid Z errors, see
3339 * https://bugs.freedesktop.org/show_bug.cgi?id=96908
3340 */
3341 if (has_msaa_sample_loc_bug && sctx->framebuffer.nr_samples > 1 && !rs->multisample_enable)
3342 small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
3343
3344 radeon_opt_set_context_reg(sctx, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
3345 SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL, small_prim_filter_cntl);
3346 }
3347
3348 /* The exclusion bits can be set to improve rasterization efficiency
3349 * if no sample lies on the pixel boundary (-8 sample offset).
3350 */
3351 bool exclusion = sctx->chip_class >= GFX7 && (!rs->multisample_enable || nr_samples != 16);
3352 radeon_opt_set_context_reg(
3353 sctx, R_02882C_PA_SU_PRIM_FILTER_CNTL, SI_TRACKED_PA_SU_PRIM_FILTER_CNTL,
3354 S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) | S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
3355 }
3356
3357 static bool si_out_of_order_rasterization(struct si_context *sctx)
3358 {
3359 struct si_state_blend *blend = sctx->queued.named.blend;
3360 struct si_state_dsa *dsa = sctx->queued.named.dsa;
3361
3362 if (!sctx->screen->has_out_of_order_rast)
3363 return false;
3364
3365 unsigned colormask = sctx->framebuffer.colorbuf_enabled_4bit;
3366
3367 colormask &= blend->cb_target_enabled_4bit;
3368
3369 /* Conservative: No logic op. */
3370 if (colormask && blend->logicop_enable)
3371 return false;
3372
3373 struct si_dsa_order_invariance dsa_order_invariant = {.zs = true,
3374 .pass_set = true,
3375 .pass_last = false};
3376
3377 if (sctx->framebuffer.state.zsbuf) {
3378 struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
3379 bool has_stencil = zstex->surface.has_stencil;
3380 dsa_order_invariant = dsa->order_invariance[has_stencil];
3381 if (!dsa_order_invariant.zs)
3382 return false;
3383
3384 /* The set of PS invocations is always order invariant,
3385 * except when early Z/S tests are requested. */
3386 if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.writes_memory &&
3387 sctx->ps_shader.cso->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL] &&
3388 !dsa_order_invariant.pass_set)
3389 return false;
3390
3391 if (sctx->num_perfect_occlusion_queries != 0 && !dsa_order_invariant.pass_set)
3392 return false;
3393 }
3394
3395 if (!colormask)
3396 return true;
3397
3398 unsigned blendmask = colormask & blend->blend_enable_4bit;
3399
3400 if (blendmask) {
3401 /* Only commutative blending. */
3402 if (blendmask & ~blend->commutative_4bit)
3403 return false;
3404
3405 if (!dsa_order_invariant.pass_set)
3406 return false;
3407 }
3408
3409 if (colormask & ~blendmask) {
3410 if (!dsa_order_invariant.pass_last)
3411 return false;
3412 }
3413
3414 return true;
3415 }
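/* An informal summary of si_out_of_order_rasterization, read off the code
 * above rather than from hardware docs: out-of-order rasterization is
 * allowed only when no logic op touches an enabled color target, Z/S
 * results are order-invariant for the bound DSA state, PS side effects
 * under requested early Z/S and precise occlusion queries are covered by
 * pass_set invariance, every blended target blends commutatively (also
 * requiring pass_set), and every unblended color write is covered by
 * pass_last invariance.
 */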
3416
3417 static void si_emit_msaa_config(struct si_context *sctx)
3418 {
3419 struct radeon_cmdbuf *cs = sctx->gfx_cs;
3420 unsigned num_tile_pipes = sctx->screen->info.num_tile_pipes;
3421 /* 33% faster rendering to linear color buffers */
3422 bool dst_is_linear = sctx->framebuffer.any_dst_linear;
3423 bool out_of_order_rast = si_out_of_order_rasterization(sctx);
3424 unsigned sc_mode_cntl_1 =
3425 S_028A4C_WALK_SIZE(dst_is_linear) | S_028A4C_WALK_FENCE_ENABLE(!dst_is_linear) |
3426 S_028A4C_WALK_FENCE_SIZE(num_tile_pipes == 2 ? 2 : 3) |
3427 S_028A4C_OUT_OF_ORDER_PRIMITIVE_ENABLE(out_of_order_rast) |
3428 S_028A4C_OUT_OF_ORDER_WATER_MARK(0x7) |
3429 /* always 1: */
3430 S_028A4C_WALK_ALIGN8_PRIM_FITS_ST(1) | S_028A4C_SUPERTILE_WALK_ORDER_ENABLE(1) |
3431 S_028A4C_TILE_WALK_ORDER_ENABLE(1) | S_028A4C_MULTI_SHADER_ENGINE_PRIM_DISCARD_ENABLE(1) |
3432 S_028A4C_FORCE_EOV_CNTDWN_ENABLE(1) | S_028A4C_FORCE_EOV_REZ_ENABLE(1);
3433 unsigned db_eqaa = S_028804_HIGH_QUALITY_INTERSECTIONS(1) | S_028804_INCOHERENT_EQAA_READS(1) |
3434 S_028804_INTERPOLATE_COMP_Z(1) | S_028804_STATIC_ANCHOR_ASSOCIATIONS(1);
3435 unsigned coverage_samples, color_samples, z_samples;
3436 struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
3437
3438 /* S: Coverage samples (up to 16x):
3439 * - Scan conversion samples (PA_SC_AA_CONFIG.MSAA_NUM_SAMPLES)
3440 * - CB FMASK samples (CB_COLORi_ATTRIB.NUM_SAMPLES)
3441 *
3442 * Z: Z/S samples (up to 8x, must be <= coverage samples and >= color samples):
3443 * - Value seen by DB (DB_Z_INFO.NUM_SAMPLES)
3444 * - Value seen by CB, must be correct even if Z/S is unbound (DB_EQAA.MAX_ANCHOR_SAMPLES)
3445 * # Missing samples are derived from Z planes if Z is compressed (up to 16x quality), or
3446 * # from the closest defined sample if Z is uncompressed (same quality as the number of
3447 * # Z samples).
3448 *
3449 * F: Color samples (up to 8x, must be <= coverage samples):
3450 * - CB color samples (CB_COLORi_ATTRIB.NUM_FRAGMENTS)
3451 * - PS iter samples (DB_EQAA.PS_ITER_SAMPLES)
3452 *
3453 * Can be anything between coverage and color samples:
3454 * - SampleMaskIn samples (PA_SC_AA_CONFIG.MSAA_EXPOSED_SAMPLES)
3455 * - SampleMaskOut samples (DB_EQAA.MASK_EXPORT_NUM_SAMPLES)
3456 * - Alpha-to-coverage samples (DB_EQAA.ALPHA_TO_MASK_NUM_SAMPLES)
3457 * - Occlusion query samples (DB_COUNT_CONTROL.SAMPLE_RATE)
3458 * # All are currently set the same as coverage samples.
3459 *
3460 * If color samples < coverage samples, FMASK has a higher bpp to store an "unknown"
3461 * flag for undefined color samples. A shader-based resolve must handle unknowns
3462 * or mask them out with AND. Unknowns can also be guessed from neighbors via
3463 * an edge-detect shader-based resolve, which is required to make "color samples = 1"
3464 * useful. The CB resolve always drops unknowns.
3465 *
3466 * Sensible AA configurations:
3467 * EQAA 16s 8z 8f - might look the same as 16x MSAA if Z is compressed
3468 * EQAA 16s 8z 4f - might look the same as 16x MSAA if Z is compressed
3469 * EQAA 16s 4z 4f - might look the same as 16x MSAA if Z is compressed
3470 * EQAA 8s 8z 8f = 8x MSAA
3471 * EQAA 8s 8z 4f - might look the same as 8x MSAA
3472 * EQAA 8s 8z 2f - might look the same as 8x MSAA with low-density geometry
3473 * EQAA 8s 4z 4f - might look the same as 8x MSAA if Z is compressed
3474 * EQAA 8s 4z 2f - might look the same as 8x MSAA with low-density geometry if Z is compressed
3475 * EQAA 4s 4z 4f = 4x MSAA
3476 * EQAA 4s 4z 2f - might look the same as 4x MSAA with low-density geometry
3477 * EQAA 2s 2z 2f = 2x MSAA
3478 */
3479 if (sctx->framebuffer.nr_samples > 1 && rs->multisample_enable) {
3480 coverage_samples = sctx->framebuffer.nr_samples;
3481 color_samples = sctx->framebuffer.nr_color_samples;
3482
3483 if (sctx->framebuffer.state.zsbuf) {
3484 z_samples = sctx->framebuffer.state.zsbuf->texture->nr_samples;
3485 z_samples = MAX2(1, z_samples);
3486 } else {
3487 z_samples = coverage_samples;
3488 }
3489 } else if (sctx->smoothing_enabled) {
3490 coverage_samples = color_samples = z_samples = SI_NUM_SMOOTH_AA_SAMPLES;
3491 } else {
3492 coverage_samples = color_samples = z_samples = 1;
3493 }
3494
3495 /* Required by OpenGL line rasterization.
3496 *
3497 * TODO: We should also enable perpendicular endcaps for AA lines,
3498 * but that requires implementing line stippling in the pixel
3499 * shader. SC can only do line stippling with axis-aligned
3500 * endcaps.
3501 */
3502 unsigned sc_line_cntl = S_028BDC_DX10_DIAMOND_TEST_ENA(1);
3503 unsigned sc_aa_config = 0;
3504
3505 if (coverage_samples > 1) {
3506 /* distance from the pixel center, indexed by log2(nr_samples) */
3507 static unsigned max_dist[] = {
3508 0, /* unused */
3509 4, /* 2x MSAA */
3510 6, /* 4x MSAA */
3511 7, /* 8x MSAA */
3512 8, /* 16x MSAA */
3513 };
3514 unsigned log_samples = util_logbase2(coverage_samples);
3515 unsigned log_z_samples = util_logbase2(z_samples);
3516 unsigned ps_iter_samples = si_get_ps_iter_samples(sctx);
3517 unsigned log_ps_iter_samples = util_logbase2(ps_iter_samples);
3518
3519 sc_line_cntl |= S_028BDC_EXPAND_LINE_WIDTH(1);
3520 sc_aa_config = S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
3521 S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
3522 S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples);
3523
3524 if (sctx->framebuffer.nr_samples > 1) {
3525 db_eqaa |= S_028804_MAX_ANCHOR_SAMPLES(log_z_samples) |
3526 S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
3527 S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
3528 S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples);
3529 sc_mode_cntl_1 |= S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1);
3530 } else if (sctx->smoothing_enabled) {
3531 db_eqaa |= S_028804_OVERRASTERIZATION_AMOUNT(log_samples);
3532 }
3533 }
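/* Worked example for the block above (one hypothetical configuration):
 * EQAA 8s 4z with ps_iter_samples == 1 gives log_samples = 3, so
 * MSAA_NUM_SAMPLES = 3 (8x), MAX_SAMPLE_DIST = max_dist[3] = 7,
 * MAX_ANCHOR_SAMPLES = 2 (4 Z samples), PS_ITER_SAMPLES = 0, and the
 * mask export / alpha-to-coverage rates follow the 8x coverage rate.
 */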
3534
3535 unsigned initial_cdw = cs->current.cdw;
3536
3537 /* R_028BDC_PA_SC_LINE_CNTL, R_028BE0_PA_SC_AA_CONFIG */
3538 radeon_opt_set_context_reg2(sctx, R_028BDC_PA_SC_LINE_CNTL, SI_TRACKED_PA_SC_LINE_CNTL,
3539 sc_line_cntl, sc_aa_config);
3540 /* R_028804_DB_EQAA */
3541 radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA, db_eqaa);
3542 /* R_028A4C_PA_SC_MODE_CNTL_1 */
3543 radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1, SI_TRACKED_PA_SC_MODE_CNTL_1,
3544 sc_mode_cntl_1);
3545
3546 if (initial_cdw != cs->current.cdw) {
3547 sctx->context_roll = true;
3548
3549 /* GFX9: Flush DFSM when the AA mode changes. */
3550 if (sctx->screen->dfsm_allowed) {
3551 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
3552 radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
3553 }
3554 }
3555 }
3556
3557 void si_update_ps_iter_samples(struct si_context *sctx)
3558 {
3559 if (sctx->framebuffer.nr_samples > 1)
3560 si_mark_atom_dirty(sctx, &sctx->atoms.s.msaa_config);
3561 if (sctx->screen->dpbb_allowed)
3562 si_mark_atom_dirty(sctx, &sctx->atoms.s.dpbb_state);
3563 }
3564
3565 static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples)
3566 {
3567 struct si_context *sctx = (struct si_context *)ctx;
3568
3569 /* The hardware can only do sample shading with 2^n samples. */
3570 min_samples = util_next_power_of_two(min_samples);
3571
3572 if (sctx->ps_iter_samples == min_samples)
3573 return;
3574
3575 sctx->ps_iter_samples = min_samples;
3576 sctx->do_update_shaders = true;
3577
3578 si_update_ps_iter_samples(sctx);
3579 }
3580
3581 /*
3582 * Samplers
3583 */
3584
3585 /**
3586 * Build the sampler view descriptor for a buffer texture.
3587 * @param state 256-bit descriptor; only the high 128 bits are filled in
3588 */
3589 void si_make_buffer_descriptor(struct si_screen *screen, struct si_resource *buf,
3590 enum pipe_format format, unsigned offset, unsigned size,
3591 uint32_t *state)
3592 {
3593 const struct util_format_description *desc;
3594 unsigned stride;
3595 unsigned num_records;
3596
3597 desc = util_format_description(format);
3598 stride = desc->block.bits / 8;
3599
3600 num_records = size / stride;
3601 num_records = MIN2(num_records, (buf->b.b.width0 - offset) / stride);
3602
3603 /* The NUM_RECORDS field has a different meaning depending on the chip,
3604 * instruction type, STRIDE, and SWIZZLE_ENABLE.
3605 *
3606 * GFX6-7,10:
3607 * - If STRIDE == 0, it's in byte units.
3608 * - If STRIDE != 0, it's in units of STRIDE, used with inst.IDXEN.
3609 *
3610 * GFX8:
3611 * - For SMEM and STRIDE == 0, it's in byte units.
3612 * - For SMEM and STRIDE != 0, it's in units of STRIDE.
3613 * - For VMEM and STRIDE == 0 or SWIZZLE_ENABLE == 0, it's in byte units.
3614 * - For VMEM and STRIDE != 0 and SWIZZLE_ENABLE == 1, it's in units of STRIDE.
3615 * NOTE: There is an incompatibility between VMEM and SMEM opcodes due to
3616 * SWIZZLE_ENABLE. The workaround is to set STRIDE = 0 if SWIZZLE_ENABLE == 0
3617 * when using SMEM. This can be done in the shader by clearing STRIDE with s_and.
3618 * That way the same descriptor can be used by both SMEM and VMEM.
3619 *
3620 * GFX9:
3621 * - For SMEM and STRIDE == 0, it's in byte units.
3622 * - For SMEM and STRIDE != 0, it's in units of STRIDE.
3623 * - For VMEM and inst.IDXEN == 0 or STRIDE == 0, it's in byte units.
3624 * - For VMEM and inst.IDXEN == 1 and STRIDE != 0, it's in units of STRIDE.
3625 */
3626 if (screen->info.chip_class == GFX8)
3627 num_records *= stride;
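/* Worked example (hypothetical sizes): a 64 KiB buffer viewed as
 * R32G32B32A32_FLOAT has stride = 16 and num_records = 4096 elements;
 * on GFX8 the multiplication above turns that into 4096 * 16 = 65536,
 * i.e. NUM_RECORDS in byte units, per the table above.
 */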
3628
3629 state[4] = 0;
3630 state[5] = S_008F04_STRIDE(stride);
3631 state[6] = num_records;
3632 state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
3633 S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
3634 S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
3635 S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
3636
3637 if (screen->info.chip_class >= GFX10) {
3638 const struct gfx10_format *fmt = &gfx10_format_table[format];
3639
3640 /* OOB_SELECT chooses the out-of-bounds check:
3641 * - 0: (index >= NUM_RECORDS) || (offset >= STRIDE)
3642 * - 1: index >= NUM_RECORDS
3643 * - 2: NUM_RECORDS == 0
3644 * - 3: if SWIZZLE_ENABLE == 0: offset >= NUM_RECORDS
3645 * else: swizzle_address >= NUM_RECORDS
3646 */
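/* E.g. under the STRUCTURED_WITH_OFFSET check chosen below -- reading it
 * as the "(index >= NUM_RECORDS) || (offset >= STRIDE)" mode from the
 * table above -- NUM_RECORDS == 4096 and STRIDE == 16 (hypothetical
 * values) would reject index 4096 as well as an in-element offset of 16.
 */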
3647 state[7] |= S_008F0C_FORMAT(fmt->img_format) |
3648 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_STRUCTURED_WITH_OFFSET) |
3649 S_008F0C_RESOURCE_LEVEL(1);
3650 } else {
3651 int first_non_void;
3652 unsigned num_format, data_format;
3653
3654 first_non_void = util_format_get_first_non_void_channel(format);
3655 num_format = si_translate_buffer_numformat(&screen->b, desc, first_non_void);
3656 data_format = si_translate_buffer_dataformat(&screen->b, desc, first_non_void);
3657
3658 state[7] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
3659 }
3660 }
3661
3662 static unsigned gfx9_border_color_swizzle(const unsigned char swizzle[4])
3663 {
3664 unsigned bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
3665
3666 if (swizzle[3] == PIPE_SWIZZLE_X) {
3667 /* For the pre-defined border color values (white, opaque
3668 * black, transparent black), the only thing that matters is
3669 * that the alpha channel winds up in the correct place
3670 * (because the RGB channels are all the same) so either of
3671 * these enumerations will work.
3672 */
3673 if (swizzle[2] == PIPE_SWIZZLE_Y)
3674 bc_swizzle = V_008F20_BC_SWIZZLE_WZYX;
3675 else
3676 bc_swizzle = V_008F20_BC_SWIZZLE_WXYZ;
3677 } else if (swizzle[0] == PIPE_SWIZZLE_X) {
3678 if (swizzle[1] == PIPE_SWIZZLE_Y)
3679 bc_swizzle = V_008F20_BC_SWIZZLE_XYZW;
3680 else
3681 bc_swizzle = V_008F20_BC_SWIZZLE_XWYZ;
3682 } else if (swizzle[1] == PIPE_SWIZZLE_X) {
3683 bc_swizzle = V_008F20_BC_SWIZZLE_YXWZ;
3684 } else if (swizzle[2] == PIPE_SWIZZLE_X) {
3685 bc_swizzle = V_008F20_BC_SWIZZLE_ZYXW;
3686 }
3687
3688 return bc_swizzle;
3689 }
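/* Example trace of the selection above: an alpha-first layout with
 * desc->swizzle = {Y, Z, W, X} takes the first branch (swizzle[3] == X);
 * since swizzle[2] is W rather than Y, the result is BC_SWIZZLE_WXYZ.
 */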
3690
3691 /**
3692 * Build the sampler view descriptor for a texture.
3693 */
3694 static void gfx10_make_texture_descriptor(
3695 struct si_screen *screen, struct si_texture *tex, bool sampler, enum pipe_texture_target target,
3696 enum pipe_format pipe_format, const unsigned char state_swizzle[4], unsigned first_level,
3697 unsigned last_level, unsigned first_layer, unsigned last_layer, unsigned width, unsigned height,
3698 unsigned depth, uint32_t *state, uint32_t *fmask_state)
3699 {
3700 struct pipe_resource *res = &tex->buffer.b.b;
3701 const struct util_format_description *desc;
3702 unsigned img_format;
3703 unsigned char swizzle[4];
3704 unsigned type;
3705 uint64_t va;
3706
3707 desc = util_format_description(pipe_format);
3708 img_format = gfx10_format_table[pipe_format].img_format;
3709
3710 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
3711 const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
3712 const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
3713 const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
3714 bool is_stencil = false;
3715
3716 switch (pipe_format) {
3717 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
3718 case PIPE_FORMAT_X32_S8X24_UINT:
3719 case PIPE_FORMAT_X8Z24_UNORM:
3720 util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
3721 is_stencil = true;
3722 break;
3723 case PIPE_FORMAT_X24S8_UINT:
3724 /*
3725 * X24S8 is implemented as an 8_8_8_8 data format, to
3726 * fix texture gathers. This affects at least
3727 * GL45-CTS.texture_cube_map_array.sampling on GFX8.
3728 */
3729 util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
3730 is_stencil = true;
3731 break;
3732 default:
3733 util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
3734 is_stencil = pipe_format == PIPE_FORMAT_S8_UINT;
3735 }
3736
3737 if (tex->upgraded_depth && !is_stencil) {
3738 assert(img_format == V_008F0C_IMG_FORMAT_32_FLOAT);
3739 img_format = V_008F0C_IMG_FORMAT_32_FLOAT_CLAMP;
3740 }
3741 } else {
3742 util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
3743 }
3744
3745 if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY)) {
3746 /* For the purpose of shader images, treat cube maps as 2D
3747 * arrays.
3748 */
3749 type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
3750 } else {
3751 type = si_tex_dim(screen, tex, target, res->nr_samples);
3752 }
3753
3754 if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
3755 height = 1;
3756 depth = res->array_size;
3757 } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
3758 if (sampler || res->target != PIPE_TEXTURE_3D)
3759 depth = res->array_size;
3760 } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
3761 depth = res->array_size / 6;
3762
3763 state[0] = 0;
3764 state[1] = S_00A004_FORMAT(img_format) | S_00A004_WIDTH_LO(width - 1);
3765 state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
3766 S_00A008_RESOURCE_LEVEL(1);
3767 state[3] =
3768 S_00A00C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
3769 S_00A00C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
3770 S_00A00C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
3771 S_00A00C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
3772 S_00A00C_BASE_LEVEL(res->nr_samples > 1 ? 0 : first_level) |
3773 S_00A00C_LAST_LEVEL(res->nr_samples > 1 ? util_logbase2(res->nr_samples) : last_level) |
3774 S_00A00C_BC_SWIZZLE(gfx9_border_color_swizzle(desc->swizzle)) | S_00A00C_TYPE(type);
3775 /* Depth is the last accessible layer on gfx9+. The hw doesn't need
3776 * to know the total number of layers.
3777 */
3778 state[4] =
3779 S_00A010_DEPTH((type == V_008F1C_SQ_RSRC_IMG_3D && sampler) ? depth - 1 : last_layer) |
3780 S_00A010_BASE_ARRAY(first_layer);
3781 state[5] = S_00A014_ARRAY_PITCH(!!(type == V_008F1C_SQ_RSRC_IMG_3D && !sampler)) |
3782 S_00A014_MAX_MIP(res->nr_samples > 1 ? util_logbase2(res->nr_samples)
3783 : tex->buffer.b.b.last_level) |
3784 S_00A014_PERF_MOD(4);
3785 state[6] = 0;
3786 state[7] = 0;
3787
3788 if (tex->surface.dcc_offset) {
3789 state[6] |= S_00A018_MAX_UNCOMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_256B) |
3790 S_00A018_MAX_COMPRESSED_BLOCK_SIZE(V_028C78_MAX_BLOCK_SIZE_128B) |
3791 S_00A018_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
3792 }
3793
3794 /* Initialize the sampler view for FMASK. */
3795 if (tex->surface.fmask_offset) {
3796 uint32_t format;
3797
3798 va = tex->buffer.gpu_address + tex->surface.fmask_offset;
3799
3800 #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
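/* FMASK(s, f) packs the (samples, fragments) pair into one switchable
 * key: e.g. FMASK(8, 4) = 8 * 16 + 4 = 132, which the case below maps
 * to V_008F0C_IMG_FORMAT_FMASK32_S8_F4 (a 32-bit-per-pixel FMASK, going
 * by the name).
 */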
3801 switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
3802 case FMASK(2, 1):
3803 format = V_008F0C_IMG_FORMAT_FMASK8_S2_F1;
3804 break;
3805 case FMASK(2, 2):
3806 format = V_008F0C_IMG_FORMAT_FMASK8_S2_F2;
3807 break;
3808 case FMASK(4, 1):
3809 format = V_008F0C_IMG_FORMAT_FMASK8_S4_F1;
3810 break;
3811 case FMASK(4, 2):
3812 format = V_008F0C_IMG_FORMAT_FMASK8_S4_F2;
3813 break;
3814 case FMASK(4, 4):
3815 format = V_008F0C_IMG_FORMAT_FMASK8_S4_F4;
3816 break;
3817 case FMASK(8, 1):
3818 format = V_008F0C_IMG_FORMAT_FMASK8_S8_F1;
3819 break;
3820 case FMASK(8, 2):
3821 format = V_008F0C_IMG_FORMAT_FMASK16_S8_F2;
3822 break;
3823 case FMASK(8, 4):
3824 format = V_008F0C_IMG_FORMAT_FMASK32_S8_F4;
3825 break;
3826 case FMASK(8, 8):
3827 format = V_008F0C_IMG_FORMAT_FMASK32_S8_F8;
3828 break;
3829 case FMASK(16, 1):
3830 format = V_008F0C_IMG_FORMAT_FMASK16_S16_F1;
3831 break;
3832 case FMASK(16, 2):
3833 format = V_008F0C_IMG_FORMAT_FMASK32_S16_F2;
3834 break;
3835 case FMASK(16, 4):
3836 format = V_008F0C_IMG_FORMAT_FMASK64_S16_F4;
3837 break;
3838 case FMASK(16, 8):
3839 format = V_008F0C_IMG_FORMAT_FMASK64_S16_F8;
3840 break;
3841 default:
3842 unreachable("invalid nr_samples");
3843 }
3844 #undef FMASK
3845 fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
3846 fmask_state[1] = S_00A004_BASE_ADDRESS_HI(va >> 40) | S_00A004_FORMAT(format) |
3847 S_00A004_WIDTH_LO(width - 1);
3848 fmask_state[2] = S_00A008_WIDTH_HI((width - 1) >> 2) | S_00A008_HEIGHT(height - 1) |
3849 S_00A008_RESOURCE_LEVEL(1);
3850 fmask_state[3] =
3851 S_00A00C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
3852 S_00A00C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_00A00C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
3853 S_00A00C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode) |
3854 S_00A00C_TYPE(si_tex_dim(screen, tex, target, 0));
3855 fmask_state[4] = S_00A010_DEPTH(last_layer) | S_00A010_BASE_ARRAY(first_layer);
3856 fmask_state[5] = 0;
3857 fmask_state[6] = S_00A018_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned);
3858 fmask_state[7] = 0;
3859 }
3860 }
3861
3862 /**
3863 * Build the sampler view descriptor for a texture (SI-GFX9).
3864 */
3865 static void si_make_texture_descriptor(struct si_screen *screen, struct si_texture *tex,
3866 bool sampler, enum pipe_texture_target target,
3867 enum pipe_format pipe_format,
3868 const unsigned char state_swizzle[4], unsigned first_level,
3869 unsigned last_level, unsigned first_layer,
3870 unsigned last_layer, unsigned width, unsigned height,
3871 unsigned depth, uint32_t *state, uint32_t *fmask_state)
3872 {
3873 struct pipe_resource *res = &tex->buffer.b.b;
3874 const struct util_format_description *desc;
3875 unsigned char swizzle[4];
3876 int first_non_void;
3877 unsigned num_format, data_format, type, num_samples;
3878 uint64_t va;
3879
3880 desc = util_format_description(pipe_format);
3881
3882 num_samples = desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS ? MAX2(1, res->nr_samples)
3883 : MAX2(1, res->nr_storage_samples);
3884
3885 if (desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) {
3886 const unsigned char swizzle_xxxx[4] = {0, 0, 0, 0};
3887 const unsigned char swizzle_yyyy[4] = {1, 1, 1, 1};
3888 const unsigned char swizzle_wwww[4] = {3, 3, 3, 3};
3889
3890 switch (pipe_format) {
3891 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
3892 case PIPE_FORMAT_X32_S8X24_UINT:
3893 case PIPE_FORMAT_X8Z24_UNORM:
3894 util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
3895 break;
3896 case PIPE_FORMAT_X24S8_UINT:
3897 /*
3898 * X24S8 is implemented as an 8_8_8_8 data format, to
3899 * fix texture gathers. This affects at least
3900 * GL45-CTS.texture_cube_map_array.sampling on GFX8.
3901 */
3902 if (screen->info.chip_class <= GFX8)
3903 util_format_compose_swizzles(swizzle_wwww, state_swizzle, swizzle);
3904 else
3905 util_format_compose_swizzles(swizzle_yyyy, state_swizzle, swizzle);
3906 break;
3907 default:
3908 util_format_compose_swizzles(swizzle_xxxx, state_swizzle, swizzle);
3909 }
3910 } else {
3911 util_format_compose_swizzles(desc->swizzle, state_swizzle, swizzle);
3912 }
3913
3914 first_non_void = util_format_get_first_non_void_channel(pipe_format);
3915
3916 switch (pipe_format) {
3917 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
3918 num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3919 break;
3920 default:
3921 if (first_non_void < 0) {
3922 if (util_format_is_compressed(pipe_format)) {
3923 switch (pipe_format) {
3924 case PIPE_FORMAT_DXT1_SRGB:
3925 case PIPE_FORMAT_DXT1_SRGBA:
3926 case PIPE_FORMAT_DXT3_SRGBA:
3927 case PIPE_FORMAT_DXT5_SRGBA:
3928 case PIPE_FORMAT_BPTC_SRGBA:
3929 case PIPE_FORMAT_ETC2_SRGB8:
3930 case PIPE_FORMAT_ETC2_SRGB8A1:
3931 case PIPE_FORMAT_ETC2_SRGBA8:
3932 num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
3933 break;
3934 case PIPE_FORMAT_RGTC1_SNORM:
3935 case PIPE_FORMAT_LATC1_SNORM:
3936 case PIPE_FORMAT_RGTC2_SNORM:
3937 case PIPE_FORMAT_LATC2_SNORM:
3938 case PIPE_FORMAT_ETC2_R11_SNORM:
3939 case PIPE_FORMAT_ETC2_RG11_SNORM:
3940 /* implies float, so use SNORM/UNORM to determine
3941 * whether data is signed or not */
3942 case PIPE_FORMAT_BPTC_RGB_FLOAT:
3943 num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
3944 break;
3945 default:
3946 num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3947 break;
3948 }
3949 } else if (desc->layout == UTIL_FORMAT_LAYOUT_SUBSAMPLED) {
3950 num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3951 } else {
3952 num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
3953 }
3954 } else if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
3955 num_format = V_008F14_IMG_NUM_FORMAT_SRGB;
3956 } else {
3957 num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3958
3959 switch (desc->channel[first_non_void].type) {
3960 case UTIL_FORMAT_TYPE_FLOAT:
3961 num_format = V_008F14_IMG_NUM_FORMAT_FLOAT;
3962 break;
3963 case UTIL_FORMAT_TYPE_SIGNED:
3964 if (desc->channel[first_non_void].normalized)
3965 num_format = V_008F14_IMG_NUM_FORMAT_SNORM;
3966 else if (desc->channel[first_non_void].pure_integer)
3967 num_format = V_008F14_IMG_NUM_FORMAT_SINT;
3968 else
3969 num_format = V_008F14_IMG_NUM_FORMAT_SSCALED;
3970 break;
3971 case UTIL_FORMAT_TYPE_UNSIGNED:
3972 if (desc->channel[first_non_void].normalized)
3973 num_format = V_008F14_IMG_NUM_FORMAT_UNORM;
3974 else if (desc->channel[first_non_void].pure_integer)
3975 num_format = V_008F14_IMG_NUM_FORMAT_UINT;
3976 else
3977 num_format = V_008F14_IMG_NUM_FORMAT_USCALED;
3978 }
3979 }
3980 }
3981
3982 data_format = si_translate_texformat(&screen->b, pipe_format, desc, first_non_void);
3983 if (data_format == ~0) {
3984 data_format = 0;
3985 }
3986
3987 /* S8 with Z32 HTILE needs a special format. */
3988 if (screen->info.chip_class == GFX9 && pipe_format == PIPE_FORMAT_S8_UINT &&
3989 tex->tc_compatible_htile)
3990 data_format = V_008F14_IMG_DATA_FORMAT_S8_32;
3991
3992 if (!sampler && (res->target == PIPE_TEXTURE_CUBE || res->target == PIPE_TEXTURE_CUBE_ARRAY ||
3993 (screen->info.chip_class <= GFX8 && res->target == PIPE_TEXTURE_3D))) {
3994 /* For the purpose of shader images, treat cube maps and 3D
3995 * textures as 2D arrays. For 3D textures, the address
3996 * calculations for mipmaps are different, so we rely on the
3997 * caller to effectively disable mipmaps.
3998 */
3999 type = V_008F1C_SQ_RSRC_IMG_2D_ARRAY;
4000
4001 assert(res->target != PIPE_TEXTURE_3D || (first_level == 0 && last_level == 0));
4002 } else {
4003 type = si_tex_dim(screen, tex, target, num_samples);
4004 }
4005
4006 if (type == V_008F1C_SQ_RSRC_IMG_1D_ARRAY) {
4007 height = 1;
4008 depth = res->array_size;
4009 } else if (type == V_008F1C_SQ_RSRC_IMG_2D_ARRAY || type == V_008F1C_SQ_RSRC_IMG_2D_MSAA_ARRAY) {
4010 if (sampler || res->target != PIPE_TEXTURE_3D)
4011 depth = res->array_size;
4012 } else if (type == V_008F1C_SQ_RSRC_IMG_CUBE)
4013 depth = res->array_size / 6;
4014
4015 state[0] = 0;
4016 state[1] = (S_008F14_DATA_FORMAT(data_format) | S_008F14_NUM_FORMAT(num_format));
4017 state[2] = (S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1) | S_008F18_PERF_MOD(4));
4018 state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
4019 S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
4020 S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
4021 S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
4022 S_008F1C_BASE_LEVEL(num_samples > 1 ? 0 : first_level) |
4023 S_008F1C_LAST_LEVEL(num_samples > 1 ? util_logbase2(num_samples) : last_level) |
4024 S_008F1C_TYPE(type));
4025 state[4] = 0;
4026 state[5] = S_008F24_BASE_ARRAY(first_layer);
4027 state[6] = 0;
4028 state[7] = 0;
4029
4030 if (screen->info.chip_class == GFX9) {
4031 unsigned bc_swizzle = gfx9_border_color_swizzle(desc->swizzle);
4032
4033 /* Depth is the last accessible layer on Gfx9.
4034 * The hw doesn't need to know the total number of layers.
4035 */
4036 if (type == V_008F1C_SQ_RSRC_IMG_3D)
4037 state[4] |= S_008F20_DEPTH(depth - 1);
4038 else
4039 state[4] |= S_008F20_DEPTH(last_layer);
4040
4041 state[4] |= S_008F20_BC_SWIZZLE(bc_swizzle);
4042 state[5] |= S_008F24_MAX_MIP(num_samples > 1 ? util_logbase2(num_samples)
4043 : tex->buffer.b.b.last_level);
4044 } else {
4045 state[3] |= S_008F1C_POW2_PAD(res->last_level > 0);
4046 state[4] |= S_008F20_DEPTH(depth - 1);
4047 state[5] |= S_008F24_LAST_ARRAY(last_layer);
4048 }
4049
4050 if (tex->surface.dcc_offset) {
4051 state[6] = S_008F28_ALPHA_IS_ON_MSB(vi_alpha_is_on_msb(screen, pipe_format));
4052 } else {
4053 /* The last dword is unused by hw. The shader uses it to clear
4054 * bits in the first dword of sampler state.
4055 */
4056 if (screen->info.chip_class <= GFX7 && res->nr_samples <= 1) {
4057 if (first_level == last_level)
4058 state[7] = C_008F30_MAX_ANISO_RATIO;
4059 else
4060 state[7] = 0xffffffff;
4061 }
4062 }
4063
4064 /* Initialize the sampler view for FMASK. */
4065 if (tex->surface.fmask_offset) {
4066 uint32_t data_format, num_format;
4067
4068 va = tex->buffer.gpu_address + tex->surface.fmask_offset;
4069
4070 #define FMASK(s, f) (((unsigned)(MAX2(1, s)) * 16) + (MAX2(1, f)))
4071 if (screen->info.chip_class == GFX9) {
4072 data_format = V_008F14_IMG_DATA_FORMAT_FMASK;
4073 switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
4074 case FMASK(2, 1):
4075 num_format = V_008F14_IMG_FMASK_8_2_1;
4076 break;
4077 case FMASK(2, 2):
4078 num_format = V_008F14_IMG_FMASK_8_2_2;
4079 break;
4080 case FMASK(4, 1):
4081 num_format = V_008F14_IMG_FMASK_8_4_1;
4082 break;
4083 case FMASK(4, 2):
4084 num_format = V_008F14_IMG_FMASK_8_4_2;
4085 break;
4086 case FMASK(4, 4):
4087 num_format = V_008F14_IMG_FMASK_8_4_4;
4088 break;
4089 case FMASK(8, 1):
4090 num_format = V_008F14_IMG_FMASK_8_8_1;
4091 break;
4092 case FMASK(8, 2):
4093 num_format = V_008F14_IMG_FMASK_16_8_2;
4094 break;
4095 case FMASK(8, 4):
4096 num_format = V_008F14_IMG_FMASK_32_8_4;
4097 break;
4098 case FMASK(8, 8):
4099 num_format = V_008F14_IMG_FMASK_32_8_8;
4100 break;
4101 case FMASK(16, 1):
4102 num_format = V_008F14_IMG_FMASK_16_16_1;
4103 break;
4104 case FMASK(16, 2):
4105 num_format = V_008F14_IMG_FMASK_32_16_2;
4106 break;
4107 case FMASK(16, 4):
4108 num_format = V_008F14_IMG_FMASK_64_16_4;
4109 break;
4110 case FMASK(16, 8):
4111 num_format = V_008F14_IMG_FMASK_64_16_8;
4112 break;
4113 default:
4114 unreachable("invalid nr_samples");
4115 }
4116 } else {
4117 switch (FMASK(res->nr_samples, res->nr_storage_samples)) {
4118 case FMASK(2, 1):
4119 data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F1;
4120 break;
4121 case FMASK(2, 2):
4122 data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2;
4123 break;
4124 case FMASK(4, 1):
4125 data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F1;
4126 break;
4127 case FMASK(4, 2):
4128 data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F2;
4129 break;
4130 case FMASK(4, 4):
4131 data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S4_F4;
4132 break;
4133 case FMASK(8, 1):
4134 data_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S8_F1;
4135 break;
4136 case FMASK(8, 2):
4137 data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S8_F2;
4138 break;
4139 case FMASK(8, 4):
4140 data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F4;
4141 break;
4142 case FMASK(8, 8):
4143 data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S8_F8;
4144 break;
4145 case FMASK(16, 1):
4146 data_format = V_008F14_IMG_DATA_FORMAT_FMASK16_S16_F1;
4147 break;
4148 case FMASK(16, 2):
4149 data_format = V_008F14_IMG_DATA_FORMAT_FMASK32_S16_F2;
4150 break;
4151 case FMASK(16, 4):
4152 data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F4;
4153 break;
4154 case FMASK(16, 8):
4155 data_format = V_008F14_IMG_DATA_FORMAT_FMASK64_S16_F8;
4156 break;
4157 default:
4158 unreachable("invalid nr_samples");
4159 }
4160 num_format = V_008F14_IMG_NUM_FORMAT_UINT;
4161 }
4162 #undef FMASK
4163
4164 fmask_state[0] = (va >> 8) | tex->surface.fmask_tile_swizzle;
4165 fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | S_008F14_DATA_FORMAT(data_format) |
4166 S_008F14_NUM_FORMAT(num_format);
4167 fmask_state[2] = S_008F18_WIDTH(width - 1) | S_008F18_HEIGHT(height - 1);
4168 fmask_state[3] =
4169 S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) |
4170 S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) |
4171 S_008F1C_TYPE(si_tex_dim(screen, tex, target, 0));
4172 fmask_state[4] = 0;
4173 fmask_state[5] = S_008F24_BASE_ARRAY(first_layer);
4174 fmask_state[6] = 0;
4175 fmask_state[7] = 0;
4176
4177 if (screen->info.chip_class == GFX9) {
4178 fmask_state[3] |= S_008F1C_SW_MODE(tex->surface.u.gfx9.fmask.swizzle_mode);
4179 fmask_state[4] |=
4180 S_008F20_DEPTH(last_layer) | S_008F20_PITCH(tex->surface.u.gfx9.fmask.epitch);
4181 fmask_state[5] |= S_008F24_META_PIPE_ALIGNED(tex->surface.u.gfx9.cmask.pipe_aligned) |
4182 S_008F24_META_RB_ALIGNED(tex->surface.u.gfx9.cmask.rb_aligned);
4183 } else {
4184 fmask_state[3] |= S_008F1C_TILING_INDEX(tex->surface.u.legacy.fmask.tiling_index);
4185 fmask_state[4] |= S_008F20_DEPTH(depth - 1) |
4186 S_008F20_PITCH(tex->surface.u.legacy.fmask.pitch_in_pixels - 1);
4187 fmask_state[5] |= S_008F24_LAST_ARRAY(last_layer);
4188 }
4189 }
4190 }
4191
4192 /**
4193 * Create a sampler view.
4194 *
4195 * @param ctx context
4196 * @param texture texture
4197 * @param state sampler view template
4198 * @param width0 width0 override (for compressed textures as int)
4199 * @param height0 height0 override (for compressed textures as int)
4200 * @param force_level set the base address to the level (for compressed textures)
4201 */
4202 struct pipe_sampler_view *si_create_sampler_view_custom(struct pipe_context *ctx,
4203 struct pipe_resource *texture,
4204 const struct pipe_sampler_view *state,
4205 unsigned width0, unsigned height0,
4206 unsigned force_level)
4207 {
4208 struct si_context *sctx = (struct si_context *)ctx;
4209 struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view);
4210 struct si_texture *tex = (struct si_texture *)texture;
4211 unsigned base_level, first_level, last_level;
4212 unsigned char state_swizzle[4];
4213 unsigned height, depth, width;
4214 unsigned last_layer = state->u.tex.last_layer;
4215 enum pipe_format pipe_format;
4216 const struct legacy_surf_level *surflevel;
4217
4218 if (!view)
4219 return NULL;
4220
4221 /* initialize base object */
4222 view->base = *state;
4223 view->base.texture = NULL;
4224 view->base.reference.count = 1;
4225 view->base.context = ctx;
4226
4227 assert(texture);
4228 pipe_resource_reference(&view->base.texture, texture);
4229
4230 if (state->format == PIPE_FORMAT_X24S8_UINT || state->format == PIPE_FORMAT_S8X24_UINT ||
4231 state->format == PIPE_FORMAT_X32_S8X24_UINT || state->format == PIPE_FORMAT_S8_UINT)
4232 view->is_stencil_sampler = true;
4233
4234 /* Buffer resource. */
4235 if (texture->target == PIPE_BUFFER) {
4236 si_make_buffer_descriptor(sctx->screen, si_resource(texture), state->format,
4237 state->u.buf.offset, state->u.buf.size, view->state);
4238 return &view->base;
4239 }
4240
4241 state_swizzle[0] = state->swizzle_r;
4242 state_swizzle[1] = state->swizzle_g;
4243 state_swizzle[2] = state->swizzle_b;
4244 state_swizzle[3] = state->swizzle_a;
4245
4246 base_level = 0;
4247 first_level = state->u.tex.first_level;
4248 last_level = state->u.tex.last_level;
4249 width = width0;
4250 height = height0;
4251 depth = texture->depth0;
4252
4253 if (sctx->chip_class <= GFX8 && force_level) {
4254 assert(force_level == first_level && force_level == last_level);
4255 base_level = force_level;
4256 first_level = 0;
4257 last_level = 0;
4258 width = u_minify(width, force_level);
4259 height = u_minify(height, force_level);
4260 depth = u_minify(depth, force_level);
4261 }
4262
4263 /* This is not needed if state trackers set last_layer correctly. */
4264 if (state->target == PIPE_TEXTURE_1D || state->target == PIPE_TEXTURE_2D ||
4265 state->target == PIPE_TEXTURE_RECT || state->target == PIPE_TEXTURE_CUBE)
4266 last_layer = state->u.tex.first_layer;
4267
4268 /* Texturing with separate depth and stencil. */
4269 pipe_format = state->format;
4270
4271 /* Depth/stencil texturing sometimes needs separate texture. */
4272 if (tex->is_depth && !si_can_sample_zs(tex, view->is_stencil_sampler)) {
4273 if (!tex->flushed_depth_texture && !si_init_flushed_depth_texture(ctx, texture)) {
4274 pipe_resource_reference(&view->base.texture, NULL);
4275 FREE(view);
4276 return NULL;
4277 }
4278
4279 assert(tex->flushed_depth_texture);
4280
4281 /* Override format for the case where the flushed texture
4282 * contains only Z or only S.
4283 */
4284 if (tex->flushed_depth_texture->buffer.b.b.format != tex->buffer.b.b.format)
4285 pipe_format = tex->flushed_depth_texture->buffer.b.b.format;
4286
4287 tex = tex->flushed_depth_texture;
4288 }
4289
4290 surflevel = tex->surface.u.legacy.level;
4291
4292 if (tex->db_compatible) {
4293 if (!view->is_stencil_sampler)
4294 pipe_format = tex->db_render_format;
4295
4296 switch (pipe_format) {
4297 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
4298 pipe_format = PIPE_FORMAT_Z32_FLOAT;
4299 break;
4300 case PIPE_FORMAT_X8Z24_UNORM:
4301 case PIPE_FORMAT_S8_UINT_Z24_UNORM:
4302 /* Z24 is always stored like this for DB
4303 * compatibility.
4304 */
4305 pipe_format = PIPE_FORMAT_Z24X8_UNORM;
4306 break;
4307 case PIPE_FORMAT_X24S8_UINT:
4308 case PIPE_FORMAT_S8X24_UINT:
4309 case PIPE_FORMAT_X32_S8X24_UINT:
4310 pipe_format = PIPE_FORMAT_S8_UINT;
4311 surflevel = tex->surface.u.legacy.stencil_level;
4312 break;
4313 default:;
4314 }
4315 }
4316
4317 view->dcc_incompatible =
4318 vi_dcc_formats_are_incompatible(texture, state->u.tex.first_level, state->format);
4319
4320 sctx->screen->make_texture_descriptor(
4321 sctx->screen, tex, true, state->target, pipe_format, state_swizzle, first_level, last_level,
4322 state->u.tex.first_layer, last_layer, width, height, depth, view->state, view->fmask_state);
4323
4324 const struct util_format_description *desc = util_format_description(pipe_format);
4325 view->is_integer = false;
4326
4327 for (unsigned i = 0; i < desc->nr_channels; ++i) {
4328 if (desc->channel[i].type == UTIL_FORMAT_TYPE_VOID)
4329 continue;
4330
4331 /* Whether the number format is {U,S}{SCALED,INT} */
4332 view->is_integer = (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED ||
4333 desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) &&
4334 (desc->channel[i].pure_integer || !desc->channel[i].normalized);
4335 break;
4336 }
4337
4338 view->base_level_info = &surflevel[base_level];
4339 view->base_level = base_level;
4340 view->block_width = util_format_get_blockwidth(pipe_format);
4341 return &view->base;
4342 }
4343
4344 static struct pipe_sampler_view *si_create_sampler_view(struct pipe_context *ctx,
4345 struct pipe_resource *texture,
4346 const struct pipe_sampler_view *state)
4347 {
4348 return si_create_sampler_view_custom(ctx, texture, state, texture ? texture->width0 : 0,
4349 texture ? texture->height0 : 0, 0);
4350 }
4351
4352 static void si_sampler_view_destroy(struct pipe_context *ctx, struct pipe_sampler_view *state)
4353 {
4354 struct si_sampler_view *view = (struct si_sampler_view *)state;
4355
4356 pipe_resource_reference(&state->texture, NULL);
4357 FREE(view);
4358 }
4359
4360 static bool wrap_mode_uses_border_color(unsigned wrap, bool linear_filter)
4361 {
4362 return wrap == PIPE_TEX_WRAP_CLAMP_TO_BORDER || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER ||
4363 (linear_filter && (wrap == PIPE_TEX_WRAP_CLAMP || wrap == PIPE_TEX_WRAP_MIRROR_CLAMP));
4364 }
4365
4366 static uint32_t si_translate_border_color(struct si_context *sctx,
4367 const struct pipe_sampler_state *state,
4368 const union pipe_color_union *color, bool is_integer)
4369 {
4370 bool linear_filter = state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
4371 state->mag_img_filter != PIPE_TEX_FILTER_NEAREST;
4372
4373 if (!wrap_mode_uses_border_color(state->wrap_s, linear_filter) &&
4374 !wrap_mode_uses_border_color(state->wrap_t, linear_filter) &&
4375 !wrap_mode_uses_border_color(state->wrap_r, linear_filter))
4376 return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
4377
4378 #define simple_border_types(elt) \
4379 do { \
4380 if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 0) \
4381 return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK); \
4382 if (color->elt[0] == 0 && color->elt[1] == 0 && color->elt[2] == 0 && color->elt[3] == 1) \
4383 return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_BLACK); \
4384 if (color->elt[0] == 1 && color->elt[1] == 1 && color->elt[2] == 1 && color->elt[3] == 1) \
4385 return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_OPAQUE_WHITE); \
4386 } while (false)
4387
4388 if (is_integer)
4389 simple_border_types(ui);
4390 else
4391 simple_border_types(f);
4392
4393 #undef simple_border_types
4394
4395 int i;
4396
4397 /* Check if the border has been uploaded already. */
4398 for (i = 0; i < sctx->border_color_count; i++)
4399 if (memcmp(&sctx->border_color_table[i], color, sizeof(*color)) == 0)
4400 break;
4401
4402 if (i >= SI_MAX_BORDER_COLORS) {
4403 /* Getting 4096 unique border colors is very unlikely. */
4404 fprintf(stderr, "radeonsi: The border color table is full. "
4405 "Any new border colors will be just black. "
4406 "Please file a bug.\n");
4407 return S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_TRANS_BLACK);
4408 }
4409
4410 if (i == sctx->border_color_count) {
4411 /* Upload a new border color. */
4412 memcpy(&sctx->border_color_table[i], color, sizeof(*color));
4413 util_memcpy_cpu_to_le32(&sctx->border_color_map[i], color, sizeof(*color));
4414 sctx->border_color_count++;
4415 }
4416
4417 return S_008F3C_BORDER_COLOR_PTR(i) |
4418 S_008F3C_BORDER_COLOR_TYPE(V_008F3C_SQ_TEX_BORDER_COLOR_REGISTER);
4419 }
4420
4421 static inline int S_FIXED(float value, unsigned frac_bits)
4422 {
4423 return value * (1 << frac_bits);
4424 }
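/* A minimal sanity sketch for S_FIXED (illustrative only, kept out of the
 * build; assumes assert() is available in this translation unit):
 */
#if 0
static void s_fixed_examples(void)
{
   assert(S_FIXED(1.5f, 8) == 0x180);   /* 1.5 * 256, the 4.8 LOD encoding */
   assert(S_FIXED(-2.0f, 8) == -512);   /* LOD bias may be negative */
   assert(S_FIXED(0.25f, 4) == 4);      /* 0.25 * 16 */
}
#endif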
4425
4426 static inline unsigned si_tex_filter(unsigned filter, unsigned max_aniso)
4427 {
4428 if (filter == PIPE_TEX_FILTER_LINEAR)
4429 return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_BILINEAR
4430 : V_008F38_SQ_TEX_XY_FILTER_BILINEAR;
4431 else
4432 return max_aniso > 1 ? V_008F38_SQ_TEX_XY_FILTER_ANISO_POINT
4433 : V_008F38_SQ_TEX_XY_FILTER_POINT;
4434 }
4435
4436 static inline unsigned si_tex_aniso_filter(unsigned filter)
4437 {
4438 if (filter < 2)
4439 return 0;
4440 if (filter < 4)
4441 return 1;
4442 if (filter < 8)
4443 return 2;
4444 if (filter < 16)
4445 return 3;
4446 return 4;
4447 }
4448
4449 static void *si_create_sampler_state(struct pipe_context *ctx,
4450 const struct pipe_sampler_state *state)
4451 {
4452 struct si_context *sctx = (struct si_context *)ctx;
4453 struct si_screen *sscreen = sctx->screen;
4454 struct si_sampler_state *rstate = CALLOC_STRUCT(si_sampler_state);
4455 unsigned max_aniso = sscreen->force_aniso >= 0 ? sscreen->force_aniso : state->max_anisotropy;
4456 unsigned max_aniso_ratio = si_tex_aniso_filter(max_aniso);
4457 union pipe_color_union clamped_border_color;
4458
4459 if (!rstate) {
4460 return NULL;
4461 }
4462
4463 #ifndef NDEBUG
4464 rstate->magic = SI_SAMPLER_STATE_MAGIC;
4465 #endif
4466 rstate->val[0] =
4467 (S_008F30_CLAMP_X(si_tex_wrap(state->wrap_s)) | S_008F30_CLAMP_Y(si_tex_wrap(state->wrap_t)) |
4468 S_008F30_CLAMP_Z(si_tex_wrap(state->wrap_r)) | S_008F30_MAX_ANISO_RATIO(max_aniso_ratio) |
4469 S_008F30_DEPTH_COMPARE_FUNC(si_tex_compare(state->compare_func)) |
4470 S_008F30_FORCE_UNNORMALIZED(!state->normalized_coords) |
4471 S_008F30_ANISO_THRESHOLD(max_aniso_ratio >> 1) | S_008F30_ANISO_BIAS(max_aniso_ratio) |
4472 S_008F30_DISABLE_CUBE_WRAP(!state->seamless_cube_map) |
4473 S_008F30_COMPAT_MODE(sctx->chip_class == GFX8 || sctx->chip_class == GFX9));
4474 rstate->val[1] = (S_008F34_MIN_LOD(S_FIXED(CLAMP(state->min_lod, 0, 15), 8)) |
4475 S_008F34_MAX_LOD(S_FIXED(CLAMP(state->max_lod, 0, 15), 8)) |
4476 S_008F34_PERF_MIP(max_aniso_ratio ? max_aniso_ratio + 6 : 0));
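/* MIN_LOD/MAX_LOD are 4.8 fixed point, hence the CLAMP to [0, 15] and
 * the 8 fractional bits; e.g. max_lod = 3.5f is encoded as 896. */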
4477 rstate->val[2] = (S_008F38_LOD_BIAS(S_FIXED(CLAMP(state->lod_bias, -16, 16), 8)) |
4478 S_008F38_XY_MAG_FILTER(si_tex_filter(state->mag_img_filter, max_aniso)) |
4479 S_008F38_XY_MIN_FILTER(si_tex_filter(state->min_img_filter, max_aniso)) |
4480 S_008F38_MIP_FILTER(si_tex_mipfilter(state->min_mip_filter)) |
4481 S_008F38_MIP_POINT_PRECLAMP(0));
4482 rstate->val[3] = si_translate_border_color(sctx, state, &state->border_color, false);
4483
4484 if (sscreen->info.chip_class >= GFX10) {
4485 rstate->val[2] |= S_008F38_ANISO_OVERRIDE_GFX10(1);
4486 } else {
4487 rstate->val[2] |= S_008F38_DISABLE_LSB_CEIL(sctx->chip_class <= GFX8) |
4488 S_008F38_FILTER_PREC_FIX(1) |
4489 S_008F38_ANISO_OVERRIDE_GFX6(sctx->chip_class >= GFX8);
4490 }
4491
4492 /* Create sampler resource for integer textures. */
4493 memcpy(rstate->integer_val, rstate->val, sizeof(rstate->val));
4494 rstate->integer_val[3] = si_translate_border_color(sctx, state, &state->border_color, true);
4495
4496 /* Create sampler resource for upgraded depth textures. */
4497 memcpy(rstate->upgraded_depth_val, rstate->val, sizeof(rstate->val));
4498
4499 for (unsigned i = 0; i < 4; ++i) {
4500 /* Use channel 0 on purpose, so that we can use OPAQUE_WHITE
4501 * when the border color is 1.0. */
4502 clamped_border_color.f[i] = CLAMP(state->border_color.f[0], 0, 1);
4503 }
4504
4505 if (memcmp(&state->border_color, &clamped_border_color, sizeof(clamped_border_color)) == 0) {
4506 if (sscreen->info.chip_class <= GFX9)
4507 rstate->upgraded_depth_val[3] |= S_008F3C_UPGRADED_DEPTH(1);
4508 } else {
4509 rstate->upgraded_depth_val[3] =
4510 si_translate_border_color(sctx, state, &clamped_border_color, false);
4511 }
4512
4513 return rstate;
4514 }
4515
4516 static void si_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
4517 {
4518 struct si_context *sctx = (struct si_context *)ctx;
4519
4520 if (sctx->sample_mask == (uint16_t)sample_mask)
4521 return;
4522
4523 sctx->sample_mask = sample_mask;
4524 si_mark_atom_dirty(sctx, &sctx->atoms.s.sample_mask);
4525 }
4526
4527 static void si_emit_sample_mask(struct si_context *sctx)
4528 {
4529 struct radeon_cmdbuf *cs = sctx->gfx_cs;
4530 unsigned mask = sctx->sample_mask;
4531
4532 /* Needed for line and polygon smoothing as well as for the Polaris
4533 * small primitive filter. We expect the state tracker to take care of
4534 * this for us.
4535 */
4536 assert(mask == 0xffff || sctx->framebuffer.nr_samples > 1 ||
4537 (mask & 1 && sctx->blitter->running));
4538
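/* Each PA_SC_AA_MASK register holds two 16-bit sample masks, one per
 * pixel of the 2x2 quad, hence mask | (mask << 16) written twice to
 * cover all four pixels. */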
4539 radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
4540 radeon_emit(cs, mask | (mask << 16));
4541 radeon_emit(cs, mask | (mask << 16));
4542 }
4543
4544 static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
4545 {
4546 #ifndef NDEBUG
4547 struct si_sampler_state *s = state;
4548
4549 assert(s->magic == SI_SAMPLER_STATE_MAGIC);
4550 s->magic = 0;
4551 #endif
4552 free(state);
4553 }
4554
4555 /*
4556 * Vertex elements & buffers
4557 */
4558
4559 struct si_fast_udiv_info32 si_compute_fast_udiv_info32(uint32_t D, unsigned num_bits)
4560 {
4561 struct util_fast_udiv_info info = util_compute_fast_udiv_info(D, num_bits, 32);
4562
4563 struct si_fast_udiv_info32 result = {
4564 info.multiplier,
4565 info.pre_shift,
4566 info.post_shift,
4567 info.increment,
4568 };
4569 return result;
4570 }
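/* These factors let a shader divide by an arbitrary uniform 32-bit
 * constant without an integer division instruction; roughly (see
 * util/fast_idiv_by_const.h for the exact evaluation):
 *
 *   q = (uint32_t)((((uint64_t)(n >> pre_shift) + increment) *
 *                   multiplier) >> 32) >> post_shift
 *
 * The vertex shader uses this to divide the instance ID by instance
 * divisors fetched from the buffer created in si_create_vertex_elements. */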
4571
4572 static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count,
4573 const struct pipe_vertex_element *elements)
4574 {
4575 struct si_screen *sscreen = (struct si_screen *)ctx->screen;
4576 struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
4577 bool used[SI_NUM_VERTEX_BUFFERS] = {};
4578 struct si_fast_udiv_info32 divisor_factors[SI_MAX_ATTRIBS] = {};
4579 STATIC_ASSERT(sizeof(struct si_fast_udiv_info32) == 16);
4580 STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
4581 STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
4582 STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
4583 STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
4584 int i;
4585
4586 assert(count <= SI_MAX_ATTRIBS);
4587 if (!v)
4588 return NULL;
4589
4590 v->count = count;
4591
4592 unsigned alloc_count =
4593 count > sscreen->num_vbos_in_user_sgprs ? count - sscreen->num_vbos_in_user_sgprs : 0;
4594 v->vb_desc_list_alloc_size = align(alloc_count * 16, SI_CPDMA_ALIGNMENT);
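/* Each vertex buffer descriptor takes 4 dwords (16 bytes); descriptors
 * passed via user SGPRs don't need space in the uploaded list, hence the
 * subtraction above. */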
4595
4596 for (i = 0; i < count; ++i) {
4597 const struct util_format_description *desc;
4598 const struct util_format_channel_description *channel;
4599 int first_non_void;
4600 unsigned vbo_index = elements[i].vertex_buffer_index;
4601
4602 if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
4603 FREE(v);
4604 return NULL;
4605 }
4606
4607 unsigned instance_divisor = elements[i].instance_divisor;
4608 if (instance_divisor) {
4609 v->uses_instance_divisors = true;
4610
4611 if (instance_divisor == 1) {
4612 v->instance_divisor_is_one |= 1u << i;
4613 } else {
4614 v->instance_divisor_is_fetched |= 1u << i;
4615 divisor_factors[i] = si_compute_fast_udiv_info32(instance_divisor, 32);
4616 }
4617 }
4618
4619 if (!used[vbo_index]) {
4620 v->first_vb_use_mask |= 1 << i;
4621 used[vbo_index] = true;
4622 }
4623
4624 desc = util_format_description(elements[i].src_format);
4625 first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
4626 channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
4627
4628 v->format_size[i] = desc->block.bits / 8;
4629 v->src_offset[i] = elements[i].src_offset;
4630 v->vertex_buffer_index[i] = vbo_index;
4631
4632 bool always_fix = false;
4633 union si_vs_fix_fetch fix_fetch;
4634 unsigned log_hw_load_size; /* the load element size as seen by the hardware */
4635
4636 fix_fetch.bits = 0;
4637 log_hw_load_size = MIN2(2, util_logbase2(desc->block.bits) - 3);
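/* e.g. an 8_8 format has block.bits == 16, giving log2(16) - 3 == 1
 * (word loads); anything 32 bits or wider is clamped to dword loads. */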
4638
4639 if (channel) {
4640 switch (channel->type) {
4641 case UTIL_FORMAT_TYPE_FLOAT:
4642 fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
4643 break;
4644 case UTIL_FORMAT_TYPE_FIXED:
4645 fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
4646 break;
4647 case UTIL_FORMAT_TYPE_SIGNED: {
4648 if (channel->pure_integer)
4649 fix_fetch.u.format = AC_FETCH_FORMAT_SINT;
4650 else if (channel->normalized)
4651 fix_fetch.u.format = AC_FETCH_FORMAT_SNORM;
4652 else
4653 fix_fetch.u.format = AC_FETCH_FORMAT_SSCALED;
4654 break;
4655 }
4656 case UTIL_FORMAT_TYPE_UNSIGNED: {
4657 if (channel->pure_integer)
4658 fix_fetch.u.format = AC_FETCH_FORMAT_UINT;
4659 else if (channel->normalized)
4660 fix_fetch.u.format = AC_FETCH_FORMAT_UNORM;
4661 else
4662 fix_fetch.u.format = AC_FETCH_FORMAT_USCALED;
4663 break;
4664 }
4665 default:
4666 unreachable("bad format type");
4667 }
4668 } else {
4669 switch (elements[i].src_format) {
4670 case PIPE_FORMAT_R11G11B10_FLOAT:
4671 fix_fetch.u.format = AC_FETCH_FORMAT_FLOAT;
4672 break;
4673 default:
4674 unreachable("bad other format");
4675 }
4676 }
4677
4678 if (desc->channel[0].size == 10) {
4679 fix_fetch.u.log_size = 3; /* special encoding for 2_10_10_10 */
4680 log_hw_load_size = 2;
4681
4682 /* The hardware always treats the 2-bit alpha channel as
4683 * unsigned, so a shader workaround is needed. The affected
4684 * chips are GFX8 and older except Stoney (GFX8.1).
4685 */
4686 always_fix = sscreen->info.chip_class <= GFX8 && sscreen->info.family != CHIP_STONEY &&
4687 channel->type == UTIL_FORMAT_TYPE_SIGNED;
4688 } else if (elements[i].src_format == PIPE_FORMAT_R11G11B10_FLOAT) {
4689 fix_fetch.u.log_size = 3; /* special encoding */
4690 fix_fetch.u.format = AC_FETCH_FORMAT_FIXED;
4691 log_hw_load_size = 2;
4692 } else {
4693 fix_fetch.u.log_size = util_logbase2(channel->size) - 3;
4694 fix_fetch.u.num_channels_m1 = desc->nr_channels - 1;
4695
4696 /* Always fix up:
4697 * - doubles (multiple loads + truncate to float)
4698 * - 32-bit requiring a conversion
4699 */
4700 always_fix = (fix_fetch.u.log_size == 3) ||
4701 (fix_fetch.u.log_size == 2 && fix_fetch.u.format != AC_FETCH_FORMAT_FLOAT &&
4702 fix_fetch.u.format != AC_FETCH_FORMAT_UINT &&
4703 fix_fetch.u.format != AC_FETCH_FORMAT_SINT);
4704
4705 /* Also fix up 8_8_8 and 16_16_16. */
4706 if (desc->nr_channels == 3 && fix_fetch.u.log_size <= 1) {
4707 always_fix = true;
4708 log_hw_load_size = fix_fetch.u.log_size;
4709 }
4710 }
4711
4712 if (desc->swizzle[0] != PIPE_SWIZZLE_X) {
4713 assert(desc->swizzle[0] == PIPE_SWIZZLE_Z &&
4714 (desc->swizzle[2] == PIPE_SWIZZLE_X || desc->swizzle[2] == PIPE_SWIZZLE_0));
4715 fix_fetch.u.reverse = 1;
4716 }
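/* The path above catches BGR-ordered formats such as
 * PIPE_FORMAT_B8G8R8A8_UNORM; fix_fetch.u.reverse tells the fetch
 * workaround to swap the fetched components back into RGBA order. */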
4717
4718 /* Force the workaround for unaligned access up front if the offset
4719 * relative to the vertex buffer base is unaligned.
4720 *
4721 * There is a theoretical case in which this is too conservative:
4722 * if the vertex buffer's offset is also unaligned in just the
4723 * right way, we end up with an aligned address after all.
4724 * However, this case should be extremely rare in practice (it
4725 * won't happen in well-behaved applications), and taking it
4726 * into account would complicate the fast path (where everything
4727 * is nicely aligned).
4728 */
4729 bool check_alignment = log_hw_load_size >= 1 && (sscreen->info.chip_class == GFX6 ||
4730 sscreen->info.chip_class == GFX10);
4731 bool opencode = sscreen->options.vs_fetch_always_opencode;
4732
4733 if (check_alignment && (elements[i].src_offset & ((1 << log_hw_load_size) - 1)) != 0)
4734 opencode = true;
4735
4736 if (always_fix || check_alignment || opencode)
4737 v->fix_fetch[i] = fix_fetch.bits;
4738
4739 if (opencode)
4740 v->fix_fetch_opencode |= 1 << i;
4741 if (opencode || always_fix)
4742 v->fix_fetch_always |= 1 << i;
4743
4744 if (check_alignment && !opencode) {
4745 assert(log_hw_load_size == 1 || log_hw_load_size == 2);
4746
4747 v->fix_fetch_unaligned |= 1 << i;
4748 v->hw_load_is_dword |= (log_hw_load_size - 1) << i;
4749 v->vb_alignment_check_mask |= 1 << vbo_index;
4750 }
4751
4752 v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
4753 S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
4754 S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
4755 S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3]));
4756
4757 if (sscreen->info.chip_class >= GFX10) {
4758 const struct gfx10_format *fmt = &gfx10_format_table[elements[i].src_format];
4759 assert(fmt->img_format != 0 && fmt->img_format < 128);
4760 v->rsrc_word3[i] |= S_008F0C_FORMAT(fmt->img_format) | S_008F0C_RESOURCE_LEVEL(1);
4761 } else {
4762 unsigned data_format, num_format;
4763 data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
4764 num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
4765 v->rsrc_word3[i] |= S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format);
4766 }
4767 }
4768
4769 if (v->instance_divisor_is_fetched) {
4770 unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
4771
4772 v->instance_divisor_factor_buffer = (struct si_resource *)pipe_buffer_create(
4773 &sscreen->b, 0, PIPE_USAGE_DEFAULT, num_divisors * sizeof(divisor_factors[0]));
4774 if (!v->instance_divisor_factor_buffer) {
4775 FREE(v);
4776 return NULL;
4777 }
4778 void *map =
4779 sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, NULL, PIPE_TRANSFER_WRITE);
/* Don't crash or leak if the winsys failed to map the new buffer. */
if (!map) {
si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
FREE(v);
return NULL;
}
4780 memcpy(map, divisor_factors, num_divisors * sizeof(divisor_factors[0]));
4781 }
4782 return v;
4783 }
4784
4785 static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
4786 {
4787 struct si_context *sctx = (struct si_context *)ctx;
4788 struct si_vertex_elements *old = sctx->vertex_elements;
4789 struct si_vertex_elements *v = (struct si_vertex_elements *)state;
4790
4791 sctx->vertex_elements = v;
4792 sctx->num_vertex_elements = v ? v->count : 0;
4793
4794 if (sctx->num_vertex_elements) {
4795 sctx->vertex_buffers_dirty = true;
4796 } else {
4797 sctx->vertex_buffer_pointer_dirty = false;
4798 sctx->vertex_buffer_user_sgprs_dirty = false;
4799 }
4800
4801 if (v && (!old || old->count != v->count ||
4802 old->uses_instance_divisors != v->uses_instance_divisors ||
4803 /* we don't check which divisors changed */
4804 v->uses_instance_divisors ||
4805 (old->vb_alignment_check_mask ^ v->vb_alignment_check_mask) &
4806 sctx->vertex_buffer_unaligned ||
4807 ((v->vb_alignment_check_mask & sctx->vertex_buffer_unaligned) &&
4808 memcmp(old->vertex_buffer_index, v->vertex_buffer_index,
4809 sizeof(v->vertex_buffer_index[0]) * v->count)) ||
4810 /* fix_fetch_{always,opencode,unaligned} and hw_load_is_dword are
4811 * functions of fix_fetch and the src_offset alignment.
4812 * If they change and fix_fetch doesn't, it must be due to different
4813 * src_offset alignment, which is reflected in fix_fetch_opencode. */
4814 old->fix_fetch_opencode != v->fix_fetch_opencode ||
4815 memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
4816 sctx->do_update_shaders = true;
4817
4818 if (v && v->instance_divisor_is_fetched) {
4819 struct pipe_constant_buffer cb;
4820
4821 cb.buffer = &v->instance_divisor_factor_buffer->b.b;
4822 cb.user_buffer = NULL;
4823 cb.buffer_offset = 0;
4824 cb.buffer_size = 0xffffffff;
4825 si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
4826 }
4827 }
4828
4829 static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
4830 {
4831 struct si_context *sctx = (struct si_context *)ctx;
4832 struct si_vertex_elements *v = (struct si_vertex_elements *)state;
4833
4834 if (sctx->vertex_elements == state) {
4835 sctx->vertex_elements = NULL;
4836 sctx->num_vertex_elements = 0;
4837 }
4838 si_resource_reference(&v->instance_divisor_factor_buffer, NULL);
4839 FREE(state);
4840 }
4841
4842 static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count,
4843 const struct pipe_vertex_buffer *buffers)
4844 {
4845 struct si_context *sctx = (struct si_context *)ctx;
4846 struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
4847 unsigned updated_mask = u_bit_consecutive(start_slot, count);
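/* e.g. start_slot = 2, count = 3 -> updated_mask = 0x1c, the mask of
 * slots being rebound. */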
4848 uint32_t orig_unaligned = sctx->vertex_buffer_unaligned;
4849 uint32_t unaligned = 0;
4850 int i;
4851
4852 assert(start_slot + count <= ARRAY_SIZE(sctx->vertex_buffer));
4853
4854 if (buffers) {
4855 for (i = 0; i < count; i++) {
4856 const struct pipe_vertex_buffer *src = buffers + i;
4857 struct pipe_vertex_buffer *dsti = dst + i;
4858 struct pipe_resource *buf = src->buffer.resource;
4859 unsigned slot_bit = 1 << (start_slot + i);
4860
4861 pipe_resource_reference(&dsti->buffer.resource, buf);
4862 dsti->buffer_offset = src->buffer_offset;
4863 dsti->stride = src->stride;
4864
4865 if (dsti->buffer_offset & 3 || dsti->stride & 3)
4866 unaligned |= slot_bit;
4867
4868 si_context_add_resource_size(sctx, buf);
4869 if (buf)
4870 si_resource(buf)->bind_history |= PIPE_BIND_VERTEX_BUFFER;
4871 }
4872 } else {
4873 for (i = 0; i < count; i++) {
4874 pipe_resource_reference(&dst[i].buffer.resource, NULL);
4875 }
4876 unaligned &= ~updated_mask;
4877 }
4878 sctx->vertex_buffers_dirty = true;
4879 sctx->vertex_buffer_unaligned = (orig_unaligned & ~updated_mask) | unaligned;
4880
4881 /* Check whether alignment may have changed in a way that requires
4882 * shader changes. This check is conservative: a vertex buffer can only
4883 * trigger a shader change if the misalignment amount changes (e.g.
4884 * from byte-aligned to short-aligned), but we only keep track of
4885 * whether buffers are at least dword-aligned, since that should always
4886 * be the case in well-behaved applications anyway.
4887 */
4888 if (sctx->vertex_elements && (sctx->vertex_elements->vb_alignment_check_mask &
4889 (unaligned | orig_unaligned) & updated_mask))
4890 sctx->do_update_shaders = true;
4891 }
4892
4893 /*
4894 * Misc
4895 */
4896
4897 static void si_set_tess_state(struct pipe_context *ctx, const float default_outer_level[4],
4898 const float default_inner_level[2])
4899 {
4900 struct si_context *sctx = (struct si_context *)ctx;
4901 struct pipe_constant_buffer cb;
4902 float array[8];
4903
4904 memcpy(array, default_outer_level, sizeof(float) * 4);
4905 memcpy(array + 4, default_inner_level, sizeof(float) * 2);
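/* Only 6 of the 8 floats are used; the array is sized as two vec4s
 * (outer levels, then inner levels), presumably to keep the constant
 * buffer layout vec4-aligned. */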
4906
4907 cb.buffer = NULL;
4908 cb.user_buffer = NULL;
4909 cb.buffer_size = sizeof(array);
4910
4911 si_upload_const_buffer(sctx, (struct si_resource **)&cb.buffer, (void *)array, sizeof(array),
4912 &cb.buffer_offset);
4913
4914 si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
4915 pipe_resource_reference(&cb.buffer, NULL);
4916 }
4917
4918 static void si_texture_barrier(struct pipe_context *ctx, unsigned flags)
4919 {
4920 struct si_context *sctx = (struct si_context *)ctx;
4921
4922 si_update_fb_dirtiness_after_rendering(sctx);
4923
4924 /* Multisample surfaces are flushed in si_decompress_textures. */
4925 if (sctx->framebuffer.uncompressed_cb_mask) {
4926 si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
4927 sctx->framebuffer.CB_has_shader_readable_metadata,
4928 sctx->framebuffer.all_DCC_pipe_aligned);
4929 }
4930 }
4931
4932 /* This only ensures coherency for shader image/buffer stores. */
4933 static void si_memory_barrier(struct pipe_context *ctx, unsigned flags)
4934 {
4935 struct si_context *sctx = (struct si_context *)ctx;
4936
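/* Barriers that only contain PIPE_BARRIER_UPDATE_* need no cache
 * maintenance here: resource updates go through copy paths that are
 * already made coherent elsewhere. */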
4937 if (!(flags & ~PIPE_BARRIER_UPDATE))
4938 return;
4939
4940 /* Subsequent commands must wait for all shader invocations to
4941 * complete. */
4942 sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH;
4943
4944 if (flags & PIPE_BARRIER_CONSTANT_BUFFER)
4945 sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;
4946
4947 if (flags & (PIPE_BARRIER_VERTEX_BUFFER | PIPE_BARRIER_SHADER_BUFFER | PIPE_BARRIER_TEXTURE |
4948 PIPE_BARRIER_IMAGE | PIPE_BARRIER_STREAMOUT_BUFFER | PIPE_BARRIER_GLOBAL_BUFFER)) {
4949 /* As far as I can tell, L1 contents are written back to L2
4950 * automatically at the end of a shader, but the contents of other
4951 * L1 caches might still be stale. */
4952 sctx->flags |= SI_CONTEXT_INV_VCACHE;
4953 }
4954
4955 if (flags & PIPE_BARRIER_INDEX_BUFFER) {
4956 /* Indices are read through TC L2 since GFX8.
4957 * L1 isn't used.
4958 */
4959 if (sctx->screen->info.chip_class <= GFX7)
4960 sctx->flags |= SI_CONTEXT_WB_L2;
4961 }
4962
4963 /* MSAA color, any depth and any stencil are flushed in
4964 * si_decompress_textures when needed.
4965 */
4966 if (flags & PIPE_BARRIER_FRAMEBUFFER && sctx->framebuffer.uncompressed_cb_mask) {
4967 sctx->flags |= SI_CONTEXT_FLUSH_AND_INV_CB;
4968
4969 if (sctx->chip_class <= GFX8)
4970 sctx->flags |= SI_CONTEXT_WB_L2;
4971 }
4972
4973 /* Indirect buffers use TC L2 on GFX9, but not on older hw. */
4974 if (sctx->screen->info.chip_class <= GFX8 && flags & PIPE_BARRIER_INDIRECT_BUFFER)
4975 sctx->flags |= SI_CONTEXT_WB_L2;
4976 }
4977
4978 static void *si_create_blend_custom(struct si_context *sctx, unsigned mode)
4979 {
4980 struct pipe_blend_state blend;
4981
4982 memset(&blend, 0, sizeof(blend));
4983 blend.independent_blend_enable = true;
4984 blend.rt[0].colormask = 0xf;
4985 return si_create_blend_state_mode(&sctx->b, &blend, mode);
4986 }
4987
4988 static void si_init_config(struct si_context *sctx);
4989
4990 void si_init_state_compute_functions(struct si_context *sctx)
4991 {
4992 sctx->b.create_sampler_state = si_create_sampler_state;
4993 sctx->b.delete_sampler_state = si_delete_sampler_state;
4994 sctx->b.create_sampler_view = si_create_sampler_view;
4995 sctx->b.sampler_view_destroy = si_sampler_view_destroy;
4996 sctx->b.memory_barrier = si_memory_barrier;
4997 }
4998
4999 void si_init_state_functions(struct si_context *sctx)
5000 {
5001 sctx->atoms.s.framebuffer.emit = si_emit_framebuffer_state;
5002 sctx->atoms.s.msaa_sample_locs.emit = si_emit_msaa_sample_locs;
5003 sctx->atoms.s.db_render_state.emit = si_emit_db_render_state;
5004 sctx->atoms.s.dpbb_state.emit = si_emit_dpbb_state;
5005 sctx->atoms.s.msaa_config.emit = si_emit_msaa_config;
5006 sctx->atoms.s.sample_mask.emit = si_emit_sample_mask;
5007 sctx->atoms.s.cb_render_state.emit = si_emit_cb_render_state;
5008 sctx->atoms.s.blend_color.emit = si_emit_blend_color;
5009 sctx->atoms.s.clip_regs.emit = si_emit_clip_regs;
5010 sctx->atoms.s.clip_state.emit = si_emit_clip_state;
5011 sctx->atoms.s.stencil_ref.emit = si_emit_stencil_ref;
5012
5013 sctx->b.create_blend_state = si_create_blend_state;
5014 sctx->b.bind_blend_state = si_bind_blend_state;
5015 sctx->b.delete_blend_state = si_delete_blend_state;
5016 sctx->b.set_blend_color = si_set_blend_color;
5017
5018 sctx->b.create_rasterizer_state = si_create_rs_state;
5019 sctx->b.bind_rasterizer_state = si_bind_rs_state;
5020 sctx->b.delete_rasterizer_state = si_delete_rs_state;
5021
5022 sctx->b.create_depth_stencil_alpha_state = si_create_dsa_state;
5023 sctx->b.bind_depth_stencil_alpha_state = si_bind_dsa_state;
5024 sctx->b.delete_depth_stencil_alpha_state = si_delete_dsa_state;
5025
5026 sctx->custom_dsa_flush = si_create_db_flush_dsa(sctx);
5027 sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE);
5028 sctx->custom_blend_fmask_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS);
5029 sctx->custom_blend_eliminate_fastclear =
5030 si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR);
5031 sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS);
5032
5033 sctx->b.set_clip_state = si_set_clip_state;
5034 sctx->b.set_stencil_ref = si_set_stencil_ref;
5035
5036 sctx->b.set_framebuffer_state = si_set_framebuffer_state;
5037
5038 sctx->b.set_sample_mask = si_set_sample_mask;
5039
5040 sctx->b.create_vertex_elements_state = si_create_vertex_elements;
5041 sctx->b.bind_vertex_elements_state = si_bind_vertex_elements;
5042 sctx->b.delete_vertex_elements_state = si_delete_vertex_element;
5043 sctx->b.set_vertex_buffers = si_set_vertex_buffers;
5044
5045 sctx->b.texture_barrier = si_texture_barrier;
5046 sctx->b.set_min_samples = si_set_min_samples;
5047 sctx->b.set_tess_state = si_set_tess_state;
5048
5049 sctx->b.set_active_query_state = si_set_active_query_state;
5050
5051 si_init_config(sctx);
5052 }
5053
5054 void si_init_screen_state_functions(struct si_screen *sscreen)
5055 {
5056 sscreen->b.is_format_supported = si_is_format_supported;
5057
5058 if (sscreen->info.chip_class >= GFX10) {
5059 sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
5060 } else {
5061 sscreen->make_texture_descriptor = si_make_texture_descriptor;
5062 }
5063 }
5064
5065 static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value)
5066 {
5067 unsigned reg = sctx->chip_class >= GFX7 ? R_030800_GRBM_GFX_INDEX : R_00802C_GRBM_GFX_INDEX;
5068 si_pm4_set_reg(pm4, reg, value);
5069 }
5070
5071 static void si_set_grbm_gfx_index_se(struct si_context *sctx, struct si_pm4_state *pm4, unsigned se)
5072 {
5073 assert(se == ~0 || se < sctx->screen->info.max_se);
5074 si_set_grbm_gfx_index(sctx, pm4,
5075 (se == ~0 ? S_030800_SE_BROADCAST_WRITES(1) : S_030800_SE_INDEX(se)) |
5076 S_030800_SH_BROADCAST_WRITES(1) |
5077 S_030800_INSTANCE_BROADCAST_WRITES(1));
5078 }
5079
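/* Chips with harvested (partially disabled) RBs need a per-shader-engine
 * PA_SC_RASTER_CONFIG: select each SE via GRBM_GFX_INDEX, write its
 * config, then restore broadcast writes. */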
5080 static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4,
5081 unsigned raster_config, unsigned raster_config_1)
5082 {
5083 unsigned num_se = MAX2(sctx->screen->info.max_se, 1);
5084 unsigned raster_config_se[4];
5085 unsigned se;
5086
5087 ac_get_harvested_configs(&sctx->screen->info, raster_config, &raster_config_1, raster_config_se);
5088
5089 for (se = 0; se < num_se; se++) {
5090 si_set_grbm_gfx_index_se(sctx, pm4, se);
5091 si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config_se[se]);
5092 }
5093 si_set_grbm_gfx_index(sctx, pm4, ~0);
5094
5095 if (sctx->chip_class >= GFX7) {
5096 si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
5097 }
5098 }
5099
5100 static void si_set_raster_config(struct si_context *sctx, struct si_pm4_state *pm4)
5101 {
5102 struct si_screen *sscreen = sctx->screen;
5103 unsigned num_rb = MIN2(sscreen->info.num_render_backends, 16);
5104 unsigned rb_mask = sscreen->info.enabled_rb_mask;
5105 unsigned raster_config = sscreen->pa_sc_raster_config;
5106 unsigned raster_config_1 = sscreen->pa_sc_raster_config_1;
5107
5108 if (!rb_mask || util_bitcount(rb_mask) >= num_rb) {
5109 /* Always use the default config when all backends are enabled
5110 * (or when we failed to determine the enabled backends).
5111 */
5112 si_pm4_set_reg(pm4, R_028350_PA_SC_RASTER_CONFIG, raster_config);
5113 if (sctx->chip_class >= GFX7)
5114 si_pm4_set_reg(pm4, R_028354_PA_SC_RASTER_CONFIG_1, raster_config_1);
5115 } else {
5116 si_write_harvested_raster_configs(sctx, pm4, raster_config, raster_config_1);
5117 }
5118 }
5119
5120 static void si_init_config(struct si_context *sctx)
5121 {
5122 struct si_screen *sscreen = sctx->screen;
5123 uint64_t border_color_va = sctx->border_color_buffer->gpu_address;
5124 bool has_clear_state = sscreen->info.has_clear_state;
5125 struct si_pm4_state *pm4 = CALLOC_STRUCT(si_pm4_state);
5126
5127 if (!pm4)
5128 return;
5129
5130 si_pm4_cmd_begin(pm4, PKT3_CONTEXT_CONTROL);
5131 si_pm4_cmd_add(pm4, CONTEXT_CONTROL_LOAD_ENABLE(1));
5132 si_pm4_cmd_add(pm4, CONTEXT_CONTROL_SHADOW_ENABLE(1));
5133 si_pm4_cmd_end(pm4, false);
5134
5135 if (has_clear_state) {
5136 si_pm4_cmd_begin(pm4, PKT3_CLEAR_STATE);
5137 si_pm4_cmd_add(pm4, 0);
5138 si_pm4_cmd_end(pm4, false);
5139 }
5140
5141 if (sctx->chip_class <= GFX8)
5142 si_set_raster_config(sctx, pm4);
5143
5144 si_pm4_set_reg(pm4, R_028A18_VGT_HOS_MAX_TESS_LEVEL, fui(64));
5145 if (!has_clear_state)
5146 si_pm4_set_reg(pm4, R_028A1C_VGT_HOS_MIN_TESS_LEVEL, fui(0));
5147
5148 /* FIXME calculate these values somehow ??? */
5149 if (sctx->chip_class <= GFX8) {
5150 si_pm4_set_reg(pm4, R_028A54_VGT_GS_PER_ES, SI_GS_PER_ES);
5151 si_pm4_set_reg(pm4, R_028A58_VGT_ES_PER_GS, 0x40);
5152 }
5153
5154 if (!has_clear_state) {
5155 si_pm4_set_reg(pm4, R_028A5C_VGT_GS_PER_VS, 0x2);
5156 si_pm4_set_reg(pm4, R_028A8C_VGT_PRIMITIVEID_RESET, 0x0);
5157 si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
5158 }
5159
5160 if (sscreen->info.chip_class <= GFX9)
5161 si_pm4_set_reg(pm4, R_028AA0_VGT_INSTANCE_STEP_RATE_0, 1);
5162 if (!has_clear_state)
5163 si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
5164 if (sctx->chip_class < GFX7)
5165 si_pm4_set_reg(pm4, R_008A14_PA_CL_ENHANCE,
5166 S_008A14_NUM_CLIP_SEQ(3) | S_008A14_CLIP_VTX_REORDER_ENA(1));
5167
5168 /* CLEAR_STATE doesn't restore these correctly. */
5169 si_pm4_set_reg(pm4, R_028240_PA_SC_GENERIC_SCISSOR_TL, S_028240_WINDOW_OFFSET_DISABLE(1));
5170 si_pm4_set_reg(pm4, R_028244_PA_SC_GENERIC_SCISSOR_BR,
5171 S_028244_BR_X(16384) | S_028244_BR_Y(16384));
5172
5173 /* CLEAR_STATE doesn't clear these correctly on certain generations.
5174 * I don't know why. Deduced by trial and error.
5175 */
5176 if (sctx->chip_class <= GFX7 || !has_clear_state) {
5177 si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
5178 si_pm4_set_reg(pm4, R_028204_PA_SC_WINDOW_SCISSOR_TL, S_028204_WINDOW_OFFSET_DISABLE(1));
5179 si_pm4_set_reg(pm4, R_028030_PA_SC_SCREEN_SCISSOR_TL, 0);
5180 si_pm4_set_reg(pm4, R_028034_PA_SC_SCREEN_SCISSOR_BR,
5181 S_028034_BR_X(16384) | S_028034_BR_Y(16384));
5182 }
5183
5184 if (!has_clear_state) {
5185 si_pm4_set_reg(pm4, R_028230_PA_SC_EDGERULE,
5186 S_028230_ER_TRI(0xA) | S_028230_ER_POINT(0xA) | S_028230_ER_RECT(0xA) |
5187 /* Required by DX10_DIAMOND_TEST_ENA: */
5188 S_028230_ER_LINE_LR(0x1A) | S_028230_ER_LINE_RL(0x26) |
5189 S_028230_ER_LINE_TB(0xA) | S_028230_ER_LINE_BT(0xA));
5190 si_pm4_set_reg(pm4, R_028820_PA_CL_NANINF_CNTL, 0);
5191 si_pm4_set_reg(pm4, R_028AC0_DB_SRESULTS_COMPARE_STATE0, 0x0);
5192 si_pm4_set_reg(pm4, R_028AC4_DB_SRESULTS_COMPARE_STATE1, 0x0);
5193 si_pm4_set_reg(pm4, R_028AC8_DB_PRELOAD_CONTROL, 0x0);
5194 si_pm4_set_reg(pm4, R_02800C_DB_RENDER_OVERRIDE, 0);
5195 }
5196
5197 if (sctx->chip_class >= GFX10) {
5198 si_pm4_set_reg(pm4, R_028A98_VGT_DRAW_PAYLOAD_CNTL, 0);
5199 si_pm4_set_reg(pm4, R_030964_GE_MAX_VTX_INDX, ~0);
5200 si_pm4_set_reg(pm4, R_030924_GE_MIN_VTX_INDX, 0);
5201 si_pm4_set_reg(pm4, R_030928_GE_INDX_OFFSET, 0);
5202 si_pm4_set_reg(pm4, R_03097C_GE_STEREO_CNTL, 0);
5203 si_pm4_set_reg(pm4, R_030988_GE_USER_VGPR_EN, 0);
5204 } else if (sctx->chip_class == GFX9) {
5205 si_pm4_set_reg(pm4, R_030920_VGT_MAX_VTX_INDX, ~0);
5206 si_pm4_set_reg(pm4, R_030924_VGT_MIN_VTX_INDX, 0);
5207 si_pm4_set_reg(pm4, R_030928_VGT_INDX_OFFSET, 0);
5208 } else {
5209 /* These registers, when written, also overwrite the CLEAR_STATE
5210 * context, so we can't rely on CLEAR_STATE setting them.
5211 * It would be an issue if there were another UMD changing them.
5212 */
5213 si_pm4_set_reg(pm4, R_028400_VGT_MAX_VTX_INDX, ~0);
5214 si_pm4_set_reg(pm4, R_028404_VGT_MIN_VTX_INDX, 0);
5215 si_pm4_set_reg(pm4, R_028408_VGT_INDX_OFFSET, 0);
5216 }
5217
5218 if (sctx->chip_class >= GFX7) {
5219 if (sctx->chip_class >= GFX10) {
5220 /* Logical CUs 16 - 31 */
5221 si_pm4_set_reg(pm4, R_00B404_SPI_SHADER_PGM_RSRC4_HS, S_00B404_CU_EN(0xffff));
5222 si_pm4_set_reg(pm4, R_00B104_SPI_SHADER_PGM_RSRC4_VS, S_00B104_CU_EN(0xffff));
5223 si_pm4_set_reg(pm4, R_00B004_SPI_SHADER_PGM_RSRC4_PS, S_00B004_CU_EN(0xffff));
5224 }
5225
5226 if (sctx->chip_class >= GFX9) {
5227 si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
5228 S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
5229 } else {
5230 si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
5231 S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
5232 si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_WAVE_LIMIT(0x3F));
5233 si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
5234 S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
5235
5236 /* If this is 0, Bonaire can hang even if GS isn't being used.
5237 * Other chips are unaffected. These are suboptimal values,
5238 * but we don't use on-chip GS.
5239 */
5240 si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
5241 S_028A44_ES_VERTS_PER_SUBGRP(64) | S_028A44_GS_PRIMS_PER_SUBGRP(4));
5242 }
5243
5244 /* Compute LATE_ALLOC_VS.LIMIT. */
5245 unsigned num_cu_per_sh = sscreen->info.num_good_cu_per_sh;
5246 unsigned late_alloc_wave64 = 0; /* The limit is per SH. */
5247 unsigned cu_mask_vs = 0xffff;
5248 unsigned cu_mask_gs = 0xffff;
5249
5250 if (sctx->chip_class >= GFX10) {
5251 /* For Wave32, the hw will launch twice the number of late
5252 * alloc waves, so 1 == 2x wave32.
5253 */
5254 if (!sscreen->info.use_late_alloc) {
5255 late_alloc_wave64 = 0;
5256 } else if (num_cu_per_sh <= 6) {
5257 late_alloc_wave64 = num_cu_per_sh - 2;
5258 } else {
5259 late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
5260
5261 /* CU2 & CU3 disabled because of the dual CU design */
5262 /* Late alloc is not used for NGG on Navi14 due to a hw bug. */
5263 cu_mask_vs = 0xfff3;
5264 cu_mask_gs = sscreen->use_ngg && sctx->family != CHIP_NAVI14 ? 0xfff3 : 0xffff;
5265 }
5266 } else {
5267 if (!sscreen->info.use_late_alloc) {
5268 late_alloc_wave64 = 0;
5269 } else if (num_cu_per_sh <= 4) {
5270 /* Too few available compute units per SH. Disallowing
5271 * VS from running on one CU could hurt us more than late VS
5272 * allocation would help.
5273 *
5274 * 2 is the highest safe number that allows us to keep
5275 * all CUs enabled.
5276 */
5277 late_alloc_wave64 = 2;
5278 } else {
5279 /* This is a good initial value, allowing 1 late_alloc
5280 * wave per SIMD on num_cu - 2.
5281 */
5282 late_alloc_wave64 = (num_cu_per_sh - 2) * 4;
5283 }
5284
5285 if (late_alloc_wave64 > 2)
5286 cu_mask_vs = 0xfffe; /* 1 CU disabled */
5287 }
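/* Example, assuming late alloc is enabled: a GFX10 part with
 * num_cu_per_sh = 10 takes the last branch above and gets
 * late_alloc_wave64 = (10 - 2) * 4 = 32 with cu_mask_vs = 0xfff3. */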
5288
5289 /* VS can't execute on one CU if the limit is > 2. */
5290 si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
5291 S_00B118_CU_EN(cu_mask_vs) | S_00B118_WAVE_LIMIT(0x3F));
5292 si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(late_alloc_wave64));
5293
5294 si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
5295 S_00B21C_CU_EN(cu_mask_gs) | S_00B21C_WAVE_LIMIT(0x3F));
5296
5297 si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
5298 S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
5299 }
5300
5301 if (sctx->chip_class >= GFX10) {
5302 /* Break up a pixel wave if it contains deallocs for more than
5303 * half the parameter cache.
5304 *
5305 * To avoid a deadlock where pixel waves aren't launched
5306 * because they're waiting for more pixels while the frontend
5307 * is stuck waiting for PC space, the maximum allowed value is
5308 * the size of the PC minus the largest possible allocation for
5309 * a single primitive shader subgroup.
5310 */
5311 si_pm4_set_reg(pm4, R_028C50_PA_SC_NGG_MODE_CNTL, S_028C50_MAX_DEALLOCS_IN_WAVE(512));
5312 si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
5313
5314 if (!has_clear_state) {
5315 si_pm4_set_reg(pm4, R_02835C_PA_SC_TILE_STEERING_OVERRIDE,
5316 sscreen->info.pa_sc_tile_steering_override);
5317 }
5318
5319 /* Enable CMASK/FMASK/HTILE/DCC caching in L2 for small chips. */
5320 unsigned meta_write_policy, meta_read_policy;
5321 /* TODO: investigate whether LRU improves performance on other chips too */
5322 if (sscreen->info.num_render_backends <= 4) {
5323 meta_write_policy = V_02807C_CACHE_LRU_WR; /* cache writes */
5324 meta_read_policy = V_02807C_CACHE_LRU_RD; /* cache reads */
5325 } else {
5326 meta_write_policy = V_02807C_CACHE_STREAM_WR; /* write combine */
5327 meta_read_policy = V_02807C_CACHE_NOA_RD; /* don't cache reads */
5328 }
5329
5330 si_pm4_set_reg(pm4, R_02807C_DB_RMI_L2_CACHE_CONTROL,
5331 S_02807C_Z_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
5332 S_02807C_S_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
5333 S_02807C_HTILE_WR_POLICY(meta_write_policy) |
5334 S_02807C_ZPCPSD_WR_POLICY(V_02807C_CACHE_STREAM_WR) |
5335 S_02807C_Z_RD_POLICY(V_02807C_CACHE_NOA_RD) |
5336 S_02807C_S_RD_POLICY(V_02807C_CACHE_NOA_RD) |
5337 S_02807C_HTILE_RD_POLICY(meta_read_policy));
5338
5339 si_pm4_set_reg(
5340 pm4, R_028410_CB_RMI_GL2_CACHE_CONTROL,
5341 S_028410_CMASK_WR_POLICY(meta_write_policy) | S_028410_FMASK_WR_POLICY(meta_write_policy) |
5342 S_028410_DCC_WR_POLICY(meta_write_policy) |
5343 S_028410_COLOR_WR_POLICY(V_028410_CACHE_STREAM_WR) |
5344 S_028410_CMASK_RD_POLICY(meta_read_policy) |
5345 S_028410_FMASK_RD_POLICY(meta_read_policy) | S_028410_DCC_RD_POLICY(meta_read_policy) |
5346 S_028410_COLOR_RD_POLICY(V_028410_CACHE_NOA_RD));
5347 si_pm4_set_reg(pm4, R_028428_CB_COVERAGE_OUT_CONTROL, 0);
5348
5349 si_pm4_set_reg(pm4, R_00B0C0_SPI_SHADER_REQ_CTRL_PS,
5350 S_00B0C0_SOFT_GROUPING_EN(1) | S_00B0C0_NUMBER_OF_REQUESTS_PER_CU(4 - 1));
5351 si_pm4_set_reg(pm4, R_00B1C0_SPI_SHADER_REQ_CTRL_VS, 0);
5352 }
5353
5354 if (sctx->chip_class >= GFX9) {
5355 si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION,
5356 S_028B50_ACCUM_ISOLINE(40) | S_028B50_ACCUM_TRI(30) | S_028B50_ACCUM_QUAD(24) |
5357 S_028B50_DONUT_SPLIT(24) | S_028B50_TRAP_SPLIT(6));
5358 } else if (sctx->chip_class >= GFX8) {
5359 unsigned vgt_tess_distribution;
5360
5361 vgt_tess_distribution = S_028B50_ACCUM_ISOLINE(32) | S_028B50_ACCUM_TRI(11) |
5362 S_028B50_ACCUM_QUAD(11) | S_028B50_DONUT_SPLIT(16);
5363
5364 /* Testing with Unigine Heaven extreme tessellation yielded best results
5365 * with TRAP_SPLIT = 3.
5366 */
5367 if (sctx->family == CHIP_FIJI || sctx->family >= CHIP_POLARIS10)
5368 vgt_tess_distribution |= S_028B50_TRAP_SPLIT(3);
5369
5370 si_pm4_set_reg(pm4, R_028B50_VGT_TESS_DISTRIBUTION, vgt_tess_distribution);
5371 } else if (!has_clear_state) {
5372 si_pm4_set_reg(pm4, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL, 14);
5373 si_pm4_set_reg(pm4, R_028C5C_VGT_OUT_DEALLOC_CNTL, 16);
5374 }
5375
5376 si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
5377 if (sctx->chip_class >= GFX7) {
5378 si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, S_028084_ADDRESS(border_color_va >> 40));
5379 }
5380 si_pm4_add_bo(pm4, sctx->border_color_buffer, RADEON_USAGE_READ, RADEON_PRIO_BORDER_COLORS);
5381
5382 if (sctx->chip_class >= GFX9) {
5383 si_pm4_set_reg(pm4, R_028C48_PA_SC_BINNER_CNTL_1,
5384 S_028C48_MAX_ALLOC_COUNT(sscreen->info.pbb_max_alloc_count - 1) |
5385 S_028C48_MAX_PRIM_PER_BATCH(1023));
5386 si_pm4_set_reg(pm4, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
5387 S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1));
5388 si_pm4_set_reg(pm4, R_030968_VGT_INSTANCE_BASE_ID, 0);
5389 }
5390
5391 si_pm4_upload_indirect_buffer(sctx, pm4);
5392 sctx->init_config = pm4;
5393 }