intel/blorp: Take an isl_swizzle instead of a SWIZZLE
[mesa.git] / src / intel / blorp / blorp_blit.c
1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "compiler/nir/nir_builder.h"
25
26 #include "blorp_priv.h"
27 #include "brw_meta_util.h"
28
29 #define FILE_DEBUG_FLAG DEBUG_BLORP
30
/**
 * Identifiers for the arguments of a sampler message.  The enumerators are
 * numbered consecutively from zero in the order they are listed here.
 */
enum sampler_message_arg {
   SAMPLER_MESSAGE_ARG_U_FLOAT = 0,
   SAMPLER_MESSAGE_ARG_V_FLOAT,
   SAMPLER_MESSAGE_ARG_U_INT,
   SAMPLER_MESSAGE_ARG_V_INT,
   SAMPLER_MESSAGE_ARG_R_INT,
   SAMPLER_MESSAGE_ARG_SI_INT,
   SAMPLER_MESSAGE_ARG_MCS_INT,
   SAMPLER_MESSAGE_ARG_ZERO_INT,
};
45
46 struct brw_blorp_blit_vars {
47 /* Input values from brw_blorp_wm_inputs */
48 nir_variable *v_discard_rect;
49 nir_variable *v_rect_grid;
50 nir_variable *v_coord_transform;
51 nir_variable *v_src_z;
52
53 /* gl_FragCoord */
54 nir_variable *frag_coord;
55
56 /* gl_FragColor */
57 nir_variable *color_out;
58 };
59
60 static void
61 brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v,
62 const struct brw_blorp_blit_prog_key *key)
63 {
64 /* Blended and scaled blits never use pixel discard. */
65 assert(!key->use_kill || !(key->blend && key->blit_scaled));
66
67 #define LOAD_INPUT(name, type)\
68 v->v_##name = nir_variable_create(b->shader, nir_var_shader_in, \
69 type, #name); \
70 v->v_##name->data.interpolation = INTERP_MODE_FLAT; \
71 v->v_##name->data.location = VARYING_SLOT_VAR0 + \
72 offsetof(struct brw_blorp_wm_inputs, name) / (4 * sizeof(float));
73
74 LOAD_INPUT(discard_rect, glsl_vec4_type())
75 LOAD_INPUT(rect_grid, glsl_vec4_type())
76 LOAD_INPUT(coord_transform, glsl_vec4_type())
77 LOAD_INPUT(src_z, glsl_uint_type())
78
79 #undef LOAD_INPUT
80
81 v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in,
82 glsl_vec4_type(), "gl_FragCoord");
83 v->frag_coord->data.location = VARYING_SLOT_POS;
84 v->frag_coord->data.origin_upper_left = true;
85
86 v->color_out = nir_variable_create(b->shader, nir_var_shader_out,
87 glsl_vec4_type(), "gl_FragColor");
88 v->color_out->data.location = FRAG_RESULT_COLOR;
89 }
90
91 static nir_ssa_def *
92 blorp_blit_get_frag_coords(nir_builder *b,
93 const struct brw_blorp_blit_prog_key *key,
94 struct brw_blorp_blit_vars *v)
95 {
96 nir_ssa_def *coord = nir_f2i(b, nir_load_var(b, v->frag_coord));
97
98 if (key->persample_msaa_dispatch) {
99 return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),
100 nir_load_sample_id(b));
101 } else {
102 return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1));
103 }
104 }
105
106 /**
107 * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
108 * coordinates.
109 */
110 static nir_ssa_def *
111 blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos,
112 struct brw_blorp_blit_vars *v)
113 {
114 nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform);
115
116 nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1),
117 nir_channel(b, coord_transform, 3));
118 nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0),
119 nir_channel(b, coord_transform, 2));
120
121 return nir_ffma(b, src_pos, mul, offset);
122 }
123
124 static inline void
125 blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos,
126 struct brw_blorp_blit_vars *v)
127 {
128 nir_ssa_def *c0, *c1, *c2, *c3;
129 nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect);
130 nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0);
131 nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1);
132 nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2);
133 nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3);
134
135 c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0);
136 c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1);
137 c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0);
138 c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1);
139
140 nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3));
141
142 nir_intrinsic_instr *discard =
143 nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
144 discard->src[0] = nir_src_for_ssa(oob);
145 nir_builder_instr_insert(b, &discard->instr);
146 }
147
148 static nir_tex_instr *
149 blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v,
150 nir_texop op, nir_ssa_def *pos, unsigned num_srcs,
151 nir_alu_type dst_type)
152 {
153 nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
154
155 tex->op = op;
156
157 tex->dest_type = dst_type;
158 tex->is_array = false;
159 tex->is_shadow = false;
160
161 /* Blorp only has one texture and it's bound at unit 0 */
162 tex->texture = NULL;
163 tex->sampler = NULL;
164 tex->texture_index = 0;
165 tex->sampler_index = 0;
166
167 /* To properly handle 3-D and 2-D array textures, we pull the Z component
168 * from an input. TODO: This is a bit magic; we should probably make this
169 * more explicit in the future.
170 */
171 assert(pos->num_components >= 2);
172 pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),
173 nir_load_var(b, v->v_src_z));
174
175 tex->src[0].src_type = nir_tex_src_coord;
176 tex->src[0].src = nir_src_for_ssa(pos);
177 tex->coord_components = 3;
178
179 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
180
181 return tex;
182 }
183
184 static nir_ssa_def *
185 blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v,
186 nir_ssa_def *pos, nir_alu_type dst_type)
187 {
188 nir_tex_instr *tex =
189 blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, dst_type);
190
191 assert(pos->num_components == 2);
192 tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
193 tex->src[1].src_type = nir_tex_src_lod;
194 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
195
196 nir_builder_instr_insert(b, &tex->instr);
197
198 return &tex->dest.ssa;
199 }
200
201 static nir_ssa_def *
202 blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v,
203 nir_ssa_def *pos, nir_alu_type dst_type)
204 {
205 nir_tex_instr *tex =
206 blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type);
207
208 tex->sampler_dim = GLSL_SAMPLER_DIM_3D;
209 tex->src[1].src_type = nir_tex_src_lod;
210 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
211
212 nir_builder_instr_insert(b, &tex->instr);
213
214 return &tex->dest.ssa;
215 }
216
217 static nir_ssa_def *
218 blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v,
219 nir_ssa_def *pos, nir_ssa_def *mcs, nir_alu_type dst_type)
220 {
221 nir_tex_instr *tex =
222 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos,
223 mcs != NULL ? 3 : 2, dst_type);
224
225 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
226
227 tex->src[1].src_type = nir_tex_src_ms_index;
228 if (pos->num_components == 2) {
229 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
230 } else {
231 assert(pos->num_components == 3);
232 tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2));
233 }
234
235 if (mcs) {
236 tex->src[2].src_type = nir_tex_src_ms_mcs;
237 tex->src[2].src = nir_src_for_ssa(mcs);
238 }
239
240 nir_builder_instr_insert(b, &tex->instr);
241
242 return &tex->dest.ssa;
243 }
244
245 static nir_ssa_def *
246 blorp_nir_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, nir_ssa_def *pos)
247 {
248 nir_tex_instr *tex =
249 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs,
250 pos, 1, nir_type_int);
251
252 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
253
254 nir_builder_instr_insert(b, &tex->instr);
255
256 return &tex->dest.ssa;
257 }
258
259 static nir_ssa_def *
260 nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src,
261 uint32_t src_mask, int src_left_shift)
262 {
263 nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask));
264
265 nir_ssa_def *shifted;
266 if (src_left_shift > 0) {
267 shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift));
268 } else if (src_left_shift < 0) {
269 shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift));
270 } else {
271 assert(src_left_shift == 0);
272 shifted = masked;
273 }
274
275 return nir_ior(b, dst, shifted);
276 }
277
278 /**
279 * Emit code to compensate for the difference between Y and W tiling.
280 *
281 * This code modifies the X and Y coordinates according to the formula:
282 *
283 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
284 *
285 * (See brw_blorp_build_nir_shader).
286 */
287 static inline nir_ssa_def *
288 blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos)
289 {
290 assert(pos->num_components == 2);
291 nir_ssa_def *x_Y = nir_channel(b, pos, 0);
292 nir_ssa_def *y_Y = nir_channel(b, pos, 1);
293
294 /* Given X and Y coordinates that describe an address using Y tiling,
295 * translate to the X and Y coordinates that describe the same address
296 * using W tiling.
297 *
298 * If we break down the low order bits of X and Y, using a
299 * single letter to represent each low-order bit:
300 *
301 * X = A << 7 | 0bBCDEFGH
302 * Y = J << 5 | 0bKLMNP (1)
303 *
304 * Then we can apply the Y tiling formula to see the memory offset being
305 * addressed:
306 *
307 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)
308 *
309 * If we apply the W detiling formula to this memory location, that the
310 * corresponding X' and Y' coordinates are:
311 *
312 * X' = A << 6 | 0bBCDPFH (3)
313 * Y' = J << 6 | 0bKLMNEG
314 *
315 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
316 * we need to make the following computation:
317 *
318 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)
319 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
320 */
321 nir_ssa_def *x_W = nir_imm_int(b, 0);
322 x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1);
323 x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2);
324 x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0);
325
326 nir_ssa_def *y_W = nir_imm_int(b, 0);
327 y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1);
328 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2);
329 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1);
330
331 return nir_vec2(b, x_W, y_W);
332 }
333
334 /**
335 * Emit code to compensate for the difference between Y and W tiling.
336 *
337 * This code modifies the X and Y coordinates according to the formula:
338 *
339 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
340 *
341 * (See brw_blorp_build_nir_shader).
342 */
343 static inline nir_ssa_def *
344 blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos)
345 {
346 assert(pos->num_components == 2);
347 nir_ssa_def *x_W = nir_channel(b, pos, 0);
348 nir_ssa_def *y_W = nir_channel(b, pos, 1);
349
350 /* Applying the same logic as above, but in reverse, we obtain the
351 * formulas:
352 *
353 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
354 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
355 */
356 nir_ssa_def *x_Y = nir_imm_int(b, 0);
357 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1);
358 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2);
359 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1);
360 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0);
361
362 nir_ssa_def *y_Y = nir_imm_int(b, 0);
363 y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1);
364 y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2);
365
366 return nir_vec2(b, x_Y, y_Y);
367 }
368
369 /**
370 * Emit code to compensate for the difference between MSAA and non-MSAA
371 * surfaces.
372 *
373 * This code modifies the X and Y coordinates according to the formula:
374 *
375 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
376 *
377 * (See brw_blorp_blit_program).
378 */
379 static inline nir_ssa_def *
380 blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos,
381 unsigned num_samples, enum isl_msaa_layout layout)
382 {
383 assert(pos->num_components == 2 || pos->num_components == 3);
384
385 switch (layout) {
386 case ISL_MSAA_LAYOUT_NONE:
387 assert(pos->num_components == 2);
388 return pos;
389 case ISL_MSAA_LAYOUT_ARRAY:
390 /* No translation needed */
391 return pos;
392 case ISL_MSAA_LAYOUT_INTERLEAVED: {
393 nir_ssa_def *x_in = nir_channel(b, pos, 0);
394 nir_ssa_def *y_in = nir_channel(b, pos, 1);
395 nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) :
396 nir_channel(b, pos, 2);
397
398 nir_ssa_def *x_out = nir_imm_int(b, 0);
399 nir_ssa_def *y_out = nir_imm_int(b, 0);
400 switch (num_samples) {
401 case 2:
402 case 4:
403 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
404 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
405 * Y' = Y
406 *
407 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
408 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
409 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
410 */
411 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1);
412 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
413 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
414 if (num_samples == 2) {
415 y_out = y_in;
416 } else {
417 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
418 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
419 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
420 }
421 break;
422
423 case 8:
424 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
425 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
426 * | (X & 0b1)
427 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
428 */
429 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
430 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
431 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
432 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
433 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
434 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
435 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
436 break;
437
438 case 16:
439 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
440 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
441 * | (X & 0b1)
442 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10)
443 * | (Y & 0b1)
444 */
445 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
446 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
447 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
448 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
449 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2);
450 y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1);
451 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
452 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
453 break;
454
455 default:
456 unreachable("Invalid number of samples for IMS layout");
457 }
458
459 return nir_vec2(b, x_out, y_out);
460 }
461
462 default:
463 unreachable("Invalid MSAA layout");
464 }
465 }
466
467 /**
468 * Emit code to compensate for the difference between MSAA and non-MSAA
469 * surfaces.
470 *
471 * This code modifies the X and Y coordinates according to the formula:
472 *
473 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
474 *
475 * (See brw_blorp_blit_program).
476 */
477 static inline nir_ssa_def *
478 blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos,
479 unsigned num_samples, enum isl_msaa_layout layout)
480 {
481 assert(pos->num_components == 2 || pos->num_components == 3);
482
483 switch (layout) {
484 case ISL_MSAA_LAYOUT_NONE:
485 /* No translation necessary, and S should already be zero. */
486 assert(pos->num_components == 2);
487 return pos;
488 case ISL_MSAA_LAYOUT_ARRAY:
489 /* No translation necessary. */
490 return pos;
491 case ISL_MSAA_LAYOUT_INTERLEAVED: {
492 assert(pos->num_components == 2);
493
494 nir_ssa_def *x_in = nir_channel(b, pos, 0);
495 nir_ssa_def *y_in = nir_channel(b, pos, 1);
496
497 nir_ssa_def *x_out = nir_imm_int(b, 0);
498 nir_ssa_def *y_out = nir_imm_int(b, 0);
499 nir_ssa_def *s_out = nir_imm_int(b, 0);
500 switch (num_samples) {
501 case 2:
502 case 4:
503 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
504 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
505 * S = (X & 0b10) >> 1
506 *
507 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
508 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
509 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
510 * S = (Y & 0b10) | (X & 0b10) >> 1
511 */
512 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1);
513 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
514 if (num_samples == 2) {
515 y_out = y_in;
516 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
517 } else {
518 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
519 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
520 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
521 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
522 }
523 break;
524
525 case 8:
526 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
527 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
528 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
529 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
530 */
531 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
532 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
533 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
534 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
535 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
536 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
537 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
538 break;
539
540 case 16:
541 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
542 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
543 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
544 * S = (Y & 0b100) << 1 | (X & 0b100) |
545 * (Y & 0b10) | (X & 0b10) >> 1
546 */
547 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
548 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
549 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2);
550 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
551 s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1);
552 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
553 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
554 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
555 break;
556
557 default:
558 unreachable("Invalid number of samples for IMS layout");
559 }
560
561 return nir_vec3(b, x_out, y_out, s_out);
562 }
563
564 default:
565 unreachable("Invalid MSAA layout");
566 }
567 }
568
/**
 * Count the number of trailing 1 bits in the given value.  For example:
 *
 * count_trailing_one_bits(0) == 0
 * count_trailing_one_bits(7) == 3
 * count_trailing_one_bits(11) == 2
 *
 * When every bit of \p value is set, the result is the width of `unsigned`
 * in bits.
 */
static inline int count_trailing_one_bits(unsigned value)
{
#ifdef HAVE___BUILTIN_CTZ
   /* __builtin_ctz() has an undefined result for an argument of zero, which
    * is what ~value becomes when every bit of value is set.
    */
   if (~value == 0u)
      return (int)(sizeof(value) * 8);

   return __builtin_ctz(~value);
#else
   /* Portable fallback: strip set bits from the bottom until a clear bit is
    * found.  The loop ends once value has been shifted down to zero, so the
    * all-ones input is handled too.
    */
   int count = 0;
   while (value & 1u) {
      ++count;
      value >>= 1;
   }
   return count;
#endif
}
584
/**
 * Emit code that fetches every sample of a multisampled texel and returns
 * the average of the samples.
 *
 * \param pos            position to fetch from; only its X and Y components
 *                       are used
 * \param tex_samples    number of samples to fetch and average
 * \param tex_aux_usage  when ISL_AUX_USAGE_MCS, the MCS value is fetched
 *                       once up front and used both for the sample fetches
 *                       and for the "MCS is zero" early-out below
 * \param dst_type       ALU type of the fetched data; the summation asserts
 *                       that it is nir_type_float
 *
 * The result is stored to and re-loaded from a local "color" variable so
 * that both arms of the early-out can produce it.
 */
static nir_ssa_def *
blorp_nir_manual_blend_average(nir_builder *b, struct brw_blorp_blit_vars *v,
                               nir_ssa_def *pos, unsigned tex_samples,
                               enum isl_aux_usage tex_aux_usage,
                               nir_alu_type dst_type)
{
   /* If non-null, this is the outer-most if statement */
   nir_if *outer_if = NULL;

   nir_variable *color =
      nir_local_variable_create(b->impl, glsl_vec4_type(), "color");

   nir_ssa_def *mcs = NULL;
   if (tex_aux_usage == ISL_AUX_USAGE_MCS)
      mcs = blorp_nir_txf_ms_mcs(b, v, pos);

   /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
    *
    *   result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
    *
    * This ensures that when all samples have the same value, no numerical
    * precision is lost, since each addition operation always adds two equal
    * values, and summing two equal floating point values does not lose
    * precision.
    *
    * We perform this computation by treating the texture_data array as a
    * stack and performing the following operations:
    *
    * - push sample 0 onto stack
    * - push sample 1 onto stack
    * - add top two stack entries
    * - push sample 2 onto stack
    * - push sample 3 onto stack
    * - add top two stack entries
    * - add top two stack entries
    * - divide top stack entry by 4
    *
    * Note that after pushing sample i onto the stack, the number of add
    * operations we do is equal to the number of trailing 1 bits in i.  This
    * works provided the total number of samples is a power of two, which it
    * always is for i965.
    *
    * Only floating-point data is handled here: every add step asserts that
    * dst_type is nir_type_float, and the sum is always scaled by
    * 1 / tex_samples at the end.
    */
   nir_ssa_def *texture_data[5];
   unsigned stack_depth = 0;
   for (unsigned i = 0; i < tex_samples; ++i) {
      assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */

      /* Push sample i onto the stack */
      assert(stack_depth < ARRAY_SIZE(texture_data));

      nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0),
                                     nir_channel(b, pos, 1),
                                     nir_imm_int(b, i));
      texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type);

      if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) {
         /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
          * suggests an optimization:
          *
          *     "A simple optimization with probable large return in
          *     performance is to compare the MCS value to zero (indicating
          *     all samples are on sample slice 0), and sample only from
          *     sample slice 0 using ld2dss if MCS is zero."
          *
          * Note that in the case where the MCS value is zero, sampling from
          * sample slice 0 using ld2dss and sampling from sample 0 using
          * ld2dms are equivalent (since all samples are on sample slice 0).
          * Since we have already sampled from sample 0, all we need to do is
          * skip the remaining fetches and averaging if MCS is zero.
          */
         nir_ssa_def *mcs_zero =
            nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0));
         /* With 16 samples the second MCS component is tested as well. */
         if (tex_samples == 16) {
            mcs_zero = nir_iand(b, mcs_zero,
               nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0)));
         }

         nir_if *if_stmt = nir_if_create(b->shader);
         if_stmt->condition = nir_src_for_ssa(mcs_zero);
         nir_cf_node_insert(b->cursor, &if_stmt->cf_node);

         /* Then-branch (MCS is zero): sample 0 is already the answer. */
         b->cursor = nir_after_cf_list(&if_stmt->then_list);
         nir_store_var(b, color, texture_data[0], 0xf);

         /* Everything emitted from here on, i.e. the remaining fetches and
          * the averaging, lands in the else-branch.
          */
         b->cursor = nir_after_cf_list(&if_stmt->else_list);
         outer_if = if_stmt;
      }

      /* Collapse the stack: one add per trailing 1 bit of i. */
      for (int j = 0; j < count_trailing_one_bits(i); j++) {
         assert(stack_depth >= 2);
         --stack_depth;

         assert(dst_type == nir_type_float);
         texture_data[stack_depth - 1] =
            nir_fadd(b, texture_data[stack_depth - 1],
                        texture_data[stack_depth]);
      }
   }

   /* We should have just 1 sample on the stack now. */
   assert(stack_depth == 1);

   /* Turn the sum into an average. */
   texture_data[0] = nir_fmul(b, texture_data[0],
                              nir_imm_float(b, 1.0 / tex_samples));

   nir_store_var(b, color, texture_data[0], 0xf);

   /* Leave the early-out "if" (when one was emitted) so that the load below
    * comes after both of its branches.
    */
   if (outer_if)
      b->cursor = nir_after_cf_node(&outer_if->cf_node);

   return nir_load_var(b, color);
}
700
701 static inline nir_ssa_def *
702 nir_imm_vec2(nir_builder *build, float x, float y)
703 {
704 nir_const_value v;
705
706 memset(&v, 0, sizeof(v));
707 v.f32[0] = x;
708 v.f32[1] = y;
709
710 return nir_build_imm(build, 4, 32, v);
711 }
712
/**
 * Emit code that bilinearly interpolates between four samples of a
 * multisampled texture, treating the samples of each pixel as a rectangular
 * grid of key->x_scale by key->y_scale sub-positions.
 *
 * \param pos          position to sample at; only X and Y are used
 * \param tex_samples  sample count of the texture; selects the
 *                     sample-index to sample-number remapping (8x and 16x
 *                     need one, other counts use the index directly)
 * \param key          supplies x_scale, y_scale, tex_aux_usage and
 *                     texture_data_type
 * \param v            shader variables; rect_grid provides the clamp limits
 */
static nir_ssa_def *
blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
                                unsigned tex_samples,
                                const struct brw_blorp_blit_prog_key *key,
                                struct brw_blorp_blit_vars *v)
{
   nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3);
   nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid);
   nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale);

   /* Translate coordinates to lay out the samples in a rectangular grid
    * roughly corresponding to sample locations.
    */
   pos_xy = nir_fmul(b, pos_xy, scale);
   /* Adjust coordinates so that integers represent pixel centers rather
    * than pixel edges.
    */
   pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5));
   /* Clamp the X, Y texture coordinates to properly handle the sampling of
    * texels on texture edges.
    */
   pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)),
                        nir_vec2(b, nir_channel(b, rect_grid, 0),
                                    nir_channel(b, rect_grid, 1)));

   /* Store the fractional parts to be used as bilinear interpolation
    * coefficients.
    */
   nir_ssa_def *frac_xy = nir_ffract(b, pos_xy);
   /* Round the float coordinates down to nearest integer, then scale back
    * so that the integer part is the pixel and the fraction selects the
    * sample slot within it.
    */
   pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale);

   nir_ssa_def *tex_data[4];
   for (unsigned i = 0; i < 4; ++i) {
      /* Bit 0 of i selects the X neighbour, bit 1 the Y neighbour; each
       * step is one grid slot, i.e. 1 / scale of a pixel.
       */
      float sample_off_x = (float)(i & 0x1) / key->x_scale;
      float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale;
      nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y);

      nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off);
      nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords);

      /* The MCS value we fetch has to match up with the pixel that we're
       * sampling from. Since we sample from different pixels in each
       * iteration of this "for" loop, the MCS fetch has to be here inside
       * the loop, after computing the pixel coordinates.
       */
      nir_ssa_def *mcs = NULL;
      if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
         mcs = blorp_nir_txf_ms_mcs(b, v, sample_coords_int);

      /* Compute sample index and map the sample index to a sample number.
       * Sample index layout shows the numbering of slots in a rectangular
       * grid of samples within a pixel. Sample number layout shows the
       * rectangular grid of samples roughly corresponding to the real sample
       * locations within a pixel.
       * In case of 4x MSAA, layout of sample indices matches the layout of
       * sample numbers:
       *           ---------
       *           | 0 | 1 |
       *           ---------
       *           | 2 | 3 |
       *           ---------
       *
       * In case of 8x MSAA the two layouts don't match.
       * sample index layout :  ---------    sample number layout :  ---------
       *                        | 0 | 1 |                            | 3 | 7 |
       *                        ---------                            ---------
       *                        | 2 | 3 |                            | 5 | 0 |
       *                        ---------                            ---------
       *                        | 4 | 5 |                            | 1 | 2 |
       *                        ---------                            ---------
       *                        | 6 | 7 |                            | 4 | 6 |
       *                        ---------                            ---------
       *
       * Fortunately, this can be done fairly easily as:
       * S' = (0x64210573 >> (S * 4)) & 0xf
       * (nibble S of the constant is the sample number for index S).
       *
       * In the case of 16x MSAA the two layouts don't match.
       * Sample index layout:                Sample number layout:
       * ---------------------               ---------------------
       * |  0 |  1 |  2 |  3 |               | 15 | 10 |  9 |  7 |
       * ---------------------               ---------------------
       * |  4 |  5 |  6 |  7 |               |  4 |  1 |  3 | 13 |
       * ---------------------               ---------------------
       * |  8 |  9 | 10 | 11 |               | 12 |  2 |  0 |  6 |
       * ---------------------               ---------------------
       * | 12 | 13 | 14 | 15 |               | 11 |  8 |  5 | 14 |
       * ---------------------               ---------------------
       *
       * This is equivalent to
       * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf
       * which the code below evaluates as two 32-bit halves, choosing
       * between them depending on whether S is below 8.
       */
      nir_ssa_def *frac = nir_ffract(b, sample_coords);
      nir_ssa_def *sample =
         nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale,
                                            key->x_scale * key->y_scale));
      sample = nir_f2i(b, sample);

      if (tex_samples == 8) {
         sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573),
                                       nir_ishl(b, sample, nir_imm_int(b, 2))),
                           nir_imm_int(b, 0xf));
      } else if (tex_samples == 16) {
         nir_ssa_def *sample_low =
            nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af),
                                 nir_ishl(b, sample, nir_imm_int(b, 2))),
                     nir_imm_int(b, 0xf));
         nir_ssa_def *sample_high =
            nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c),
                                 nir_ishl(b, nir_iadd(b, sample,
                                                      nir_imm_int(b, -8)),
                                          nir_imm_int(b, 2))),
                     nir_imm_int(b, 0xf));

         sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)),
                            sample_low, sample_high);
      }
      nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0),
                                        nir_channel(b, sample_coords_int, 1),
                                        sample);
      tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type);
   }

   /* Blend horizontally within each row, then vertically between rows. */
   nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0);
   nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1);
   return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x),
                      nir_flrp(b, tex_data[2], tex_data[3], frac_x),
                      frac_y);
}
842
843 /**
844 * Generator for WM programs used in BLORP blits.
845 *
846 * The bulk of the work done by the WM program is to wrap and unwrap the
847 * coordinate transformations used by the hardware to store surfaces in
848 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
849 * sample index for a multisampled surface) to a memory offset by the
850 * following formulas:
851 *
852 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
853 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
854 *
855 * For a single-sampled surface, or for a multisampled surface using
856 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
857 * function:
858 *
859 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
860 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
861 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
862 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
863 *
864 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
865 * embeds the sample number into bit 1 of the X and Y coordinates:
866 *
867 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
868 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
869 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
870 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
871 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
872 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
873 * S = (Y & 0b10) | (X & 0b10) >> 1
874 *
875 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
876 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
877 * the Y coordinate:
878 *
879 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
880 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
881 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
882 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
883 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
884 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
885 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
886 *
887 * For X tiling, tile() combines together the low-order bits of the X and Y
888 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
889 * bytes wide and 8 rows high:
890 *
891 * tile(x_tiled, X, Y, S) = A
892 * where A = tile_num << 12 | offset
893 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
894 * offset = (Y' & 0b111) << 9
895 * | (X & 0b111111111)
896 * X' = X * cpp
897 * Y' = Y + S * qpitch
898 * detile(x_tiled, A) = (X, Y, S)
899 * where X = X' / cpp
900 * Y = Y' % qpitch
901 * S = Y' / qpitch
902 * Y' = (tile_num / tile_pitch) << 3
903 * | (A & 0b111000000000) >> 9
904 * X' = (tile_num % tile_pitch) << 9
905 * | (A & 0b111111111)
906 *
907 * (In all tiling formulas, cpp is the number of bytes occupied by a single
908 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
909 * to fill the width of the surface, and qpitch is the spacing (in rows)
910 * between array slices).
911 *
912 * For Y tiling, tile() combines together the low-order bits of the X and Y
913 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
914 * bytes wide and 32 rows high:
915 *
916 * tile(y_tiled, X, Y, S) = A
917 * where A = tile_num << 12 | offset
918 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
919 * offset = (X' & 0b1110000) << 5
920 * | (Y' & 0b11111) << 4
921 * | (X' & 0b1111)
922 * X' = X * cpp
923 * Y' = Y + S * qpitch
924 * detile(y_tiled, A) = (X, Y, S)
925 * where X = X' / cpp
926 * Y = Y' % qpitch
927 * S = Y' / qpitch
928 * Y' = (tile_num / tile_pitch) << 5
929 * | (A & 0b111110000) >> 4
930 * X' = (tile_num % tile_pitch) << 7
931 * | (A & 0b111000000000) >> 5
932 * | (A & 0b1111)
933 *
934 * For W tiling, tile() combines together the low-order bits of the X and Y
935 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
936 * bytes wide and 64 rows high (note that W tiling is only used for stencil
937 * buffers, which always have cpp = 1 and S=0):
938 *
939 * tile(w_tiled, X, Y, S) = A
940 * where A = tile_num << 12 | offset
941 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
942 * offset = (X' & 0b111000) << 6
943 * | (Y' & 0b111100) << 3
944 * | (X' & 0b100) << 2
945 * | (Y' & 0b10) << 2
946 * | (X' & 0b10) << 1
947 * | (Y' & 0b1) << 1
948 * | (X' & 0b1)
949 * X' = X * cpp = X
950 * Y' = Y + S * qpitch
951 * detile(w_tiled, A) = (X, Y, S)
952 * where X = X' / cpp = X'
953 * Y = Y' % qpitch = Y'
954 * S = Y' / qpitch = 0
955 * Y' = (tile_num / tile_pitch) << 6
956 * | (A & 0b111100000) >> 3
957 * | (A & 0b1000) >> 2
958 * | (A & 0b10) >> 1
959 * X' = (tile_num % tile_pitch) << 6
960 * | (A & 0b111000000000) >> 6
961 * | (A & 0b10000) >> 2
962 * | (A & 0b100) >> 1
963 * | (A & 0b1)
964 *
965 * Finally, for a non-tiled surface, tile() simply combines together the X and
966 * Y coordinates in the natural way:
967 *
968 * tile(untiled, X, Y, S) = A
969 * where A = Y' * pitch + X'
970 * X' = X * cpp
971 * Y' = Y + S * qpitch
972 * detile(untiled, A) = (X, Y, S)
973 * where X = X' / cpp
974 * Y = Y' % qpitch
975 * S = Y' / qpitch
976 * X' = A % pitch
977 * Y' = A / pitch
978 *
979 * (In these formulas, pitch is the number of bytes occupied by a single row
980 * of samples).
981 */
982 static nir_shader *
983 brw_blorp_build_nir_shader(struct blorp_context *blorp,
984 const struct brw_blorp_blit_prog_key *key)
985 {
986 const struct gen_device_info *devinfo = blorp->isl_dev->info;
987 nir_ssa_def *src_pos, *dst_pos, *color;
988
989 /* Sanity checks */
990 if (key->dst_tiled_w && key->rt_samples > 1) {
991 /* If the destination image is W tiled and multisampled, then the thread
992 * must be dispatched once per sample, not once per pixel. This is
993 * necessary because after conversion between W and Y tiling, there's no
994 * guarantee that all samples corresponding to a single pixel will still
995 * be together.
996 */
997 assert(key->persample_msaa_dispatch);
998 }
999
1000 if (key->blend) {
1001 /* We are blending, which means we won't have an opportunity to
1002 * translate the tiling and sample count for the texture surface. So
1003 * the surface state for the texture must be configured with the correct
1004 * tiling and sample count.
1005 */
1006 assert(!key->src_tiled_w);
1007 assert(key->tex_samples == key->src_samples);
1008 assert(key->tex_layout == key->src_layout);
1009 assert(key->tex_samples > 0);
1010 }
1011
1012 if (key->persample_msaa_dispatch) {
1013 /* It only makes sense to do persample dispatch if the render target is
1014 * configured as multisampled.
1015 */
1016 assert(key->rt_samples > 0);
1017 }
1018
1019 /* Make sure layout is consistent with sample count */
1020 assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) ==
1021 (key->tex_samples <= 1));
1022 assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) ==
1023 (key->rt_samples <= 1));
1024 assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) ==
1025 (key->src_samples <= 1));
1026 assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) ==
1027 (key->dst_samples <= 1));
1028
1029 nir_builder b;
1030 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
1031
1032 struct brw_blorp_blit_vars v;
1033 brw_blorp_blit_vars_init(&b, &v, key);
1034
1035 dst_pos = blorp_blit_get_frag_coords(&b, key, &v);
1036
1037 /* Render target and texture hardware don't support W tiling until Gen8. */
1038 const bool rt_tiled_w = false;
1039 const bool tex_tiled_w = devinfo->gen >= 8 && key->src_tiled_w;
1040
1041 /* The address that data will be written to is determined by the
1042 * coordinates supplied to the WM thread and the tiling and sample count of
1043 * the render target, according to the formula:
1044 *
1045 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
1046 *
1047 * If the actual tiling and sample count of the destination surface are not
1048 * the same as the configuration of the render target, then these
1049 * coordinates are wrong and we have to adjust them to compensate for the
1050 * difference.
1051 */
1052 if (rt_tiled_w != key->dst_tiled_w ||
1053 key->rt_samples != key->dst_samples ||
1054 key->rt_layout != key->dst_layout) {
1055 dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,
1056 key->rt_layout);
1057 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1058 if (rt_tiled_w != key->dst_tiled_w)
1059 dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);
1060 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1061 dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,
1062 key->dst_layout);
1063 }
1064
1065 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
1066 *
1067 * That is: X, Y and S now contain the true coordinates and sample index of
1068 * the data that the WM thread should output.
1069 *
1070 * If we need to kill pixels that are outside the destination rectangle,
1071 * now is the time to do it.
1072 */
1073 if (key->use_kill) {
1074 assert(!(key->blend && key->blit_scaled));
1075 blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);
1076 }
1077
1078 src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v);
1079 if (dst_pos->num_components == 3) {
1080 /* The sample coordinate is an integer that we want left alone but
1081 * blorp_blit_apply_transform() blindly applies the transform to all
1082 * three coordinates. Grab the original sample index.
1083 */
1084 src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),
1085 nir_channel(&b, src_pos, 1),
1086 nir_channel(&b, dst_pos, 2));
1087 }
1088
1089 /* If the source image is not multisampled, then we want to fetch sample
1090 * number 0, because that's the only sample there is.
1091 */
1092 if (key->src_samples == 1)
1093 src_pos = nir_channels(&b, src_pos, 0x3);
1094
1095 /* X, Y, and S are now the coordinates of the pixel in the source image
1096 * that we want to texture from. Exception: if we are blending, then S is
1097 * irrelevant, because we are going to fetch all samples.
1098 */
1099 if (key->blend && !key->blit_scaled) {
1100 /* Resolves (effecively) use texelFetch, so we need integers and we
1101 * don't care about the sample index if we got one.
1102 */
1103 src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3));
1104
1105 if (devinfo->gen == 6) {
1106 /* Because gen6 only supports 4x interleved MSAA, we can do all the
1107 * blending we need with a single linear-interpolated texture lookup
1108 * at the center of the sample. The texture coordinates to be odd
1109 * integers so that they correspond to the center of a 2x2 block
1110 * representing the four samples that maxe up a pixel. So we need
1111 * to multiply our X and Y coordinates each by 2 and then add 1.
1112 */
1113 src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1));
1114 src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1));
1115 src_pos = nir_i2f(&b, src_pos);
1116 color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
1117 } else {
1118 /* Gen7+ hardware doesn't automaticaly blend. */
1119 color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples,
1120 key->tex_aux_usage,
1121 key->texture_data_type);
1122 }
1123 } else if (key->blend && key->blit_scaled) {
1124 assert(!key->use_kill);
1125 color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
1126 } else {
1127 if (key->bilinear_filter) {
1128 color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
1129 } else {
1130 /* We're going to use texelFetch, so we need integers */
1131 if (src_pos->num_components == 2) {
1132 src_pos = nir_f2i(&b, src_pos);
1133 } else {
1134 assert(src_pos->num_components == 3);
1135 src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0),
1136 nir_channel(&b, nir_f2i(&b, src_pos), 1),
1137 nir_channel(&b, src_pos, 2));
1138 }
1139
1140 /* We aren't blending, which means we just want to fetch a single
1141 * sample from the source surface. The address that we want to fetch
1142 * from is related to the X, Y and S values according to the formula:
1143 *
1144 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
1145 *
1146 * If the actual tiling and sample count of the source surface are
1147 * not the same as the configuration of the texture, then we need to
1148 * adjust the coordinates to compensate for the difference.
1149 */
1150 if (tex_tiled_w != key->src_tiled_w ||
1151 key->tex_samples != key->src_samples ||
1152 key->tex_layout != key->src_layout) {
1153 src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
1154 key->src_layout);
1155 /* Now (X, Y, S) = detile(src_tiling, offset) */
1156 if (tex_tiled_w != key->src_tiled_w)
1157 src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
1158 /* Now (X, Y, S) = detile(tex_tiling, offset) */
1159 src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
1160 key->tex_layout);
1161 }
1162
1163 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
1164 *
1165 * In other words: X, Y, and S now contain values which, when passed to
1166 * the texturing unit, will cause data to be read from the correct
1167 * memory location. So we can fetch the texel now.
1168 */
1169 if (key->src_samples == 1) {
1170 color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
1171 } else {
1172 nir_ssa_def *mcs = NULL;
1173 if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
1174 mcs = blorp_nir_txf_ms_mcs(&b, &v, src_pos);
1175
1176 color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);
1177 }
1178 }
1179 }
1180
1181 nir_store_var(&b, v.color_out, color, 0xf);
1182
1183 return b.shader;
1184 }
1185
1186 static void
1187 brw_blorp_get_blit_kernel(struct blorp_context *blorp,
1188 struct blorp_params *params,
1189 const struct brw_blorp_blit_prog_key *prog_key)
1190 {
1191 if (blorp->lookup_shader(blorp, prog_key, sizeof(*prog_key),
1192 &params->wm_prog_kernel, &params->wm_prog_data))
1193 return;
1194
1195 const unsigned *program;
1196 unsigned program_size;
1197 struct brw_blorp_prog_data prog_data;
1198
1199 /* Try and compile with NIR first. If that fails, fall back to the old
1200 * method of building shaders manually.
1201 */
1202 nir_shader *nir = brw_blorp_build_nir_shader(blorp, prog_key);
1203 struct brw_wm_prog_key wm_key;
1204 brw_blorp_init_wm_prog_key(&wm_key);
1205 wm_key.tex.compressed_multisample_layout_mask =
1206 prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS;
1207 wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
1208 wm_key.multisample_fbo = prog_key->rt_samples > 1;
1209
1210 program = brw_blorp_compile_nir_shader(blorp, nir, &wm_key, false,
1211 &prog_data, &program_size);
1212
1213 blorp->upload_shader(blorp, prog_key, sizeof(*prog_key),
1214 program, program_size,
1215 &prog_data, sizeof(prog_data),
1216 &params->wm_prog_kernel, &params->wm_prog_data);
1217 }
1218
1219 static void
1220 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform,
1221 GLfloat src0, GLfloat src1,
1222 GLfloat dst0, GLfloat dst1,
1223 bool mirror)
1224 {
1225 float scale = (src1 - src0) / (dst1 - dst0);
1226 if (!mirror) {
1227 /* When not mirroring a coordinate (say, X), we need:
1228 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1229 * Therefore:
1230 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1231 *
1232 * blorp program uses "round toward zero" to convert the
1233 * transformed floating point coordinates to integer coordinates,
1234 * whereas the behaviour we actually want is "round to nearest",
1235 * so 0.5 provides the necessary correction.
1236 */
1237 xform->multiplier = scale;
1238 xform->offset = src0 + (-dst0 + 0.5f) * scale;
1239 } else {
1240 /* When mirroring X we need:
1241 * src_x - src_x0 = dst_x1 - dst_x - 0.5
1242 * Therefore:
1243 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
1244 */
1245 xform->multiplier = -scale;
1246 xform->offset = src0 + (dst1 - 0.5f) * scale;
1247 }
1248 }
1249
1250 static void
1251 surf_convert_to_single_slice(const struct isl_device *isl_dev,
1252 struct brw_blorp_surface_info *info)
1253 {
1254 /* Just bail if we have nothing to do. */
1255 if (info->surf.dim == ISL_SURF_DIM_2D &&
1256 info->view.base_level == 0 && info->view.base_array_layer == 0 &&
1257 info->surf.levels == 0 && info->surf.logical_level0_px.array_len == 0)
1258 return;
1259
1260 uint32_t x_offset_sa, y_offset_sa;
1261 isl_surf_get_image_offset_sa(&info->surf, info->view.base_level,
1262 info->view.base_array_layer, 0,
1263 &x_offset_sa, &y_offset_sa);
1264
1265 uint32_t byte_offset;
1266 isl_tiling_get_intratile_offset_sa(isl_dev, info->surf.tiling,
1267 info->view.format, info->surf.row_pitch,
1268 x_offset_sa, y_offset_sa,
1269 &byte_offset,
1270 &info->tile_x_sa, &info->tile_y_sa);
1271 info->addr.offset += byte_offset;
1272
1273 /* TODO: Once this file gets converted to C, we shouls just use designated
1274 * initializers.
1275 */
1276 struct isl_surf_init_info init_info = { 0, };
1277
1278 init_info.dim = ISL_SURF_DIM_2D;
1279 init_info.format = ISL_FORMAT_R8_UINT;
1280 init_info.width =
1281 minify(info->surf.logical_level0_px.width, info->view.base_level);
1282 init_info.height =
1283 minify(info->surf.logical_level0_px.height, info->view.base_level);
1284 init_info.depth = 1;
1285 init_info.levels = 1;
1286 init_info.array_len = 1;
1287 init_info.samples = info->surf.samples;
1288 init_info.min_pitch = info->surf.row_pitch;
1289 init_info.usage = info->surf.usage;
1290 init_info.tiling_flags = 1 << info->surf.tiling;
1291
1292 isl_surf_init_s(isl_dev, &info->surf, &init_info);
1293 assert(info->surf.row_pitch == init_info.min_pitch);
1294
1295 /* The view is also different now. */
1296 info->view.base_level = 0;
1297 info->view.levels = 1;
1298 info->view.base_array_layer = 0;
1299 info->view.array_len = 1;
1300 }
1301
1302 static void
1303 surf_fake_interleaved_msaa(const struct isl_device *isl_dev,
1304 struct brw_blorp_surface_info *info)
1305 {
1306 assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED);
1307
1308 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1309 surf_convert_to_single_slice(isl_dev, info);
1310
1311 info->surf.logical_level0_px = info->surf.phys_level0_sa;
1312 info->surf.samples = 1;
1313 info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
1314 }
1315
1316 static void
1317 surf_retile_w_to_y(const struct isl_device *isl_dev,
1318 struct brw_blorp_surface_info *info)
1319 {
1320 assert(info->surf.tiling == ISL_TILING_W);
1321
1322 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1323 surf_convert_to_single_slice(isl_dev, info);
1324
1325 /* On gen7+, we don't have interleaved multisampling for color render
1326 * targets so we have to fake it.
1327 *
1328 * TODO: Are we sure we don't also need to fake it on gen6?
1329 */
1330 if (isl_dev->info->gen > 6 &&
1331 info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
1332 info->surf.logical_level0_px = info->surf.phys_level0_sa;
1333 info->surf.samples = 1;
1334 info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
1335 }
1336
1337 if (isl_dev->info->gen == 6) {
1338 /* Gen6 stencil buffers have a very large alignment coming in from the
1339 * miptree. It's out-of-bounds for what the surface state can handle.
1340 * Since we have a single layer and level, it doesn't really matter as
1341 * long as we don't pass a bogus value into isl_surf_fill_state().
1342 */
1343 info->surf.image_alignment_el = isl_extent3d(4, 2, 1);
1344 }
1345
1346 /* Now that we've converted everything to a simple 2-D surface with only
1347 * one miplevel, we can go about retiling it.
1348 */
1349 const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 8 : 4;
1350 info->surf.tiling = ISL_TILING_Y0;
1351 info->surf.logical_level0_px.width =
1352 ALIGN(info->surf.logical_level0_px.width, x_align) * 2;
1353 info->surf.logical_level0_px.height =
1354 ALIGN(info->surf.logical_level0_px.height, y_align) / 2;
1355 info->tile_x_sa *= 2;
1356 info->tile_y_sa /= 2;
1357 }
1358
1359 void
1360 blorp_blit(struct blorp_batch *batch,
1361 const struct blorp_surf *src_surf,
1362 unsigned src_level, unsigned src_layer,
1363 enum isl_format src_format, struct isl_swizzle src_swizzle,
1364 const struct blorp_surf *dst_surf,
1365 unsigned dst_level, unsigned dst_layer,
1366 enum isl_format dst_format,
1367 float src_x0, float src_y0,
1368 float src_x1, float src_y1,
1369 float dst_x0, float dst_y0,
1370 float dst_x1, float dst_y1,
1371 GLenum filter, bool mirror_x, bool mirror_y)
1372 {
1373 const struct gen_device_info *devinfo = batch->blorp->isl_dev->info;
1374
1375 struct blorp_params params;
1376 blorp_params_init(&params);
1377
1378 brw_blorp_surface_info_init(batch->blorp, &params.src, src_surf, src_level,
1379 src_layer, src_format, false);
1380 brw_blorp_surface_info_init(batch->blorp, &params.dst, dst_surf, dst_level,
1381 dst_layer, dst_format, true);
1382
1383 struct brw_blorp_blit_prog_key wm_prog_key;
1384 memset(&wm_prog_key, 0, sizeof(wm_prog_key));
1385
1386 if (isl_format_has_sint_channel(params.src.view.format)) {
1387 wm_prog_key.texture_data_type = nir_type_int;
1388 } else if (isl_format_has_uint_channel(params.src.view.format)) {
1389 wm_prog_key.texture_data_type = nir_type_uint;
1390 } else {
1391 wm_prog_key.texture_data_type = nir_type_float;
1392 }
1393
1394 /* Scaled blitting or not. */
1395 wm_prog_key.blit_scaled =
1396 ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
1397 (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
1398
1399 /* Scaling factors used for bilinear filtering in multisample scaled
1400 * blits.
1401 */
1402 if (params.src.surf.samples == 16)
1403 wm_prog_key.x_scale = 4.0f;
1404 else
1405 wm_prog_key.x_scale = 2.0f;
1406 wm_prog_key.y_scale = params.src.surf.samples / wm_prog_key.x_scale;
1407
1408 if (filter == GL_LINEAR &&
1409 params.src.surf.samples <= 1 && params.dst.surf.samples <= 1)
1410 wm_prog_key.bilinear_filter = true;
1411
1412 if ((params.src.surf.usage & ISL_SURF_USAGE_DEPTH_BIT) == 0 &&
1413 (params.src.surf.usage & ISL_SURF_USAGE_STENCIL_BIT) == 0 &&
1414 !isl_format_has_int_channel(params.src.surf.format) &&
1415 params.src.surf.samples > 1 && params.dst.surf.samples <= 1) {
1416 /* We are downsampling a non-integer color buffer, so blend.
1417 *
1418 * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1419 *
1420 * "If the source formats are integer types or stencil values, a
1421 * single sample's value is selected for each pixel."
1422 *
1423 * This implies we should not blend in that case.
1424 */
1425 wm_prog_key.blend = true;
1426 }
1427
1428 /* src_samples and dst_samples are the true sample counts */
1429 wm_prog_key.src_samples = params.src.surf.samples;
1430 wm_prog_key.dst_samples = params.dst.surf.samples;
1431
1432 wm_prog_key.tex_aux_usage = params.src.aux_usage;
1433
1434 /* src_layout and dst_layout indicate the true MSAA layout used by src and
1435 * dst.
1436 */
1437 wm_prog_key.src_layout = params.src.surf.msaa_layout;
1438 wm_prog_key.dst_layout = params.dst.surf.msaa_layout;
1439
1440 /* Round floating point values to nearest integer to avoid "off by one texel"
1441 * kind of errors when blitting.
1442 */
1443 params.x0 = params.wm_inputs.discard_rect.x0 = roundf(dst_x0);
1444 params.y0 = params.wm_inputs.discard_rect.y0 = roundf(dst_y0);
1445 params.x1 = params.wm_inputs.discard_rect.x1 = roundf(dst_x1);
1446 params.y1 = params.wm_inputs.discard_rect.y1 = roundf(dst_y1);
1447
1448 params.wm_inputs.rect_grid.x1 =
1449 minify(params.src.surf.logical_level0_px.width, src_level) *
1450 wm_prog_key.x_scale - 1.0f;
1451 params.wm_inputs.rect_grid.y1 =
1452 minify(params.src.surf.logical_level0_px.height, src_level) *
1453 wm_prog_key.y_scale - 1.0f;
1454
1455 brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[0],
1456 src_x0, src_x1, dst_x0, dst_x1, mirror_x);
1457 brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[1],
1458 src_y0, src_y1, dst_y0, dst_y1, mirror_y);
1459
1460 /* For some texture types, we need to pass the layer through the sampler. */
1461 params.wm_inputs.src_z = params.src.z_offset;
1462
1463 if (devinfo->gen > 6 &&
1464 params.dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
1465 assert(params.dst.surf.samples > 1);
1466
1467 /* We must expand the rectangle we send through the rendering pipeline,
1468 * to account for the fact that we are mapping the destination region as
1469 * single-sampled when it is in fact multisampled. We must also align
1470 * it to a multiple of the multisampling pattern, because the
1471 * differences between multisampled and single-sampled surface formats
1472 * will mean that pixels are scrambled within the multisampling pattern.
1473 * TODO: what if this makes the coordinates too large?
1474 *
1475 * Note: this only works if the destination surface uses the IMS layout.
1476 * If it's UMS, then we have no choice but to set up the rendering
1477 * pipeline as multisampled.
1478 */
1479 switch (params.dst.surf.samples) {
1480 case 2:
1481 params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1482 params.y0 = ROUND_DOWN_TO(params.y0, 4);
1483 params.x1 = ALIGN(params.x1 * 2, 4);
1484 params.y1 = ALIGN(params.y1, 4);
1485 break;
1486 case 4:
1487 params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1488 params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1489 params.x1 = ALIGN(params.x1 * 2, 4);
1490 params.y1 = ALIGN(params.y1 * 2, 4);
1491 break;
1492 case 8:
1493 params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1494 params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1495 params.x1 = ALIGN(params.x1 * 4, 8);
1496 params.y1 = ALIGN(params.y1 * 2, 4);
1497 break;
1498 case 16:
1499 params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1500 params.y0 = ROUND_DOWN_TO(params.y0 * 4, 8);
1501 params.x1 = ALIGN(params.x1 * 4, 8);
1502 params.y1 = ALIGN(params.y1 * 4, 8);
1503 break;
1504 default:
1505 unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
1506 }
1507
1508 surf_fake_interleaved_msaa(batch->blorp->isl_dev, &params.dst);
1509
1510 wm_prog_key.use_kill = true;
1511 }
1512
1513 if (params.dst.surf.tiling == ISL_TILING_W) {
1514 /* We must modify the rectangle we send through the rendering pipeline
1515 * (and the size and x/y offset of the destination surface), to account
1516 * for the fact that we are mapping it as Y-tiled when it is in fact
1517 * W-tiled.
1518 *
1519 * Both Y tiling and W tiling can be understood as organizations of
1520 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1521 * is different, but the layout of the 32-byte sub-tiles within the 4k
1522 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1523 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide
1524 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1525 *
1526 * Therefore, to account for the layout differences within the 32-byte
1527 * sub-tiles, we must expand the rectangle so the X coordinates of its
1528 * edges are multiples of 8 (the W sub-tile width), and its Y
1529 * coordinates of its edges are multiples of 4 (the W sub-tile height).
1530 * Then we need to scale the X and Y coordinates of the rectangle to
1531 * account for the differences in aspect ratio between the Y and W
1532 * sub-tiles. We need to modify the layer width and height similarly.
1533 *
1534 * A correction needs to be applied when MSAA is in use: since
1535 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1536 * we need to align the Y coordinates to multiples of 8, so that when
1537 * they are divided by two they are still multiples of 4.
1538 *
1539 * Note: Since the x/y offset of the surface will be applied using the
1540 * SURFACE_STATE command packet, it will be invisible to the swizzling
1541 * code in the shader; therefore it needs to be in a multiple of the
1542 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
1543 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1544 * buffer), and the miplevel alignment used for stencil buffers is 8
1545 * pixels horizontally and either 4 or 8 pixels vertically (see
1546 * intel_horizontal_texture_alignment_unit() and
1547 * intel_vertical_texture_alignment_unit()).
1548 *
1549 * Note: Also, since the SURFACE_STATE command packet can only apply
1550 * offsets that are multiples of 4 pixels horizontally and 2 pixels
1551 * vertically, it is important that the offsets will be multiples of
1552 * these sizes after they are converted into Y-tiled coordinates.
1553 * Fortunately they will be, since we know from above that the offsets
1554 * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1555 * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1556 *
1557 * TODO: what if this makes the coordinates (or the texture size) too
1558 * large?
1559 */
1560 const unsigned x_align = 8, y_align = params.dst.surf.samples != 0 ? 8 : 4;
1561 params.x0 = ROUND_DOWN_TO(params.x0, x_align) * 2;
1562 params.y0 = ROUND_DOWN_TO(params.y0, y_align) / 2;
1563 params.x1 = ALIGN(params.x1, x_align) * 2;
1564 params.y1 = ALIGN(params.y1, y_align) / 2;
1565
1566 /* Retile the surface to Y-tiled */
1567 surf_retile_w_to_y(batch->blorp->isl_dev, &params.dst);
1568
1569 wm_prog_key.dst_tiled_w = true;
1570 wm_prog_key.use_kill = true;
1571
1572 if (params.dst.surf.samples > 1) {
1573 /* If the destination surface is a W-tiled multisampled stencil
1574 * buffer that we're mapping as Y tiled, then we need to arrange for
1575 * the WM program to run once per sample rather than once per pixel,
1576 * because the memory layout of related samples doesn't match between
1577 * W and Y tiling.
1578 */
1579 wm_prog_key.persample_msaa_dispatch = true;
1580 }
1581 }
1582
1583 if (devinfo->gen < 8 && params.src.surf.tiling == ISL_TILING_W) {
1584 /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled.
1585 * Broadwell adds support for sampling from stencil.
1586 *
1587 * See the comments above concerning x/y offset alignment for the
1588 * destination surface.
1589 *
1590 * TODO: what if this makes the texture size too large?
1591 */
1592 surf_retile_w_to_y(batch->blorp->isl_dev, &params.src);
1593
1594 wm_prog_key.src_tiled_w = true;
1595 }
1596
1597 /* tex_samples and rt_samples are the sample counts that are set up in
1598 * SURFACE_STATE.
1599 */
1600 wm_prog_key.tex_samples = params.src.surf.samples;
1601 wm_prog_key.rt_samples = params.dst.surf.samples;
1602
1603 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
1604 * use to access the source and destination surfaces.
1605 */
1606 wm_prog_key.tex_layout = params.src.surf.msaa_layout;
1607 wm_prog_key.rt_layout = params.dst.surf.msaa_layout;
1608
1609 if (params.src.surf.samples > 0 && params.dst.surf.samples > 1) {
1610 /* We are blitting from a multisample buffer to a multisample buffer, so
1611 * we must preserve samples within a pixel. This means we have to
1612 * arrange for the WM program to run once per sample rather than once
1613 * per pixel.
1614 */
1615 wm_prog_key.persample_msaa_dispatch = true;
1616 }
1617
1618 brw_blorp_get_blit_kernel(batch->blorp, &params, &wm_prog_key);
1619
1620 params.src.view.swizzle = src_swizzle;
1621
1622 batch->blorp->exec(batch, &params);
1623 }