i965/miptree: Allow get_aux_isl_surf when there is no aux surface
[mesa.git] / src / mesa / drivers / dri / i965 / brw_blorp_blit.cpp
1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "main/context.h"
25 #include "main/teximage.h"
26 #include "main/fbobject.h"
27
28 #include "compiler/nir/nir_builder.h"
29
30 #include "intel_fbo.h"
31
32 #include "brw_blorp.h"
33 #include "brw_context.h"
34 #include "brw_state.h"
35 #include "brw_meta_util.h"
36
37 #define FILE_DEBUG_FLAG DEBUG_BLORP
38
39 static struct intel_mipmap_tree *
40 find_miptree(GLbitfield buffer_bit, struct intel_renderbuffer *irb)
41 {
42 struct intel_mipmap_tree *mt = irb->mt;
43 if (buffer_bit == GL_STENCIL_BUFFER_BIT && mt->stencil_mt)
44 mt = mt->stencil_mt;
45 return mt;
46 }
47
48 static int
49 blorp_get_texture_swizzle(const struct intel_renderbuffer *irb)
50 {
51 return irb->Base.Base._BaseFormat == GL_RGB ?
52 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE) :
53 SWIZZLE_XYZW;
54 }
55
56 static void
57 do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
58 struct intel_renderbuffer *src_irb, mesa_format src_format,
59 struct intel_renderbuffer *dst_irb, mesa_format dst_format,
60 GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
61 GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
62 GLenum filter, bool mirror_x, bool mirror_y)
63 {
64 const struct gl_context *ctx = &brw->ctx;
65
66 /* Find source/dst miptrees */
67 struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_irb);
68 struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_irb);
69
70 const bool do_srgb = ctx->Color.sRGBEnabled;
71
72 /* Do the blit */
73 brw_blorp_blit_miptrees(brw,
74 src_mt, src_irb->mt_level, src_irb->mt_layer,
75 src_format, blorp_get_texture_swizzle(src_irb),
76 dst_mt, dst_irb->mt_level, dst_irb->mt_layer,
77 dst_format,
78 srcX0, srcY0, srcX1, srcY1,
79 dstX0, dstY0, dstX1, dstY1,
80 filter, mirror_x, mirror_y,
81 do_srgb, do_srgb);
82
83 dst_irb->need_downsample = true;
84 }
85
86 static bool
87 try_blorp_blit(struct brw_context *brw,
88 const struct gl_framebuffer *read_fb,
89 const struct gl_framebuffer *draw_fb,
90 GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
91 GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
92 GLenum filter, GLbitfield buffer_bit)
93 {
94 struct gl_context *ctx = &brw->ctx;
95
96 /* Sync up the state of window system buffers. We need to do this before
97 * we go looking for the buffers.
98 */
99 intel_prepare_render(brw);
100
101 bool mirror_x, mirror_y;
102 if (brw_meta_mirror_clip_and_scissor(ctx, read_fb, draw_fb,
103 &srcX0, &srcY0, &srcX1, &srcY1,
104 &dstX0, &dstY0, &dstX1, &dstY1,
105 &mirror_x, &mirror_y))
106 return true;
107
108 /* Find buffers */
109 struct intel_renderbuffer *src_irb;
110 struct intel_renderbuffer *dst_irb;
111 struct intel_mipmap_tree *src_mt;
112 struct intel_mipmap_tree *dst_mt;
113 switch (buffer_bit) {
114 case GL_COLOR_BUFFER_BIT:
115 src_irb = intel_renderbuffer(read_fb->_ColorReadBuffer);
116 for (unsigned i = 0; i < draw_fb->_NumColorDrawBuffers; ++i) {
117 dst_irb = intel_renderbuffer(draw_fb->_ColorDrawBuffers[i]);
118 if (dst_irb)
119 do_blorp_blit(brw, buffer_bit,
120 src_irb, src_irb->Base.Base.Format,
121 dst_irb, dst_irb->Base.Base.Format,
122 srcX0, srcY0, srcX1, srcY1,
123 dstX0, dstY0, dstX1, dstY1,
124 filter, mirror_x, mirror_y);
125 }
126 break;
127 case GL_DEPTH_BUFFER_BIT:
128 src_irb =
129 intel_renderbuffer(read_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
130 dst_irb =
131 intel_renderbuffer(draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
132 src_mt = find_miptree(buffer_bit, src_irb);
133 dst_mt = find_miptree(buffer_bit, dst_irb);
134
135 /* We can't handle format conversions between Z24 and other formats
136 * since we have to lie about the surface format. See the comments in
137 * brw_blorp_surface_info::set().
138 */
139 if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
140 (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT))
141 return false;
142
143 do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
144 dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
145 srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
146 filter, mirror_x, mirror_y);
147 break;
148 case GL_STENCIL_BUFFER_BIT:
149 src_irb =
150 intel_renderbuffer(read_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
151 dst_irb =
152 intel_renderbuffer(draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
153 do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
154 dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
155 srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
156 filter, mirror_x, mirror_y);
157 break;
158 default:
159 unreachable("not reached");
160 }
161
162 return true;
163 }
164
165 bool
166 brw_blorp_copytexsubimage(struct brw_context *brw,
167 struct gl_renderbuffer *src_rb,
168 struct gl_texture_image *dst_image,
169 int slice,
170 int srcX0, int srcY0,
171 int dstX0, int dstY0,
172 int width, int height)
173 {
174 struct gl_context *ctx = &brw->ctx;
175 struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb);
176 struct intel_texture_image *intel_image = intel_texture_image(dst_image);
177
178 /* No pixel transfer operations (zoom, bias, mapping), just a blit */
179 if (brw->ctx._ImageTransferState)
180 return false;
181
182 /* Sync up the state of window system buffers. We need to do this before
183 * we go looking at the src renderbuffer's miptree.
184 */
185 intel_prepare_render(brw);
186
187 struct intel_mipmap_tree *src_mt = src_irb->mt;
188 struct intel_mipmap_tree *dst_mt = intel_image->mt;
189
190 /* There is support for only up to eight samples. */
191 if (src_mt->num_samples > 8 || dst_mt->num_samples > 8)
192 return false;
193
194 /* BLORP is only supported from Gen6 onwards. */
195 if (brw->gen < 6)
196 return false;
197
198 if (_mesa_get_format_base_format(src_rb->Format) !=
199 _mesa_get_format_base_format(dst_image->TexFormat)) {
200 return false;
201 }
202
203 /* We can't handle format conversions between Z24 and other formats since
204 * we have to lie about the surface format. See the comments in
205 * brw_blorp_surface_info::set().
206 */
207 if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
208 (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT)) {
209 return false;
210 }
211
212 if (!brw->format_supported_as_render_target[dst_image->TexFormat])
213 return false;
214
215 /* Source clipping shouldn't be necessary, since copytexsubimage (in
216 * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
217 * takes care of it.
218 *
219 * Destination clipping shouldn't be necessary since the restrictions on
220 * glCopyTexSubImage prevent the user from specifying a destination rectangle
221 * that falls outside the bounds of the destination texture.
222 * See error_check_subtexture_dimensions().
223 */
224
225 int srcY1 = srcY0 + height;
226 int srcX1 = srcX0 + width;
227 int dstX1 = dstX0 + width;
228 int dstY1 = dstY0 + height;
229
230 /* Account for the fact that in the system framebuffer, the origin is at
231 * the lower left.
232 */
233 bool mirror_y = false;
234 if (_mesa_is_winsys_fbo(ctx->ReadBuffer)) {
235 GLint tmp = src_rb->Height - srcY0;
236 srcY0 = src_rb->Height - srcY1;
237 srcY1 = tmp;
238 mirror_y = true;
239 }
240
241 /* Account for face selection and texture view MinLayer */
242 int dst_slice = slice + dst_image->TexObject->MinLayer + dst_image->Face;
243 int dst_level = dst_image->Level + dst_image->TexObject->MinLevel;
244
245 brw_blorp_blit_miptrees(brw,
246 src_mt, src_irb->mt_level, src_irb->mt_layer,
247 src_rb->Format, blorp_get_texture_swizzle(src_irb),
248 dst_mt, dst_level, dst_slice,
249 dst_image->TexFormat,
250 srcX0, srcY0, srcX1, srcY1,
251 dstX0, dstY0, dstX1, dstY1,
252 GL_NEAREST, false, mirror_y,
253 false, false);
254
255 /* If we're copying to a packed depth stencil texture and the source
256 * framebuffer has separate stencil, we need to also copy the stencil data
257 * over.
258 */
259 src_rb = ctx->ReadBuffer->Attachment[BUFFER_STENCIL].Renderbuffer;
260 if (_mesa_get_format_bits(dst_image->TexFormat, GL_STENCIL_BITS) > 0 &&
261 src_rb != NULL) {
262 src_irb = intel_renderbuffer(src_rb);
263 src_mt = src_irb->mt;
264
265 if (src_mt->stencil_mt)
266 src_mt = src_mt->stencil_mt;
267 if (dst_mt->stencil_mt)
268 dst_mt = dst_mt->stencil_mt;
269
270 if (src_mt != dst_mt) {
271 brw_blorp_blit_miptrees(brw,
272 src_mt, src_irb->mt_level, src_irb->mt_layer,
273 src_mt->format,
274 blorp_get_texture_swizzle(src_irb),
275 dst_mt, dst_level, dst_slice,
276 dst_mt->format,
277 srcX0, srcY0, srcX1, srcY1,
278 dstX0, dstY0, dstX1, dstY1,
279 GL_NEAREST, false, mirror_y,
280 false, false);
281 }
282 }
283
284 return true;
285 }
286
287
288 GLbitfield
289 brw_blorp_framebuffer(struct brw_context *brw,
290 struct gl_framebuffer *readFb,
291 struct gl_framebuffer *drawFb,
292 GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
293 GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
294 GLbitfield mask, GLenum filter)
295 {
296 /* BLORP is not supported before Gen6. */
297 if (brw->gen < 6)
298 return mask;
299
300 static GLbitfield buffer_bits[] = {
301 GL_COLOR_BUFFER_BIT,
302 GL_DEPTH_BUFFER_BIT,
303 GL_STENCIL_BUFFER_BIT,
304 };
305
306 for (unsigned int i = 0; i < ARRAY_SIZE(buffer_bits); ++i) {
307 if ((mask & buffer_bits[i]) &&
308 try_blorp_blit(brw, readFb, drawFb,
309 srcX0, srcY0, srcX1, srcY1,
310 dstX0, dstY0, dstX1, dstY1,
311 filter, buffer_bits[i])) {
312 mask &= ~buffer_bits[i];
313 }
314 }
315
316 return mask;
317 }
318
319
/**
 * Enum to specify the order of arguments in a sampler message
 *
 * NOTE(review): nothing in the visible part of this file references these
 * enumerators; the per-value notes below are inferred from the names only
 * and should be confirmed.
 */
enum sampler_message_arg
{
   /* presumably the U and V coordinates as floats */
   SAMPLER_MESSAGE_ARG_U_FLOAT,
   SAMPLER_MESSAGE_ARG_V_FLOAT,
   /* presumably the U, V and R coordinates as integers */
   SAMPLER_MESSAGE_ARG_U_INT,
   SAMPLER_MESSAGE_ARG_V_INT,
   SAMPLER_MESSAGE_ARG_R_INT,
   /* presumably the sample index -- TODO confirm */
   SAMPLER_MESSAGE_ARG_SI_INT,
   /* presumably the MCS value -- TODO confirm */
   SAMPLER_MESSAGE_ARG_MCS_INT,
   /* presumably a constant-zero integer argument -- TODO confirm */
   SAMPLER_MESSAGE_ARG_ZERO_INT,
};
334
/**
 * NIR variables shared by the blit shader building helpers in this file.
 *
 * All members are created by brw_blorp_blit_vars_init().
 */
struct brw_blorp_blit_vars {
   /* Input values from brw_blorp_wm_inputs.  Each one is a flat-interpolated
    * shader input placed at VARYING_SLOT_VAR0 plus an offset derived from
    * the member's position in struct brw_blorp_wm_inputs.
    */
   nir_variable *v_discard_rect;
   nir_variable *v_rect_grid;
   nir_variable *v_coord_transform;
   nir_variable *v_src_z;

   /* gl_FragCoord (VARYING_SLOT_POS, upper-left origin) */
   nir_variable *frag_coord;

   /* gl_FragColor (FRAG_RESULT_COLOR) */
   nir_variable *color_out;
};
348
349 static void
350 brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v,
351 const struct brw_blorp_blit_prog_key *key)
352 {
353 /* Blended and scaled blits never use pixel discard. */
354 assert(!key->use_kill || !(key->blend && key->blit_scaled));
355
356 #define LOAD_INPUT(name, type)\
357 v->v_##name = nir_variable_create(b->shader, nir_var_shader_in, \
358 type, #name); \
359 v->v_##name->data.interpolation = INTERP_MODE_FLAT; \
360 v->v_##name->data.location = VARYING_SLOT_VAR0 + \
361 offsetof(struct brw_blorp_wm_inputs, name) / (4 * sizeof(float));
362
363 LOAD_INPUT(discard_rect, glsl_vec4_type())
364 LOAD_INPUT(rect_grid, glsl_vec4_type())
365 LOAD_INPUT(coord_transform, glsl_vec4_type())
366 LOAD_INPUT(src_z, glsl_uint_type())
367
368 #undef LOAD_INPUT
369
370 v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in,
371 glsl_vec4_type(), "gl_FragCoord");
372 v->frag_coord->data.location = VARYING_SLOT_POS;
373 v->frag_coord->data.origin_upper_left = true;
374
375 v->color_out = nir_variable_create(b->shader, nir_var_shader_out,
376 glsl_vec4_type(), "gl_FragColor");
377 v->color_out->data.location = FRAG_RESULT_COLOR;
378 }
379
380 nir_ssa_def *
381 blorp_blit_get_frag_coords(nir_builder *b,
382 const struct brw_blorp_blit_prog_key *key,
383 struct brw_blorp_blit_vars *v)
384 {
385 nir_ssa_def *coord = nir_f2i(b, nir_load_var(b, v->frag_coord));
386
387 if (key->persample_msaa_dispatch) {
388 return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),
389 nir_load_system_value(b, nir_intrinsic_load_sample_id, 0));
390 } else {
391 return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1));
392 }
393 }
394
395 /**
396 * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
397 * coordinates.
398 */
399 nir_ssa_def *
400 blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos,
401 struct brw_blorp_blit_vars *v)
402 {
403 nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform);
404
405 nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1),
406 nir_channel(b, coord_transform, 3));
407 nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0),
408 nir_channel(b, coord_transform, 2));
409
410 return nir_ffma(b, src_pos, mul, offset);
411 }
412
413 static inline void
414 blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos,
415 struct brw_blorp_blit_vars *v)
416 {
417 nir_ssa_def *c0, *c1, *c2, *c3;
418 nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect);
419 nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0);
420 nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1);
421 nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2);
422 nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3);
423
424 c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0);
425 c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1);
426 c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0);
427 c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1);
428
429 nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3));
430
431 nir_intrinsic_instr *discard =
432 nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
433 discard->src[0] = nir_src_for_ssa(oob);
434 nir_builder_instr_insert(b, &discard->instr);
435 }
436
437 static nir_tex_instr *
438 blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v,
439 nir_texop op, nir_ssa_def *pos, unsigned num_srcs,
440 enum brw_reg_type dst_type)
441 {
442 nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
443
444 tex->op = op;
445
446 switch (dst_type) {
447 case BRW_REGISTER_TYPE_F:
448 tex->dest_type = nir_type_float;
449 break;
450 case BRW_REGISTER_TYPE_D:
451 tex->dest_type = nir_type_int;
452 break;
453 case BRW_REGISTER_TYPE_UD:
454 tex->dest_type = nir_type_uint;
455 break;
456 default:
457 unreachable("Invalid texture return type");
458 }
459
460 tex->is_array = false;
461 tex->is_shadow = false;
462
463 /* Blorp only has one texture and it's bound at unit 0 */
464 tex->texture = NULL;
465 tex->sampler = NULL;
466 tex->texture_index = 0;
467 tex->sampler_index = 0;
468
469 /* To properly handle 3-D and 2-D array textures, we pull the Z component
470 * from an input. TODO: This is a bit magic; we should probably make this
471 * more explicit in the future.
472 */
473 assert(pos->num_components >= 2);
474 pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),
475 nir_load_var(b, v->v_src_z));
476
477 tex->src[0].src_type = nir_tex_src_coord;
478 tex->src[0].src = nir_src_for_ssa(pos);
479 tex->coord_components = 3;
480
481 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
482
483 return tex;
484 }
485
486 static nir_ssa_def *
487 blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v,
488 nir_ssa_def *pos, enum brw_reg_type dst_type)
489 {
490 nir_tex_instr *tex =
491 blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, dst_type);
492
493 assert(pos->num_components == 2);
494 tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
495 tex->src[1].src_type = nir_tex_src_lod;
496 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
497
498 nir_builder_instr_insert(b, &tex->instr);
499
500 return &tex->dest.ssa;
501 }
502
503 static nir_ssa_def *
504 blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v,
505 nir_ssa_def *pos, enum brw_reg_type dst_type)
506 {
507 nir_tex_instr *tex =
508 blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type);
509
510 tex->sampler_dim = GLSL_SAMPLER_DIM_3D;
511 tex->src[1].src_type = nir_tex_src_lod;
512 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
513
514 nir_builder_instr_insert(b, &tex->instr);
515
516 return &tex->dest.ssa;
517 }
518
519 static nir_ssa_def *
520 blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v,
521 nir_ssa_def *pos, nir_ssa_def *mcs, enum brw_reg_type dst_type)
522 {
523 nir_tex_instr *tex =
524 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos,
525 mcs != NULL ? 3 : 2, dst_type);
526
527 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
528
529 tex->src[1].src_type = nir_tex_src_ms_index;
530 if (pos->num_components == 2) {
531 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
532 } else {
533 assert(pos->num_components == 3);
534 tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2));
535 }
536
537 if (mcs) {
538 tex->src[2].src_type = nir_tex_src_ms_mcs;
539 tex->src[2].src = nir_src_for_ssa(mcs);
540 }
541
542 nir_builder_instr_insert(b, &tex->instr);
543
544 return &tex->dest.ssa;
545 }
546
547 static nir_ssa_def *
548 blorp_nir_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, nir_ssa_def *pos)
549 {
550 nir_tex_instr *tex =
551 blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs,
552 pos, 1, BRW_REGISTER_TYPE_D);
553
554 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
555
556 nir_builder_instr_insert(b, &tex->instr);
557
558 return &tex->dest.ssa;
559 }
560
561 static nir_ssa_def *
562 nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src,
563 uint32_t src_mask, int src_left_shift)
564 {
565 nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask));
566
567 nir_ssa_def *shifted;
568 if (src_left_shift > 0) {
569 shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift));
570 } else if (src_left_shift < 0) {
571 shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift));
572 } else {
573 assert(src_left_shift == 0);
574 shifted = masked;
575 }
576
577 return nir_ior(b, dst, shifted);
578 }
579
580 /**
581 * Emit code to compensate for the difference between Y and W tiling.
582 *
583 * This code modifies the X and Y coordinates according to the formula:
584 *
585 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
586 *
587 * (See brw_blorp_build_nir_shader).
588 */
589 static inline nir_ssa_def *
590 blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos)
591 {
592 assert(pos->num_components == 2);
593 nir_ssa_def *x_Y = nir_channel(b, pos, 0);
594 nir_ssa_def *y_Y = nir_channel(b, pos, 1);
595
596 /* Given X and Y coordinates that describe an address using Y tiling,
597 * translate to the X and Y coordinates that describe the same address
598 * using W tiling.
599 *
600 * If we break down the low order bits of X and Y, using a
601 * single letter to represent each low-order bit:
602 *
603 * X = A << 7 | 0bBCDEFGH
604 * Y = J << 5 | 0bKLMNP (1)
605 *
606 * Then we can apply the Y tiling formula to see the memory offset being
607 * addressed:
608 *
609 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)
610 *
611 * If we apply the W detiling formula to this memory location, that the
612 * corresponding X' and Y' coordinates are:
613 *
614 * X' = A << 6 | 0bBCDPFH (3)
615 * Y' = J << 6 | 0bKLMNEG
616 *
617 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
618 * we need to make the following computation:
619 *
620 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)
621 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
622 */
623 nir_ssa_def *x_W = nir_imm_int(b, 0);
624 x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1);
625 x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2);
626 x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0);
627
628 nir_ssa_def *y_W = nir_imm_int(b, 0);
629 y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1);
630 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2);
631 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1);
632
633 return nir_vec2(b, x_W, y_W);
634 }
635
636 /**
637 * Emit code to compensate for the difference between Y and W tiling.
638 *
639 * This code modifies the X and Y coordinates according to the formula:
640 *
641 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
642 *
643 * (See brw_blorp_build_nir_shader).
644 */
645 static inline nir_ssa_def *
646 blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos)
647 {
648 assert(pos->num_components == 2);
649 nir_ssa_def *x_W = nir_channel(b, pos, 0);
650 nir_ssa_def *y_W = nir_channel(b, pos, 1);
651
652 /* Applying the same logic as above, but in reverse, we obtain the
653 * formulas:
654 *
655 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
656 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
657 */
658 nir_ssa_def *x_Y = nir_imm_int(b, 0);
659 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1);
660 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2);
661 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1);
662 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0);
663
664 nir_ssa_def *y_Y = nir_imm_int(b, 0);
665 y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1);
666 y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2);
667
668 return nir_vec2(b, x_Y, y_Y);
669 }
670
671 /**
672 * Emit code to compensate for the difference between MSAA and non-MSAA
673 * surfaces.
674 *
675 * This code modifies the X and Y coordinates according to the formula:
676 *
677 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
678 *
679 * (See brw_blorp_blit_program).
680 */
681 static inline nir_ssa_def *
682 blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos,
683 unsigned num_samples, enum isl_msaa_layout layout)
684 {
685 assert(pos->num_components == 2 || pos->num_components == 3);
686
687 switch (layout) {
688 case ISL_MSAA_LAYOUT_NONE:
689 assert(pos->num_components == 2);
690 return pos;
691 case ISL_MSAA_LAYOUT_ARRAY:
692 /* No translation needed */
693 return pos;
694 case ISL_MSAA_LAYOUT_INTERLEAVED: {
695 nir_ssa_def *x_in = nir_channel(b, pos, 0);
696 nir_ssa_def *y_in = nir_channel(b, pos, 1);
697 nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) :
698 nir_channel(b, pos, 2);
699
700 nir_ssa_def *x_out = nir_imm_int(b, 0);
701 nir_ssa_def *y_out = nir_imm_int(b, 0);
702 switch (num_samples) {
703 case 2:
704 case 4:
705 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
706 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
707 * Y' = Y
708 *
709 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
710 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
711 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
712 */
713 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1);
714 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
715 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
716 if (num_samples == 2) {
717 y_out = y_in;
718 } else {
719 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
720 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
721 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
722 }
723 break;
724
725 case 8:
726 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
727 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
728 * | (X & 0b1)
729 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
730 */
731 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
732 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
733 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
734 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
735 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
736 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
737 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
738 break;
739
740 case 16:
741 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
742 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
743 * | (X & 0b1)
744 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10)
745 * | (Y & 0b1)
746 */
747 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
748 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
749 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
750 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
751 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2);
752 y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1);
753 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
754 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
755 break;
756
757 default:
758 unreachable("Invalid number of samples for IMS layout");
759 }
760
761 return nir_vec2(b, x_out, y_out);
762 }
763
764 default:
765 unreachable("Invalid MSAA layout");
766 }
767 }
768
769 /**
770 * Emit code to compensate for the difference between MSAA and non-MSAA
771 * surfaces.
772 *
773 * This code modifies the X and Y coordinates according to the formula:
774 *
775 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
776 *
777 * (See brw_blorp_blit_program).
778 */
779 static inline nir_ssa_def *
780 blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos,
781 unsigned num_samples, enum isl_msaa_layout layout)
782 {
783 assert(pos->num_components == 2 || pos->num_components == 3);
784
785 switch (layout) {
786 case ISL_MSAA_LAYOUT_NONE:
787 /* No translation necessary, and S should already be zero. */
788 assert(pos->num_components == 2);
789 return pos;
790 case ISL_MSAA_LAYOUT_ARRAY:
791 /* No translation necessary. */
792 return pos;
793 case ISL_MSAA_LAYOUT_INTERLEAVED: {
794 assert(pos->num_components == 2);
795
796 nir_ssa_def *x_in = nir_channel(b, pos, 0);
797 nir_ssa_def *y_in = nir_channel(b, pos, 1);
798
799 nir_ssa_def *x_out = nir_imm_int(b, 0);
800 nir_ssa_def *y_out = nir_imm_int(b, 0);
801 nir_ssa_def *s_out = nir_imm_int(b, 0);
802 switch (num_samples) {
803 case 2:
804 case 4:
805 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
806 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
807 * S = (X & 0b10) >> 1
808 *
809 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
810 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
811 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
812 * S = (Y & 0b10) | (X & 0b10) >> 1
813 */
814 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1);
815 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
816 if (num_samples == 2) {
817 y_out = y_in;
818 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
819 } else {
820 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
821 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
822 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
823 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
824 }
825 break;
826
827 case 8:
828 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
829 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
830 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
831 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
832 */
833 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
834 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
835 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
836 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
837 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
838 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
839 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
840 break;
841
842 case 16:
843 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
844 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
845 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
846 * S = (Y & 0b100) << 1 | (X & 0b100) |
847 * (Y & 0b10) | (X & 0b10) >> 1
848 */
849 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
850 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
851 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2);
852 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
853 s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1);
854 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
855 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
856 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
857 break;
858
859 default:
860 unreachable("Invalid number of samples for IMS layout");
861 }
862
863 return nir_vec3(b, x_out, y_out, s_out);
864 }
865
866 default:
867 unreachable("Invalid MSAA layout");
868 }
869 }
870
/**
 * Count the number of trailing 1 bits in the given value.  For example:
 *
 * count_trailing_one_bits(0) == 0
 * count_trailing_one_bits(7) == 3
 * count_trailing_one_bits(11) == 2
 *
 * A value with every bit set yields the width of `unsigned` in bits.
 */
static inline int count_trailing_one_bits(unsigned value)
{
#ifdef HAVE___BUILTIN_CTZ
   const unsigned inverted = ~value;

   /* __builtin_ctz(0) is undefined, so the all-ones input has to be handled
    * before the builtin is used.
    */
   if (inverted == 0)
      return (int)(sizeof(value) * 8);

   return __builtin_ctz(inverted);
#else
   /* Portable fallback: strip set bits from the bottom until a clear bit
    * (or the end of the word) is reached.
    */
   int count = 0;
   while (value & 1u) {
      ++count;
      value >>= 1;
   }
   return count;
#endif
}
886
887 static nir_ssa_def *
888 blorp_nir_manual_blend_average(nir_builder *b, struct brw_blorp_blit_vars *v,
889 nir_ssa_def *pos, unsigned tex_samples,
890 enum isl_aux_usage tex_aux_usage,
891 enum brw_reg_type dst_type)
892 {
893 /* If non-null, this is the outer-most if statement */
894 nir_if *outer_if = NULL;
895
896 nir_variable *color =
897 nir_local_variable_create(b->impl, glsl_vec4_type(), "color");
898
899 nir_ssa_def *mcs = NULL;
900 if (tex_aux_usage == ISL_AUX_USAGE_MCS)
901 mcs = blorp_nir_txf_ms_mcs(b, v, pos);
902
903 /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
904 *
905 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
906 *
907 * This ensures that when all samples have the same value, no numerical
908 * precision is lost, since each addition operation always adds two equal
909 * values, and summing two equal floating point values does not lose
910 * precision.
911 *
912 * We perform this computation by treating the texture_data array as a
913 * stack and performing the following operations:
914 *
915 * - push sample 0 onto stack
916 * - push sample 1 onto stack
917 * - add top two stack entries
918 * - push sample 2 onto stack
919 * - push sample 3 onto stack
920 * - add top two stack entries
921 * - add top two stack entries
922 * - divide top stack entry by 4
923 *
924 * Note that after pushing sample i onto the stack, the number of add
925 * operations we do is equal to the number of trailing 1 bits in i. This
926 * works provided the total number of samples is a power of two, which it
927 * always is for i965.
928 *
929 * For integer formats, we replace the add operations with average
930 * operations and skip the final division.
931 */
932 nir_ssa_def *texture_data[5];
933 unsigned stack_depth = 0;
934 for (unsigned i = 0; i < tex_samples; ++i) {
935 assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */
936
937 /* Push sample i onto the stack */
938 assert(stack_depth < ARRAY_SIZE(texture_data));
939
940 nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0),
941 nir_channel(b, pos, 1),
942 nir_imm_int(b, i));
943 texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type);
944
945 if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) {
946 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
947 * suggests an optimization:
948 *
949 * "A simple optimization with probable large return in
950 * performance is to compare the MCS value to zero (indicating
951 * all samples are on sample slice 0), and sample only from
952 * sample slice 0 using ld2dss if MCS is zero."
953 *
954 * Note that in the case where the MCS value is zero, sampling from
955 * sample slice 0 using ld2dss and sampling from sample 0 using
956 * ld2dms are equivalent (since all samples are on sample slice 0).
957 * Since we have already sampled from sample 0, all we need to do is
958 * skip the remaining fetches and averaging if MCS is zero.
959 */
960 nir_ssa_def *mcs_zero =
961 nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0));
962 if (tex_samples == 16) {
963 mcs_zero = nir_iand(b, mcs_zero,
964 nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0)));
965 }
966
967 nir_if *if_stmt = nir_if_create(b->shader);
968 if_stmt->condition = nir_src_for_ssa(mcs_zero);
969 nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
970
971 b->cursor = nir_after_cf_list(&if_stmt->then_list);
972 nir_store_var(b, color, texture_data[0], 0xf);
973
974 b->cursor = nir_after_cf_list(&if_stmt->else_list);
975 outer_if = if_stmt;
976 }
977
978 for (int j = 0; j < count_trailing_one_bits(i); j++) {
979 assert(stack_depth >= 2);
980 --stack_depth;
981
982 assert(dst_type == BRW_REGISTER_TYPE_F);
983 texture_data[stack_depth - 1] =
984 nir_fadd(b, texture_data[stack_depth - 1],
985 texture_data[stack_depth]);
986 }
987 }
988
989 /* We should have just 1 sample on the stack now. */
990 assert(stack_depth == 1);
991
992 texture_data[0] = nir_fmul(b, texture_data[0],
993 nir_imm_float(b, 1.0 / tex_samples));
994
995 nir_store_var(b, color, texture_data[0], 0xf);
996
997 if (outer_if)
998 b->cursor = nir_after_cf_node(&outer_if->cf_node);
999
1000 return nir_load_var(b, color);
1001 }
1002
1003 static inline nir_ssa_def *
1004 nir_imm_vec2(nir_builder *build, float x, float y)
1005 {
1006 nir_const_value v;
1007
1008 memset(&v, 0, sizeof(v));
1009 v.f32[0] = x;
1010 v.f32[1] = y;
1011
1012 return nir_build_imm(build, 4, 32, v);
1013 }
1014
1015 static nir_ssa_def *
1016 blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
1017 unsigned tex_samples,
1018 const brw_blorp_blit_prog_key *key,
1019 struct brw_blorp_blit_vars *v)
1020 {
1021 nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3);
1022 nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid);
1023 nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale);
1024
1025 /* Translate coordinates to lay out the samples in a rectangular grid
1026 * roughly corresponding to sample locations.
1027 */
1028 pos_xy = nir_fmul(b, pos_xy, scale);
1029 /* Adjust coordinates so that integers represent pixel centers rather
1030 * than pixel edges.
1031 */
1032 pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5));
1033 /* Clamp the X, Y texture coordinates to properly handle the sampling of
1034 * texels on texture edges.
1035 */
1036 pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)),
1037 nir_vec2(b, nir_channel(b, rect_grid, 0),
1038 nir_channel(b, rect_grid, 1)));
1039
1040 /* Store the fractional parts to be used as bilinear interpolation
1041 * coefficients.
1042 */
1043 nir_ssa_def *frac_xy = nir_ffract(b, pos_xy);
1044 /* Round the float coordinates down to nearest integer */
1045 pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale);
1046
1047 nir_ssa_def *tex_data[4];
1048 for (unsigned i = 0; i < 4; ++i) {
1049 float sample_off_x = (float)(i & 0x1) / key->x_scale;
1050 float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale;
1051 nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y);
1052
1053 nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off);
1054 nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords);
1055
1056 /* The MCS value we fetch has to match up with the pixel that we're
1057 * sampling from. Since we sample from different pixels in each
1058 * iteration of this "for" loop, the call to mcs_fetch() should be
1059 * here inside the loop after computing the pixel coordinates.
1060 */
1061 nir_ssa_def *mcs = NULL;
1062 if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
1063 mcs = blorp_nir_txf_ms_mcs(b, v, sample_coords_int);
1064
1065 /* Compute sample index and map the sample index to a sample number.
1066 * Sample index layout shows the numbering of slots in a rectangular
1067 * grid of samples with in a pixel. Sample number layout shows the
1068 * rectangular grid of samples roughly corresponding to the real sample
1069 * locations with in a pixel.
1070 * In case of 4x MSAA, layout of sample indices matches the layout of
1071 * sample numbers:
1072 * ---------
1073 * | 0 | 1 |
1074 * ---------
1075 * | 2 | 3 |
1076 * ---------
1077 *
1078 * In case of 8x MSAA the two layouts don't match.
1079 * sample index layout : --------- sample number layout : ---------
1080 * | 0 | 1 | | 3 | 7 |
1081 * --------- ---------
1082 * | 2 | 3 | | 5 | 0 |
1083 * --------- ---------
1084 * | 4 | 5 | | 1 | 2 |
1085 * --------- ---------
1086 * | 6 | 7 | | 4 | 6 |
1087 * --------- ---------
1088 *
1089 * Fortunately, this can be done fairly easily as:
1090 * S' = (0x17306425 >> (S * 4)) & 0xf
1091 *
1092 * In the case of 16x MSAA the two layouts don't match.
1093 * Sample index layout: Sample number layout:
1094 * --------------------- ---------------------
1095 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 7 |
1096 * --------------------- ---------------------
1097 * | 4 | 5 | 6 | 7 | | 4 | 1 | 3 | 13 |
1098 * --------------------- ---------------------
1099 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 |
1100 * --------------------- ---------------------
1101 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 |
1102 * --------------------- ---------------------
1103 *
1104 * This is equivalent to
1105 * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf
1106 */
1107 nir_ssa_def *frac = nir_ffract(b, sample_coords);
1108 nir_ssa_def *sample =
1109 nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale,
1110 key->x_scale * key->y_scale));
1111 sample = nir_f2i(b, sample);
1112
1113 if (tex_samples == 8) {
1114 sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573),
1115 nir_ishl(b, sample, nir_imm_int(b, 2))),
1116 nir_imm_int(b, 0xf));
1117 } else if (tex_samples == 16) {
1118 nir_ssa_def *sample_low =
1119 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af),
1120 nir_ishl(b, sample, nir_imm_int(b, 2))),
1121 nir_imm_int(b, 0xf));
1122 nir_ssa_def *sample_high =
1123 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c),
1124 nir_ishl(b, nir_iadd(b, sample,
1125 nir_imm_int(b, -8)),
1126 nir_imm_int(b, 2))),
1127 nir_imm_int(b, 0xf));
1128
1129 sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)),
1130 sample_low, sample_high);
1131 }
1132 nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0),
1133 nir_channel(b, sample_coords_int, 1),
1134 sample);
1135 tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type);
1136 }
1137
1138 nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0);
1139 nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1);
1140 return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x),
1141 nir_flrp(b, tex_data[2], tex_data[3], frac_x),
1142 frac_y);
1143 }
1144
1145 /**
1146 * Generator for WM programs used in BLORP blits.
1147 *
1148 * The bulk of the work done by the WM program is to wrap and unwrap the
1149 * coordinate transformations used by the hardware to store surfaces in
1150 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
1151 * sample index for a multisampled surface) to a memory offset by the
1152 * following formulas:
1153 *
1154 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
1155 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
1156 *
1157 * For a single-sampled surface, or for a multisampled surface using
1158 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
1159 * function:
1160 *
1161 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1162 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1163 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1164 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1165 *
1166 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1167 * embeds the sample number into bit 1 of the X and Y coordinates:
1168 *
1169 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1170 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1171 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
1172 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1173 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
1174 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1175 * S = (Y & 0b10) | (X & 0b10) >> 1
1176 *
1177 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1178 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
1179 * the Y coordinate:
1180 *
1181 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1182 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
1183 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1184 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1185 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
1186 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1187 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1188 *
1189 * For X tiling, tile() combines together the low-order bits of the X and Y
1190 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
1191 * bytes wide and 8 rows high:
1192 *
1193 * tile(x_tiled, X, Y, S) = A
1194 * where A = tile_num << 12 | offset
1195 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
1196 * offset = (Y' & 0b111) << 9
1197 * | (X' & 0b111111111)
1198 * X' = X * cpp
1199 * Y' = Y + S * qpitch
1200 * detile(x_tiled, A) = (X, Y, S)
1201 * where X = X' / cpp
1202 * Y = Y' % qpitch
1203 * S = Y' / qpitch
1204 * Y' = (tile_num / tile_pitch) << 3
1205 * | (A & 0b111000000000) >> 9
1206 * X' = (tile_num % tile_pitch) << 9
1207 * | (A & 0b111111111)
1208 *
1209 * (In all tiling formulas, cpp is the number of bytes occupied by a single
1210 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
1211 * to fill the width of the surface, and qpitch is the spacing (in rows)
1212 * between array slices).
1213 *
1214 * For Y tiling, tile() combines together the low-order bits of the X and Y
1215 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
1216 * bytes wide and 32 rows high:
1217 *
1218 * tile(y_tiled, X, Y, S) = A
1219 * where A = tile_num << 12 | offset
1220 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
1221 * offset = (X' & 0b1110000) << 5
1222 * | (Y' & 0b11111) << 4
1223 * | (X' & 0b1111)
1224 * X' = X * cpp
1225 * Y' = Y + S * qpitch
1226 * detile(y_tiled, A) = (X, Y, S)
1227 * where X = X' / cpp
1228 * Y = Y' % qpitch
1229 * S = Y' / qpitch
1230 * Y' = (tile_num / tile_pitch) << 5
1231 * | (A & 0b111110000) >> 4
1232 * X' = (tile_num % tile_pitch) << 7
1233 * | (A & 0b111000000000) >> 5
1234 * | (A & 0b1111)
1235 *
1236 * For W tiling, tile() combines together the low-order bits of the X and Y
1237 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
1238 * bytes wide and 64 rows high (note that W tiling is only used for stencil
1239 * buffers, which always have cpp = 1 and S=0):
1240 *
1241 * tile(w_tiled, X, Y, S) = A
1242 * where A = tile_num << 12 | offset
1243 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
1244 * offset = (X' & 0b111000) << 6
1245 * | (Y' & 0b111100) << 3
1246 * | (X' & 0b100) << 2
1247 * | (Y' & 0b10) << 2
1248 * | (X' & 0b10) << 1
1249 * | (Y' & 0b1) << 1
1250 * | (X' & 0b1)
1251 * X' = X * cpp = X
1252 * Y' = Y + S * qpitch
1253 * detile(w_tiled, A) = (X, Y, S)
1254 * where X = X' / cpp = X'
1255 * Y = Y' % qpitch = Y'
1256 * S = Y' / qpitch = 0
1257 * Y' = (tile_num / tile_pitch) << 6
1258 * | (A & 0b111100000) >> 3
1259 * | (A & 0b1000) >> 2
1260 * | (A & 0b10) >> 1
1261 * X' = (tile_num % tile_pitch) << 6
1262 * | (A & 0b111000000000) >> 6
1263 * | (A & 0b10000) >> 2
1264 * | (A & 0b100) >> 1
1265 * | (A & 0b1)
1266 *
1267 * Finally, for a non-tiled surface, tile() simply combines together the X and
1268 * Y coordinates in the natural way:
1269 *
1270 * tile(untiled, X, Y, S) = A
1271 * where A = Y' * pitch + X'
1272 * X' = X * cpp
1273 * Y' = Y + S * qpitch
1274 * detile(untiled, A) = (X, Y, S)
1275 * where X = X' / cpp
1276 * Y = Y' % qpitch
1277 * S = Y' / qpitch
1278 * X' = A % pitch
1279 * Y' = A / pitch
1280 *
1281 * (In these formulas, pitch is the number of bytes occupied by a single row
1282 * of samples).
1283 */
static nir_shader *
brw_blorp_build_nir_shader(struct brw_context *brw,
                           const brw_blorp_blit_prog_key *key)
{
   /* Builds the blit fragment shader described by "key" and returns the
    * nir_shader owned by the local builder (b.shader).  "brw" is only
    * consulted for the hardware generation.
    */
   nir_ssa_def *src_pos, *dst_pos, *color;

   /* Sanity checks */
   if (key->dst_tiled_w && key->rt_samples > 1) {
      /* If the destination image is W tiled and multisampled, then the thread
       * must be dispatched once per sample, not once per pixel.  This is
       * necessary because after conversion between W and Y tiling, there's no
       * guarantee that all samples corresponding to a single pixel will still
       * be together.
       */
      assert(key->persample_msaa_dispatch);
   }

   if (key->blend) {
      /* We are blending, which means we won't have an opportunity to
       * translate the tiling and sample count for the texture surface.  So
       * the surface state for the texture must be configured with the correct
       * tiling and sample count.
       */
      assert(!key->src_tiled_w);
      assert(key->tex_samples == key->src_samples);
      assert(key->tex_layout == key->src_layout);
      assert(key->tex_samples > 0);
   }

   if (key->persample_msaa_dispatch) {
      /* It only makes sense to do persample dispatch if the render target is
       * configured as multisampled.
       */
      assert(key->rt_samples > 0);
   }

   /* Make sure layout is consistent with sample count: a layout of NONE must
    * go together with a sample count of at most 1, and vice versa.
    */
   assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->tex_samples <= 1));
   assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->rt_samples <= 1));
   assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->src_samples <= 1));
   assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) ==
          (key->dst_samples <= 1));

   nir_builder b;
   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);

   struct brw_blorp_blit_vars v;
   brw_blorp_blit_vars_init(&b, &v, key);

   dst_pos = blorp_blit_get_frag_coords(&b, key, &v);

   /* Render target and texture hardware don't support W tiling until Gen8.
    * The render target is never configured as W tiled here; the texture is
    * only on Gen8+ when the source really is W tiled.
    */
   const bool rt_tiled_w = false;
   const bool tex_tiled_w = brw->gen >= 8 && key->src_tiled_w;

   /* The address that data will be written to is determined by the
    * coordinates supplied to the WM thread and the tiling and sample count of
    * the render target, according to the formula:
    *
    * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
    *
    * If the actual tiling and sample count of the destination surface are not
    * the same as the configuration of the render target, then these
    * coordinates are wrong and we have to adjust them to compensate for the
    * difference.
    */
   if (rt_tiled_w != key->dst_tiled_w ||
       key->rt_samples != key->dst_samples ||
       key->rt_layout != key->dst_layout) {
      dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,
                                      key->rt_layout);
      /* Now (X, Y, S) = detile(rt_tiling, offset) */
      if (rt_tiled_w != key->dst_tiled_w)
         dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);
      /* Now (X, Y, S) = detile(dst_tiling, offset) */
      dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,
                                      key->dst_layout);
   }

   /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
    *
    * That is: X, Y and S now contain the true coordinates and sample index of
    * the data that the WM thread should output.
    *
    * If we need to kill pixels that are outside the destination rectangle,
    * now is the time to do it.
    */
   if (key->use_kill) {
      /* Killing is never combined with a scaled blend. */
      assert(!(key->blend && key->blit_scaled));
      blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);
   }

   src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v);
   if (dst_pos->num_components == 3) {
      /* The sample coordinate is an integer that we want left alone but
       * blorp_blit_apply_transform() blindly applies the transform to all
       * three coordinates.  Grab the original sample index.
       */
      src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),
                             nir_channel(&b, src_pos, 1),
                             nir_channel(&b, dst_pos, 2));
   }

   /* If the source image is not multisampled, then we want to fetch sample
    * number 0, because that's the only sample there is.  Dropping the third
    * channel leaves just X and Y.
    */
   if (key->src_samples == 0)
      src_pos = nir_channels(&b, src_pos, 0x3);

   /* X, Y, and S are now the coordinates of the pixel in the source image
    * that we want to texture from.  Exception: if we are blending, then S is
    * irrelevant, because we are going to fetch all samples.
    */
   if (key->blend && !key->blit_scaled) {
      /* Resolves (effectively) use texelFetch, so we need integers and we
       * don't care about the sample index if we got one.
       */
      src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3));

      if (brw->gen == 6) {
         /* Because gen6 only supports 4x interleaved MSAA, we can do all the
          * blending we need with a single linear-interpolated texture lookup
          * at the center of the sample.  The texture coordinates need to be
          * odd integers so that they correspond to the center of a 2x2 block
          * representing the four samples that make up a pixel.  So we need
          * to multiply our X and Y coordinates each by 2 and then add 1.
          */
         src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1));
         src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1));
         src_pos = nir_i2f(&b, src_pos);
         color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
      } else {
         /* Gen7+ hardware doesn't automatically blend. */
         color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples,
                                                key->tex_aux_usage,
                                                key->texture_data_type);
      }
   } else if (key->blend && key->blit_scaled) {
      /* Scaled resolve: blend the samples with a manual bilinear filter. */
      assert(!key->use_kill);
      color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
   } else {
      if (key->bilinear_filter) {
         /* Filtered lookup: the float coordinates are used as they are. */
         color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
      } else {
         /* We're going to use texelFetch, so we need integers.  With three
          * components only X and Y are converted; the third channel is taken
          * from src_pos unchanged.
          */
         if (src_pos->num_components == 2) {
            src_pos = nir_f2i(&b, src_pos);
         } else {
            assert(src_pos->num_components == 3);
            src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0),
                                   nir_channel(&b, nir_f2i(&b, src_pos), 1),
                                   nir_channel(&b, src_pos, 2));
         }

         /* We aren't blending, which means we just want to fetch a single
          * sample from the source surface.  The address that we want to fetch
          * from is related to the X, Y and S values according to the formula:
          *
          * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
          *
          * If the actual tiling and sample count of the source surface are
          * not the same as the configuration of the texture, then we need to
          * adjust the coordinates to compensate for the difference.
          */
         if (tex_tiled_w != key->src_tiled_w ||
             key->tex_samples != key->src_samples ||
             key->tex_layout != key->src_layout) {
            src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
                                            key->src_layout);
            /* Now (X, Y, S) = detile(src_tiling, offset) */
            if (tex_tiled_w != key->src_tiled_w)
               src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
            /* Now (X, Y, S) = detile(tex_tiling, offset) */
            src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
                                            key->tex_layout);
         }

         /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
          *
          * In other words: X, Y, and S now contain values which, when passed to
          * the texturing unit, will cause data to be read from the correct
          * memory location.  So we can fetch the texel now.
          */
         if (key->src_samples == 0) {
            color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
         } else {
            /* An MCS value is only fetched when the texture uses MCS as its
             * auxiliary surface; otherwise NULL is passed along.
             */
            nir_ssa_def *mcs = NULL;
            if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
               mcs = blorp_nir_txf_ms_mcs(&b, &v, src_pos);

            color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);
         }
      }
   }

   /* Write all four channels of the result to the color output. */
   nir_store_var(&b, v.color_out, color, 0xf);

   return b.shader;
}
1486
1487 static void
1488 brw_blorp_get_blit_kernel(struct brw_context *brw,
1489 struct brw_blorp_params *params,
1490 const struct brw_blorp_blit_prog_key *prog_key)
1491 {
1492 if (brw_search_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
1493 prog_key, sizeof(*prog_key),
1494 &params->wm_prog_kernel, &params->wm_prog_data))
1495 return;
1496
1497 const unsigned *program;
1498 unsigned program_size;
1499 struct brw_blorp_prog_data prog_data;
1500
1501 /* Try and compile with NIR first. If that fails, fall back to the old
1502 * method of building shaders manually.
1503 */
1504 nir_shader *nir = brw_blorp_build_nir_shader(brw, prog_key);
1505 struct brw_wm_prog_key wm_key;
1506 brw_blorp_init_wm_prog_key(&wm_key);
1507 wm_key.tex.compressed_multisample_layout_mask =
1508 prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS;
1509 wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
1510 wm_key.multisample_fbo = prog_key->rt_samples > 1;
1511
1512 program = brw_blorp_compile_nir_shader(brw, nir, &wm_key, false,
1513 &prog_data, &program_size);
1514
1515 brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
1516 prog_key, sizeof(*prog_key),
1517 program, program_size,
1518 &prog_data, sizeof(prog_data),
1519 &params->wm_prog_kernel, &params->wm_prog_data);
1520 }
1521
1522 static void
1523 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform,
1524 GLfloat src0, GLfloat src1,
1525 GLfloat dst0, GLfloat dst1,
1526 bool mirror)
1527 {
1528 float scale = (src1 - src0) / (dst1 - dst0);
1529 if (!mirror) {
1530 /* When not mirroring a coordinate (say, X), we need:
1531 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1532 * Therefore:
1533 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1534 *
1535 * blorp program uses "round toward zero" to convert the
1536 * transformed floating point coordinates to integer coordinates,
1537 * whereas the behaviour we actually want is "round to nearest",
1538 * so 0.5 provides the necessary correction.
1539 */
1540 xform->multiplier = scale;
1541 xform->offset = src0 + (-dst0 + 0.5f) * scale;
1542 } else {
1543 /* When mirroring X we need:
1544 * src_x - src_x0 = dst_x1 - dst_x - 0.5
1545 * Therefore:
1546 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
1547 */
1548 xform->multiplier = -scale;
1549 xform->offset = src0 + (dst1 - 0.5f) * scale;
1550 }
1551 }
1552
1553 static enum isl_msaa_layout
1554 get_isl_msaa_layout(unsigned samples, enum intel_msaa_layout layout)
1555 {
1556 if (samples > 1) {
1557 switch (layout) {
1558 case INTEL_MSAA_LAYOUT_NONE:
1559 return ISL_MSAA_LAYOUT_NONE;
1560 case INTEL_MSAA_LAYOUT_IMS:
1561 return ISL_MSAA_LAYOUT_INTERLEAVED;
1562 case INTEL_MSAA_LAYOUT_UMS:
1563 case INTEL_MSAA_LAYOUT_CMS:
1564 return ISL_MSAA_LAYOUT_ARRAY;
1565 default:
1566 unreachable("Invalid MSAA layout");
1567 }
1568 } else {
1569 return ISL_MSAA_LAYOUT_NONE;
1570 }
1571 }
1572
1573 /**
1574 * Convert an swizzle enumeration (i.e. SWIZZLE_X) to one of the Gen7.5+
1575 * "Shader Channel Select" enumerations (i.e. HSW_SCS_RED). The mappings are
1576 *
1577 * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
1578 * 0 1 2 3 4 5
1579 * 4 5 6 7 0 1
1580 * SCS_RED, SCS_GREEN, SCS_BLUE, SCS_ALPHA, SCS_ZERO, SCS_ONE
1581 *
1582 * which is simply adding 4 then modding by 8 (or anding with 7).
1583 *
1584 * We then may need to apply workarounds for textureGather hardware bugs.
1585 */
1586 static enum isl_channel_select
1587 swizzle_to_scs(GLenum swizzle)
1588 {
1589 return (enum isl_channel_select)((swizzle + 4) & 7);
1590 }
1591
1592 static void
1593 surf_convert_to_single_slice(struct brw_context *brw,
1594 struct brw_blorp_surface_info *info)
1595 {
1596 /* This only makes sense for a single level and array slice */
1597 assert(info->view.levels == 1 && info->view.array_len == 1);
1598
1599 /* Just bail if we have nothing to do. */
1600 if (info->surf.dim == ISL_SURF_DIM_2D &&
1601 info->view.base_level == 0 && info->view.base_array_layer == 0 &&
1602 info->surf.levels == 0 && info->surf.logical_level0_px.array_len == 0)
1603 return;
1604
1605 uint32_t x_offset_sa, y_offset_sa;
1606 blorp_get_image_offset_sa(&brw->isl_dev, &info->surf, info->view.base_level,
1607 info->view.base_array_layer,
1608 &x_offset_sa, &y_offset_sa);
1609
1610 isl_tiling_get_intratile_offset_sa(&brw->isl_dev, info->surf.tiling,
1611 info->view.format, info->surf.row_pitch,
1612 x_offset_sa, y_offset_sa,
1613 &info->bo_offset,
1614 &info->tile_x_sa, &info->tile_y_sa);
1615
1616 /* TODO: Once this file gets converted to C, we shouls just use designated
1617 * initializers.
1618 */
1619 struct isl_surf_init_info init_info = isl_surf_init_info();
1620
1621 init_info.dim = ISL_SURF_DIM_2D;
1622 init_info.format = ISL_FORMAT_R8_UINT;
1623 init_info.width =
1624 minify(info->surf.logical_level0_px.width, info->view.base_level);
1625 init_info.height =
1626 minify(info->surf.logical_level0_px.height, info->view.base_level);
1627 init_info.depth = 1;
1628 init_info.levels = 1;
1629 init_info.array_len = 1;
1630 init_info.samples = info->surf.samples;
1631 init_info.min_pitch = info->surf.row_pitch;
1632 init_info.usage = info->surf.usage;
1633 init_info.tiling_flags = 1 << info->surf.tiling;
1634
1635 isl_surf_init_s(&brw->isl_dev, &info->surf, &init_info);
1636 assert(info->surf.row_pitch == init_info.min_pitch);
1637
1638 /* The view is also different now. */
1639 info->view.base_level = 0;
1640 info->view.levels = 1;
1641 info->view.base_array_layer = 0;
1642 info->view.array_len = 1;
1643 }
1644
1645 static void
1646 surf_fake_interleaved_msaa(struct brw_context *brw,
1647 struct brw_blorp_surface_info *info)
1648 {
1649 assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED);
1650
1651 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1652 surf_convert_to_single_slice(brw, info);
1653
1654 info->surf.logical_level0_px = info->surf.phys_level0_sa;
1655 info->surf.samples = 1;
1656 info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
1657 }
1658
1659 static void
1660 surf_retile_w_to_y(struct brw_context *brw,
1661 struct brw_blorp_surface_info *info)
1662 {
1663 assert(info->surf.tiling == ISL_TILING_W);
1664
1665 /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1666 surf_convert_to_single_slice(brw, info);
1667
1668 /* On gen7+, we don't have interleaved multisampling for color render
1669 * targets so we have to fake it.
1670 *
1671 * TODO: Are we sure we don't also need to fake it on gen6?
1672 */
1673 if (brw->gen > 6 && info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
1674 info->surf.logical_level0_px = info->surf.phys_level0_sa;
1675 info->surf.samples = 1;
1676 info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
1677 }
1678
1679 if (brw->gen == 6) {
1680 /* Gen6 stencil buffers have a very large alignment coming in from the
1681 * miptree. It's out-of-bounds for what the surface state can handle.
1682 * Since we have a single layer and level, it doesn't really matter as
1683 * long as we don't pass a bogus value into isl_surf_fill_state().
1684 */
1685 info->surf.image_alignment_el = isl_extent3d(4, 2, 1);
1686 }
1687
1688 /* Now that we've converted everything to a simple 2-D surface with only
1689 * one miplevel, we can go about retiling it.
1690 */
1691 const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 8 : 4;
1692 info->surf.tiling = ISL_TILING_Y0;
1693 info->surf.logical_level0_px.width =
1694 ALIGN(info->surf.logical_level0_px.width, x_align) * 2;
1695 info->surf.logical_level0_px.height =
1696 ALIGN(info->surf.logical_level0_px.height, y_align) / 2;
1697 info->tile_x_sa *= 2;
1698 info->tile_y_sa /= 2;
1699 }
1700
1701 /**
1702 * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
1703 * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
1704 * the physical layer holding sample 0. So, for example, if
1705 * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
1706 * 4*n.
1707 */
1708 void
1709 brw_blorp_blit_miptrees(struct brw_context *brw,
1710 struct intel_mipmap_tree *src_mt,
1711 unsigned src_level, unsigned src_layer,
1712 mesa_format src_format, int src_swizzle,
1713 struct intel_mipmap_tree *dst_mt,
1714 unsigned dst_level, unsigned dst_layer,
1715 mesa_format dst_format,
1716 float src_x0, float src_y0,
1717 float src_x1, float src_y1,
1718 float dst_x0, float dst_y0,
1719 float dst_x1, float dst_y1,
1720 GLenum filter, bool mirror_x, bool mirror_y,
1721 bool decode_srgb, bool encode_srgb)
1722 {
1723 /* Get ready to blit. This includes depth resolving the src and dst
1724 * buffers if necessary. Note: it's not necessary to do a color resolve on
1725 * the destination buffer because we use the standard render path to render
1726 * to destination color buffers, and the standard render path is
1727 * fast-color-aware.
1728 */
1729 intel_miptree_resolve_color(brw, src_mt, INTEL_MIPTREE_IGNORE_CCS_E);
1730 intel_miptree_slice_resolve_depth(brw, src_mt, src_level, src_layer);
1731 intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_layer);
1732
1733 intel_miptree_prepare_mcs(brw, dst_mt);
1734
1735 DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
1736 "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
1737 __func__,
1738 src_mt->num_samples, _mesa_get_format_name(src_mt->format), src_mt,
1739 src_level, src_layer, src_x0, src_y0, src_x1, src_y1,
1740 dst_mt->num_samples, _mesa_get_format_name(dst_mt->format), dst_mt,
1741 dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1,
1742 mirror_x, mirror_y);
1743
1744 if (!decode_srgb && _mesa_get_format_color_encoding(src_format) == GL_SRGB)
1745 src_format = _mesa_get_srgb_format_linear(src_format);
1746
1747 if (!encode_srgb && _mesa_get_format_color_encoding(dst_format) == GL_SRGB)
1748 dst_format = _mesa_get_srgb_format_linear(dst_format);
1749
1750 struct brw_blorp_params params;
1751 brw_blorp_params_init(&params);
1752
1753 brw_blorp_surface_info_init(brw, &params.src, src_mt, src_level,
1754 src_layer, src_format, false);
1755 brw_blorp_surface_info_init(brw, &params.dst, dst_mt, dst_level,
1756 dst_layer, dst_format, true);
1757
1758 /* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
1759 * texture, the above code configures the source format for L32_FLOAT or
1760 * I32_FLOAT, and the destination format for R32_FLOAT. On Sandy Bridge,
1761 * the SAMPLE message appears to handle multisampled L32_FLOAT and
1762 * I32_FLOAT textures incorrectly, resulting in blocky artifacts. So work
1763 * around the problem by using a source format of R32_FLOAT. This
1764 * shouldn't affect rendering correctness, since the destination format is
1765 * R32_FLOAT, so only the contents of the red channel matters.
1766 */
1767 if (brw->gen == 6 &&
1768 params.src.surf.samples > 1 && params.dst.surf.samples <= 1 &&
1769 src_mt->format == dst_mt->format &&
1770 params.dst.view.format == ISL_FORMAT_R32_FLOAT) {
1771 params.src.view.format = params.dst.view.format;
1772 }
1773
1774 struct brw_blorp_blit_prog_key wm_prog_key;
1775 memset(&wm_prog_key, 0, sizeof(wm_prog_key));
1776
1777 /* texture_data_type indicates the register type that should be used to
1778 * manipulate texture data.
1779 */
1780 switch (_mesa_get_format_datatype(src_mt->format)) {
1781 case GL_UNSIGNED_NORMALIZED:
1782 case GL_SIGNED_NORMALIZED:
1783 case GL_FLOAT:
1784 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1785 break;
1786 case GL_UNSIGNED_INT:
1787 if (src_mt->format == MESA_FORMAT_S_UINT8) {
1788 /* We process stencil as though it's an unsigned normalized color */
1789 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1790 } else {
1791 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_UD;
1792 }
1793 break;
1794 case GL_INT:
1795 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_D;
1796 break;
1797 default:
1798 unreachable("Unrecognized blorp format");
1799 }
1800
1801 /* Scaled blitting or not. */
1802 wm_prog_key.blit_scaled =
1803 ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
1804 (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
1805
1806 /* Scaling factors used for bilinear filtering in multisample scaled
1807 * blits.
1808 */
1809 if (src_mt->num_samples == 16)
1810 wm_prog_key.x_scale = 4.0f;
1811 else
1812 wm_prog_key.x_scale = 2.0f;
1813 wm_prog_key.y_scale = src_mt->num_samples / wm_prog_key.x_scale;
1814
1815 if (filter == GL_LINEAR &&
1816 params.src.surf.samples <= 1 && params.dst.surf.samples <= 1)
1817 wm_prog_key.bilinear_filter = true;
1818
1819 GLenum base_format = _mesa_get_format_base_format(src_mt->format);
1820 if (base_format != GL_DEPTH_COMPONENT && /* TODO: what about depth/stencil? */
1821 base_format != GL_STENCIL_INDEX &&
1822 !_mesa_is_format_integer(src_mt->format) &&
1823 src_mt->num_samples > 1 && dst_mt->num_samples <= 1) {
1824 /* We are downsampling a non-integer color buffer, so blend.
1825 *
1826 * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1827 *
1828 * "If the source formats are integer types or stencil values, a
1829 * single sample's value is selected for each pixel."
1830 *
1831 * This implies we should not blend in that case.
1832 */
1833 wm_prog_key.blend = true;
1834 }
1835
1836 /* src_samples and dst_samples are the true sample counts */
1837 wm_prog_key.src_samples = src_mt->num_samples;
1838 wm_prog_key.dst_samples = dst_mt->num_samples;
1839
1840 wm_prog_key.tex_aux_usage = params.src.aux_usage;
1841
1842 /* src_layout and dst_layout indicate the true MSAA layout used by src and
1843 * dst.
1844 */
1845 wm_prog_key.src_layout = get_isl_msaa_layout(src_mt->num_samples,
1846 src_mt->msaa_layout);
1847 wm_prog_key.dst_layout = get_isl_msaa_layout(dst_mt->num_samples,
1848 dst_mt->msaa_layout);
1849
1850 /* Round floating point values to nearest integer to avoid "off by one texel"
1851 * kind of errors when blitting.
1852 */
1853 params.x0 = params.wm_inputs.discard_rect.x0 = roundf(dst_x0);
1854 params.y0 = params.wm_inputs.discard_rect.y0 = roundf(dst_y0);
1855 params.x1 = params.wm_inputs.discard_rect.x1 = roundf(dst_x1);
1856 params.y1 = params.wm_inputs.discard_rect.y1 = roundf(dst_y1);
1857
1858 params.wm_inputs.rect_grid.x1 =
1859 minify(src_mt->logical_width0, src_level) * wm_prog_key.x_scale - 1.0f;
1860 params.wm_inputs.rect_grid.y1 =
1861 minify(src_mt->logical_height0, src_level) * wm_prog_key.y_scale - 1.0f;
1862
1863 brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[0],
1864 src_x0, src_x1, dst_x0, dst_x1, mirror_x);
1865 brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[1],
1866 src_y0, src_y1, dst_y0, dst_y1, mirror_y);
1867
1868 /* For some texture types, we need to pass the layer through the sampler. */
1869 params.wm_inputs.src_z = params.src.z_offset;
1870
1871 if (brw->gen > 6 &&
1872 params.dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
1873 assert(params.dst.surf.samples > 1);
1874
1875 /* We must expand the rectangle we send through the rendering pipeline,
1876 * to account for the fact that we are mapping the destination region as
1877 * single-sampled when it is in fact multisampled. We must also align
1878 * it to a multiple of the multisampling pattern, because the
1879 * differences between multisampled and single-sampled surface formats
1880 * will mean that pixels are scrambled within the multisampling pattern.
1881 * TODO: what if this makes the coordinates too large?
1882 *
1883 * Note: this only works if the destination surface uses the IMS layout.
1884 * If it's UMS, then we have no choice but to set up the rendering
1885 * pipeline as multisampled.
1886 */
1887 switch (params.dst.surf.samples) {
1888 case 2:
1889 params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1890 params.y0 = ROUND_DOWN_TO(params.y0, 4);
1891 params.x1 = ALIGN(params.x1 * 2, 4);
1892 params.y1 = ALIGN(params.y1, 4);
1893 break;
1894 case 4:
1895 params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1896 params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1897 params.x1 = ALIGN(params.x1 * 2, 4);
1898 params.y1 = ALIGN(params.y1 * 2, 4);
1899 break;
1900 case 8:
1901 params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1902 params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1903 params.x1 = ALIGN(params.x1 * 4, 8);
1904 params.y1 = ALIGN(params.y1 * 2, 4);
1905 break;
1906 case 16:
1907 params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1908 params.y0 = ROUND_DOWN_TO(params.y0 * 4, 8);
1909 params.x1 = ALIGN(params.x1 * 4, 8);
1910 params.y1 = ALIGN(params.y1 * 4, 8);
1911 break;
1912 default:
1913 unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
1914 }
1915
1916 surf_fake_interleaved_msaa(brw, &params.dst);
1917
1918 wm_prog_key.use_kill = true;
1919 }
1920
1921 if (params.dst.surf.tiling == ISL_TILING_W) {
1922 /* We must modify the rectangle we send through the rendering pipeline
1923 * (and the size and x/y offset of the destination surface), to account
1924 * for the fact that we are mapping it as Y-tiled when it is in fact
1925 * W-tiled.
1926 *
1927 * Both Y tiling and W tiling can be understood as organizations of
1928 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1929 * is different, but the layout of the 32-byte sub-tiles within the 4k
1930 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1931 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide
1932 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1933 *
1934 * Therefore, to account for the layout differences within the 32-byte
1935 * sub-tiles, we must expand the rectangle so the X coordinates of its
1936 * edges are multiples of 8 (the W sub-tile width), and the Y
1937 * coordinates of its edges are multiples of 4 (the W sub-tile height).
1938 * Then we need to scale the X and Y coordinates of the rectangle to
1939 * account for the differences in aspect ratio between the Y and W
1940 * sub-tiles. We need to modify the layer width and height similarly.
1941 *
1942 * A correction needs to be applied when MSAA is in use: since
1943 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1944 * we need to align the Y coordinates to multiples of 8, so that when
1945 * they are divided by two they are still multiples of 4.
1946 *
1947 * Note: Since the x/y offset of the surface will be applied using the
1948 * SURFACE_STATE command packet, it will be invisible to the swizzling
1949 * code in the shader; therefore it needs to be in a multiple of the
1950 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
1951 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1952 * buffer), and the miplevel alignment used for stencil buffers is 8
1953 * pixels horizontally and either 4 or 8 pixels vertically (see
1954 * intel_horizontal_texture_alignment_unit() and
1955 * intel_vertical_texture_alignment_unit()).
1956 *
1957 * Note: Also, since the SURFACE_STATE command packet can only apply
1958 * offsets that are multiples of 4 pixels horizontally and 2 pixels
1959 * vertically, it is important that the offsets will be multiples of
1960 * these sizes after they are converted into Y-tiled coordinates.
1961 * Fortunately they will be, since we know from above that the offsets
1962 * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1963 * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1964 *
1965 * TODO: what if this makes the coordinates (or the texture size) too
1966 * large?
1967 */
1968 const unsigned x_align = 8, y_align = params.dst.surf.samples != 0 ? 8 : 4;
1969 params.x0 = ROUND_DOWN_TO(params.x0, x_align) * 2;
1970 params.y0 = ROUND_DOWN_TO(params.y0, y_align) / 2;
1971 params.x1 = ALIGN(params.x1, x_align) * 2;
1972 params.y1 = ALIGN(params.y1, y_align) / 2;
1973
1974 /* Retile the surface to Y-tiled */
1975 surf_retile_w_to_y(brw, &params.dst);
1976
1977 wm_prog_key.dst_tiled_w = true;
1978 wm_prog_key.use_kill = true;
1979
1980 if (params.dst.surf.samples > 1) {
1981 /* If the destination surface is a W-tiled multisampled stencil
1982 * buffer that we're mapping as Y tiled, then we need to arrange for
1983 * the WM program to run once per sample rather than once per pixel,
1984 * because the memory layout of related samples doesn't match between
1985 * W and Y tiling.
1986 */
1987 wm_prog_key.persample_msaa_dispatch = true;
1988 }
1989 }
1990
1991 if (brw->gen < 8 && params.src.surf.tiling == ISL_TILING_W) {
1992 /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled.
1993 * Broadwell adds support for sampling from stencil.
1994 *
1995 * See the comments above concerning x/y offset alignment for the
1996 * destination surface.
1997 *
1998 * TODO: what if this makes the texture size too large?
1999 */
2000 surf_retile_w_to_y(brw, &params.src);
2001
2002 wm_prog_key.src_tiled_w = true;
2003 }
2004
2005 /* tex_samples and rt_samples are the sample counts that are set up in
2006 * SURFACE_STATE.
2007 */
2008 wm_prog_key.tex_samples = params.src.surf.samples;
2009 wm_prog_key.rt_samples = params.dst.surf.samples;
2010
2011 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
2012 * use to access the source and destination surfaces.
2013 */
2014 wm_prog_key.tex_layout = params.src.surf.msaa_layout;
2015 wm_prog_key.rt_layout = params.dst.surf.msaa_layout;
2016
2017 if (params.src.surf.samples > 0 && params.dst.surf.samples > 1) {
2018 /* We are blitting from a multisample buffer to a multisample buffer, so
2019 * we must preserve samples within a pixel. This means we have to
2020 * arrange for the WM program to run once per sample rather than once
2021 * per pixel.
2022 */
2023 wm_prog_key.persample_msaa_dispatch = true;
2024 }
2025
2026 brw_blorp_get_blit_kernel(brw, &params, &wm_prog_key);
2027
2028 for (unsigned i = 0; i < 4; i++) {
2029 params.src.view.channel_select[i] =
2030 swizzle_to_scs(GET_SWZ(src_swizzle, i));
2031 }
2032
2033 brw_blorp_exec(brw, &params);
2034
2035 intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_layer);
2036
2037 if (intel_miptree_is_lossless_compressed(brw, dst_mt))
2038 dst_mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_UNRESOLVED;
2039 }