mesa.git @ ce1a5d2d0954692c097e0b1c8b49096e1ebd79f2
src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
1 /*
2 * Copyright © 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "main/context.h"
25 #include "main/teximage.h"
26 #include "main/fbobject.h"
27
28 #include "compiler/nir/nir_builder.h"
29
30 #include "intel_fbo.h"
31
32 #include "brw_blorp.h"
33 #include "brw_context.h"
34 #include "brw_state.h"
35 #include "brw_meta_util.h"
36
37 #define FILE_DEBUG_FLAG DEBUG_BLORP
38
39 static struct intel_mipmap_tree *
40 find_miptree(GLbitfield buffer_bit, struct intel_renderbuffer *irb)
41 {
42 struct intel_mipmap_tree *mt = irb->mt;
43 if (buffer_bit == GL_STENCIL_BUFFER_BIT && mt->stencil_mt)
44 mt = mt->stencil_mt;
45 return mt;
46 }
47
48 static int
49 blorp_get_texture_swizzle(const struct intel_renderbuffer *irb)
50 {
51 return irb->Base.Base._BaseFormat == GL_RGB ?
52 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE) :
53 SWIZZLE_XYZW;
54 }
55
56 static void
57 do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
58 struct intel_renderbuffer *src_irb, mesa_format src_format,
59 struct intel_renderbuffer *dst_irb, mesa_format dst_format,
60 GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
61 GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
62 GLenum filter, bool mirror_x, bool mirror_y)
63 {
64 /* Find source/dst miptrees */
65 struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_irb);
66 struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_irb);
67
68 const bool es3 = _mesa_is_gles3(&brw->ctx);
69 /* Do the blit */
70 brw_blorp_blit_miptrees(brw,
71 src_mt, src_irb->mt_level, src_irb->mt_layer,
72 src_format, blorp_get_texture_swizzle(src_irb),
73 dst_mt, dst_irb->mt_level, dst_irb->mt_layer,
74 dst_format,
75 srcX0, srcY0, srcX1, srcY1,
76 dstX0, dstY0, dstX1, dstY1,
77 filter, mirror_x, mirror_y,
78 es3, es3);
79
80 dst_irb->need_downsample = true;
81 }
82
83 static bool
84 try_blorp_blit(struct brw_context *brw,
85 const struct gl_framebuffer *read_fb,
86 const struct gl_framebuffer *draw_fb,
87 GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
88 GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
89 GLenum filter, GLbitfield buffer_bit)
90 {
91 struct gl_context *ctx = &brw->ctx;
92
93 /* Sync up the state of window system buffers. We need to do this before
94 * we go looking for the buffers.
95 */
96 intel_prepare_render(brw);
97
98 bool mirror_x, mirror_y;
99 if (brw_meta_mirror_clip_and_scissor(ctx, read_fb, draw_fb,
100 &srcX0, &srcY0, &srcX1, &srcY1,
101 &dstX0, &dstY0, &dstX1, &dstY1,
102 &mirror_x, &mirror_y))
103 return true;
104
105 /* Find buffers */
106 struct intel_renderbuffer *src_irb;
107 struct intel_renderbuffer *dst_irb;
108 struct intel_mipmap_tree *src_mt;
109 struct intel_mipmap_tree *dst_mt;
110 switch (buffer_bit) {
111 case GL_COLOR_BUFFER_BIT:
112 src_irb = intel_renderbuffer(read_fb->_ColorReadBuffer);
113 for (unsigned i = 0; i < draw_fb->_NumColorDrawBuffers; ++i) {
114 dst_irb = intel_renderbuffer(draw_fb->_ColorDrawBuffers[i]);
115 if (dst_irb)
116 do_blorp_blit(brw, buffer_bit,
117 src_irb, src_irb->Base.Base.Format,
118 dst_irb, dst_irb->Base.Base.Format,
119 srcX0, srcY0, srcX1, srcY1,
120 dstX0, dstY0, dstX1, dstY1,
121 filter, mirror_x, mirror_y);
122 }
123 break;
124 case GL_DEPTH_BUFFER_BIT:
125 src_irb =
126 intel_renderbuffer(read_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
127 dst_irb =
128 intel_renderbuffer(draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
129 src_mt = find_miptree(buffer_bit, src_irb);
130 dst_mt = find_miptree(buffer_bit, dst_irb);
131
132 /* We can't handle format conversions between Z24 and other formats
133 * since we have to lie about the surface format. See the comments in
134 * brw_blorp_surface_info::set().
135 */
136 if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
137 (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT))
138 return false;
139
140 do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
141 dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
142 srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
143 filter, mirror_x, mirror_y);
144 break;
145 case GL_STENCIL_BUFFER_BIT:
146 src_irb =
147 intel_renderbuffer(read_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
148 dst_irb =
149 intel_renderbuffer(draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
150 do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
151 dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
152 srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
153 filter, mirror_x, mirror_y);
154 break;
155 default:
156 unreachable("not reached");
157 }
158
159 return true;
160 }
161
162 bool
163 brw_blorp_copytexsubimage(struct brw_context *brw,
164 struct gl_renderbuffer *src_rb,
165 struct gl_texture_image *dst_image,
166 int slice,
167 int srcX0, int srcY0,
168 int dstX0, int dstY0,
169 int width, int height)
170 {
171 struct gl_context *ctx = &brw->ctx;
172 struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb);
173 struct intel_texture_image *intel_image = intel_texture_image(dst_image);
174
175 /* No pixel transfer operations (zoom, bias, mapping), just a blit */
176 if (brw->ctx._ImageTransferState)
177 return false;
178
179 /* Sync up the state of window system buffers. We need to do this before
180 * we go looking at the src renderbuffer's miptree.
181 */
182 intel_prepare_render(brw);
183
184 struct intel_mipmap_tree *src_mt = src_irb->mt;
185 struct intel_mipmap_tree *dst_mt = intel_image->mt;
186
187 /* There is support for only up to eight samples. */
188 if (src_mt->num_samples > 8 || dst_mt->num_samples > 8)
189 return false;
190
191 /* BLORP is only supported from Gen6 onwards. */
192 if (brw->gen < 6)
193 return false;
194
195 if (_mesa_get_format_base_format(src_rb->Format) !=
196 _mesa_get_format_base_format(dst_image->TexFormat)) {
197 return false;
198 }
199
200 /* We can't handle format conversions between Z24 and other formats since
201 * we have to lie about the surface format. See the comments in
202 * brw_blorp_surface_info::set().
203 */
204 if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
205 (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT)) {
206 return false;
207 }
208
209 if (!brw->format_supported_as_render_target[dst_image->TexFormat])
210 return false;
211
212 /* Source clipping shouldn't be necessary, since copytexsubimage (in
213 * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
214 * takes care of it.
215 *
216 * Destination clipping shouldn't be necessary since the restrictions on
217 * glCopyTexSubImage prevent the user from specifying a destination rectangle
218 * that falls outside the bounds of the destination texture.
219 * See error_check_subtexture_dimensions().
220 */
221
222 int srcY1 = srcY0 + height;
223 int srcX1 = srcX0 + width;
224 int dstX1 = dstX0 + width;
225 int dstY1 = dstY0 + height;
226
227 /* Account for the fact that in the system framebuffer, the origin is at
228 * the lower left.
229 */
230 bool mirror_y = false;
231 if (_mesa_is_winsys_fbo(ctx->ReadBuffer)) {
232 GLint tmp = src_rb->Height - srcY0;
233 srcY0 = src_rb->Height - srcY1;
234 srcY1 = tmp;
235 mirror_y = true;
236 }
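/* As an illustrative example (values assumed, not from the original code):
 * with src_rb->Height == 100, srcY0 == 10 and height == 20 (so srcY1 == 30),
 * the flip above produces srcY0 == 70 and srcY1 == 90 with mirror_y set,
 * i.e. the same 20 rows addressed from a top-left origin instead.
 */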
237
238 /* Account for face selection and texture view MinLayer */
239 int dst_slice = slice + dst_image->TexObject->MinLayer + dst_image->Face;
240 int dst_level = dst_image->Level + dst_image->TexObject->MinLevel;
241
242 brw_blorp_blit_miptrees(brw,
243 src_mt, src_irb->mt_level, src_irb->mt_layer,
244 src_rb->Format, blorp_get_texture_swizzle(src_irb),
245 dst_mt, dst_level, dst_slice,
246 dst_image->TexFormat,
247 srcX0, srcY0, srcX1, srcY1,
248 dstX0, dstY0, dstX1, dstY1,
249 GL_NEAREST, false, mirror_y,
250 false, false);
251
252 /* If we're copying to a packed depth stencil texture and the source
253 * framebuffer has separate stencil, we need to also copy the stencil data
254 * over.
255 */
256 src_rb = ctx->ReadBuffer->Attachment[BUFFER_STENCIL].Renderbuffer;
257 if (_mesa_get_format_bits(dst_image->TexFormat, GL_STENCIL_BITS) > 0 &&
258 src_rb != NULL) {
259 src_irb = intel_renderbuffer(src_rb);
260 src_mt = src_irb->mt;
261
262 if (src_mt->stencil_mt)
263 src_mt = src_mt->stencil_mt;
264 if (dst_mt->stencil_mt)
265 dst_mt = dst_mt->stencil_mt;
266
267 if (src_mt != dst_mt) {
268 brw_blorp_blit_miptrees(brw,
269 src_mt, src_irb->mt_level, src_irb->mt_layer,
270 src_mt->format,
271 blorp_get_texture_swizzle(src_irb),
272 dst_mt, dst_level, dst_slice,
273 dst_mt->format,
274 srcX0, srcY0, srcX1, srcY1,
275 dstX0, dstY0, dstX1, dstY1,
276 GL_NEAREST, false, mirror_y,
277 false, false);
278 }
279 }
280
281 return true;
282 }
283
284
285 GLbitfield
286 brw_blorp_framebuffer(struct brw_context *brw,
287 struct gl_framebuffer *readFb,
288 struct gl_framebuffer *drawFb,
289 GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
290 GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
291 GLbitfield mask, GLenum filter)
292 {
293 /* BLORP is not supported before Gen6. */
294 if (brw->gen < 6)
295 return mask;
296
297 /* There is support for only up to eight samples. */
298 if (readFb->Visual.samples > 8 || drawFb->Visual.samples > 8)
299 return mask;
300
301 static GLbitfield buffer_bits[] = {
302 GL_COLOR_BUFFER_BIT,
303 GL_DEPTH_BUFFER_BIT,
304 GL_STENCIL_BUFFER_BIT,
305 };
306
307 for (unsigned int i = 0; i < ARRAY_SIZE(buffer_bits); ++i) {
308 if ((mask & buffer_bits[i]) &&
309 try_blorp_blit(brw, readFb, drawFb,
310 srcX0, srcY0, srcX1, srcY1,
311 dstX0, dstY0, dstX1, dstY1,
312 filter, buffer_bits[i])) {
313 mask &= ~buffer_bits[i];
314 }
315 }
316
317 return mask;
318 }
319
320
321 /**
322 * Enum to specify the order of arguments in a sampler message
323 */
324 enum sampler_message_arg
325 {
326 SAMPLER_MESSAGE_ARG_U_FLOAT,
327 SAMPLER_MESSAGE_ARG_V_FLOAT,
328 SAMPLER_MESSAGE_ARG_U_INT,
329 SAMPLER_MESSAGE_ARG_V_INT,
330 SAMPLER_MESSAGE_ARG_R_INT,
331 SAMPLER_MESSAGE_ARG_SI_INT,
332 SAMPLER_MESSAGE_ARG_MCS_INT,
333 SAMPLER_MESSAGE_ARG_ZERO_INT,
334 };
335
336 struct brw_blorp_blit_vars {
337 /* Uniforms values from brw_blorp_wm_push_constants */
338 nir_variable *u_dst_x0;
339 nir_variable *u_dst_x1;
340 nir_variable *u_dst_y0;
341 nir_variable *u_dst_y1;
342 nir_variable *u_rect_grid_x1;
343 nir_variable *u_rect_grid_y1;
344 struct {
345 nir_variable *multiplier;
346 nir_variable *offset;
347 } u_x_transform, u_y_transform;
348 nir_variable *u_src_z;
349
350 /* gl_FragCoord */
351 nir_variable *frag_coord;
352
353 /* gl_FragColor */
354 nir_variable *color_out;
355 };
356
357 static void
358 brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v,
359 const struct brw_blorp_blit_prog_key *key)
360 {
361 #define LOAD_UNIFORM(name, type)\
362 v->u_##name = nir_variable_create(b->shader, nir_var_uniform, type, #name); \
363 v->u_##name->data.location = \
364 offsetof(struct brw_blorp_wm_push_constants, name);
365
366 LOAD_UNIFORM(dst_x0, glsl_uint_type())
367 LOAD_UNIFORM(dst_x1, glsl_uint_type())
368 LOAD_UNIFORM(dst_y0, glsl_uint_type())
369 LOAD_UNIFORM(dst_y1, glsl_uint_type())
370 LOAD_UNIFORM(rect_grid_x1, glsl_float_type())
371 LOAD_UNIFORM(rect_grid_y1, glsl_float_type())
372 LOAD_UNIFORM(x_transform.multiplier, glsl_float_type())
373 LOAD_UNIFORM(x_transform.offset, glsl_float_type())
374 LOAD_UNIFORM(y_transform.multiplier, glsl_float_type())
375 LOAD_UNIFORM(y_transform.offset, glsl_float_type())
376 LOAD_UNIFORM(src_z, glsl_uint_type())
377
378 #undef LOAD_UNIFORM
379
380 v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in,
381 glsl_vec4_type(), "gl_FragCoord");
382 v->frag_coord->data.location = VARYING_SLOT_POS;
383 v->frag_coord->data.origin_upper_left = true;
384
385 v->color_out = nir_variable_create(b->shader, nir_var_shader_out,
386 glsl_vec4_type(), "gl_FragColor");
387 v->color_out->data.location = FRAG_RESULT_COLOR;
388 }
389
390 nir_ssa_def *
391 blorp_blit_get_frag_coords(nir_builder *b,
392 const struct brw_blorp_blit_prog_key *key,
393 struct brw_blorp_blit_vars *v)
394 {
395 nir_ssa_def *coord = nir_f2i(b, nir_load_var(b, v->frag_coord));
396
397 if (key->persample_msaa_dispatch) {
398 return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),
399 nir_load_system_value(b, nir_intrinsic_load_sample_id, 0));
400 } else {
401 return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1));
402 }
403 }
404
405 /**
406 * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
407 * coordinates.
408 */
409 nir_ssa_def *
410 blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos,
411 struct brw_blorp_blit_vars *v)
412 {
413 nir_ssa_def *offset = nir_vec2(b, nir_load_var(b, v->u_x_transform.offset),
414 nir_load_var(b, v->u_y_transform.offset));
415 nir_ssa_def *mul = nir_vec2(b, nir_load_var(b, v->u_x_transform.multiplier),
416 nir_load_var(b, v->u_y_transform.multiplier));
417
418 return nir_ffma(b, src_pos, mul, offset);
419 }
420
421 static inline void
422 blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos,
423 struct brw_blorp_blit_vars *v)
424 {
425 nir_ssa_def *c0, *c1, *c2, *c3;
426 c0 = nir_ult(b, nir_channel(b, pos, 0), nir_load_var(b, v->u_dst_x0));
427 c1 = nir_uge(b, nir_channel(b, pos, 0), nir_load_var(b, v->u_dst_x1));
428 c2 = nir_ult(b, nir_channel(b, pos, 1), nir_load_var(b, v->u_dst_y0));
429 c3 = nir_uge(b, nir_channel(b, pos, 1), nir_load_var(b, v->u_dst_y1));
430 nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3));
431
432 nir_intrinsic_instr *discard =
433 nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
434 discard->src[0] = nir_src_for_ssa(oob);
435 nir_builder_instr_insert(b, &discard->instr);
436 }
437
438 static nir_tex_instr *
439 blorp_create_nir_tex_instr(nir_shader *shader, nir_texop op,
440 nir_ssa_def *pos, unsigned num_srcs,
441 enum brw_reg_type dst_type)
442 {
443 nir_tex_instr *tex = nir_tex_instr_create(shader, num_srcs);
444
445 tex->op = op;
446
447 switch (dst_type) {
448 case BRW_REGISTER_TYPE_F:
449 tex->dest_type = nir_type_float;
450 break;
451 case BRW_REGISTER_TYPE_D:
452 tex->dest_type = nir_type_int;
453 break;
454 case BRW_REGISTER_TYPE_UD:
455 tex->dest_type = nir_type_uint;
456 break;
457 default:
458 unreachable("Invalid texture return type");
459 }
460
461 tex->is_array = false;
462 tex->is_shadow = false;
463
464 /* Blorp only has one texture and it's bound at unit 0 */
465 tex->texture = NULL;
466 tex->sampler = NULL;
467 tex->texture_index = 0;
468 tex->sampler_index = 0;
469
470 nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
471
472 return tex;
473 }
474
475 static nir_ssa_def *
476 blorp_nir_tex(nir_builder *b, nir_ssa_def *pos, enum brw_reg_type dst_type)
477 {
478 nir_tex_instr *tex =
479 blorp_create_nir_tex_instr(b->shader, nir_texop_tex, pos, 2, dst_type);
480
481 assert(pos->num_components == 2);
482 tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
483 tex->coord_components = 2;
484 tex->src[0].src_type = nir_tex_src_coord;
485 tex->src[0].src = nir_src_for_ssa(pos);
486 tex->src[1].src_type = nir_tex_src_lod;
487 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
488
489 nir_builder_instr_insert(b, &tex->instr);
490
491 return &tex->dest.ssa;
492 }
493
494 static nir_ssa_def *
495 blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v,
496 nir_ssa_def *pos, enum brw_reg_type dst_type)
497 {
498 nir_tex_instr *tex =
499 blorp_create_nir_tex_instr(b->shader, nir_texop_txf, pos, 2, dst_type);
500
501 /* In order to properly handle 3-D textures, we pull the Z component from
502 * a uniform. TODO: This is a bit magic; we should probably make this
503 * more explicit in the future.
504 */
505 assert(pos->num_components == 2);
506 pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),
507 nir_load_var(b, v->u_src_z));
508
509 tex->sampler_dim = GLSL_SAMPLER_DIM_3D;
510 tex->coord_components = 3;
511 tex->src[0].src_type = nir_tex_src_coord;
512 tex->src[0].src = nir_src_for_ssa(pos);
513 tex->src[1].src_type = nir_tex_src_lod;
514 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
515
516 nir_builder_instr_insert(b, &tex->instr);
517
518 return &tex->dest.ssa;
519 }
520
521 static nir_ssa_def *
522 blorp_nir_txf_ms(nir_builder *b, nir_ssa_def *pos, nir_ssa_def *mcs,
523 enum brw_reg_type dst_type)
524 {
525 nir_tex_instr *tex =
526 blorp_create_nir_tex_instr(b->shader, nir_texop_txf_ms, pos,
527 mcs != NULL ? 3 : 2, dst_type);
528
529 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
530 tex->coord_components = 2;
531 tex->src[0].src_type = nir_tex_src_coord;
532 tex->src[0].src = nir_src_for_ssa(pos);
533
534 tex->src[1].src_type = nir_tex_src_ms_index;
535 if (pos->num_components == 2) {
536 tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
537 } else {
538 assert(pos->num_components == 3);
539 tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2));
540 }
541
542 if (mcs) {
543 tex->src[2].src_type = nir_tex_src_ms_mcs;
544 tex->src[2].src = nir_src_for_ssa(mcs);
545 }
546
547 nir_builder_instr_insert(b, &tex->instr);
548
549 return &tex->dest.ssa;
550 }
551
552 static nir_ssa_def *
553 blorp_nir_txf_ms_mcs(nir_builder *b, nir_ssa_def *pos)
554 {
555 nir_tex_instr *tex =
556 blorp_create_nir_tex_instr(b->shader, nir_texop_txf_ms_mcs,
557 pos, 1, BRW_REGISTER_TYPE_D);
558
559 tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
560 tex->coord_components = 2;
561 tex->src[0].src_type = nir_tex_src_coord;
562 tex->src[0].src = nir_src_for_ssa(pos);
563
564 nir_builder_instr_insert(b, &tex->instr);
565
566 return &tex->dest.ssa;
567 }
568
569 static nir_ssa_def *
570 nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src,
571 uint32_t src_mask, int src_left_shift)
572 {
573 nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask));
574
575 nir_ssa_def *shifted;
576 if (src_left_shift > 0) {
577 shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift));
578 } else if (src_left_shift < 0) {
579 shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift));
580 } else {
581 assert(src_left_shift == 0);
582 shifted = masked;
583 }
584
585 return nir_ior(b, dst, shifted);
586 }
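/* Reading aid for the formulas below (an added note, not from the original
 * source): nir_mask_shift_or(b, acc, x, 0x8, -2) ORs (x & 0b1000) >> 2 into
 * acc, and a positive src_left_shift shifts left instead.
 */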
587
588 /**
589 * Emit code to compensate for the difference between Y and W tiling.
590 *
591 * This code modifies the X and Y coordinates according to the formula:
592 *
593 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
594 *
595 * (See brw_blorp_build_nir_shader).
596 */
597 static inline nir_ssa_def *
598 blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos)
599 {
600 assert(pos->num_components == 2);
601 nir_ssa_def *x_Y = nir_channel(b, pos, 0);
602 nir_ssa_def *y_Y = nir_channel(b, pos, 1);
603
604 /* Given X and Y coordinates that describe an address using Y tiling,
605 * translate to the X and Y coordinates that describe the same address
606 * using W tiling.
607 *
608 * If we break down the low order bits of X and Y, using a
609 * single letter to represent each low-order bit:
610 *
611 * X = A << 7 | 0bBCDEFGH
612 * Y = J << 5 | 0bKLMNP (1)
613 *
614 * Then we can apply the Y tiling formula to see the memory offset being
615 * addressed:
616 *
617 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)
618 *
619 * If we apply the W detiling formula to this memory location, we find that
620 * the corresponding X' and Y' coordinates are:
621 *
622 * X' = A << 6 | 0bBCDPFH (3)
623 * Y' = J << 6 | 0bKLMNEG
624 *
625 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
626 * we need to make the following computation:
627 *
628 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)
629 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
630 */
631 nir_ssa_def *x_W = nir_imm_int(b, 0);
632 x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1);
633 x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2);
634 x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0);
635
636 nir_ssa_def *y_W = nir_imm_int(b, 0);
637 y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1);
638 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2);
639 y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1);
640
641 return nir_vec2(b, x_W, y_W);
642 }
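/* A quick sanity check of formula (4), using small coordinates chosen for
 * illustration: for (X, Y) = (0b0110, 0b011) in Y-tile terms, equation (2)
 * gives the 12 offset bits 0b000000110110, and W-detiling that offset per
 * (3) yields (X', Y') = (0b000110, 0b000101). The nir_mask_shift_or
 * sequence above computes exactly that: X' = 6, Y' = 5.
 */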
643
644 /**
645 * Emit code to compensate for the difference between Y and W tiling.
646 *
647 * This code modifies the X and Y coordinates according to the formula:
648 *
649 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
650 *
651 * (See brw_blorp_build_nir_shader).
652 */
653 static inline nir_ssa_def *
654 blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos)
655 {
656 assert(pos->num_components == 2);
657 nir_ssa_def *x_W = nir_channel(b, pos, 0);
658 nir_ssa_def *y_W = nir_channel(b, pos, 1);
659
660 /* Applying the same logic as above, but in reverse, we obtain the
661 * formulas:
662 *
663 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
664 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
665 */
666 nir_ssa_def *x_Y = nir_imm_int(b, 0);
667 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1);
668 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2);
669 x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1);
670 x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0);
671
672 nir_ssa_def *y_Y = nir_imm_int(b, 0);
673 y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1);
674 y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2);
675
676 return nir_vec2(b, x_Y, y_Y);
677 }
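/* Continuing the illustrative example from blorp_nir_retile_y_to_w: feeding
 * (X, Y) = (0b000110, 0b000101) in W-tile terms through these formulas
 * returns (X', Y') = (0b0110, 0b011), so the two retile helpers invert each
 * other on the low-order bits, as expected.
 */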
678
679 /**
680 * Emit code to compensate for the difference between MSAA and non-MSAA
681 * surfaces.
682 *
683 * This code modifies the X and Y coordinates according to the formula:
684 *
685 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
686 *
687 * (See brw_blorp_blit_program).
688 */
689 static inline nir_ssa_def *
690 blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos,
691 unsigned num_samples, enum intel_msaa_layout layout)
692 {
693 assert(pos->num_components == 2 || pos->num_components == 3);
694
695 switch (layout) {
696 case INTEL_MSAA_LAYOUT_NONE:
697 assert(pos->num_components == 2);
698 return pos;
699 case INTEL_MSAA_LAYOUT_CMS:
700 /* We can't compensate for compressed layout since at this point in the
701 * program we haven't read from the MCS buffer.
702 */
703 unreachable("Bad layout in encode_msaa");
704 case INTEL_MSAA_LAYOUT_UMS:
705 /* No translation needed */
706 return pos;
707 case INTEL_MSAA_LAYOUT_IMS: {
708 nir_ssa_def *x_in = nir_channel(b, pos, 0);
709 nir_ssa_def *y_in = nir_channel(b, pos, 1);
710 nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) :
711 nir_channel(b, pos, 2);
712
713 nir_ssa_def *x_out = nir_imm_int(b, 0);
714 nir_ssa_def *y_out = nir_imm_int(b, 0);
715 switch (num_samples) {
716 case 2:
717 case 4:
718 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
719 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
720 * Y' = Y
721 *
722 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
723 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
724 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
725 */
726 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1);
727 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
728 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
729 if (num_samples == 2) {
730 y_out = y_in;
731 } else {
732 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
733 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
734 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
735 }
736 break;
737
738 case 8:
739 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
740 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
741 * | (X & 0b1)
742 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
743 */
744 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
745 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
746 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
747 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
748 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
749 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
750 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
751 break;
752
753 case 16:
754 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
755 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
756 * | (X & 0b1)
757 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 | (S & 0b10)
758 * | (Y & 0b1)
759 */
760 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
761 x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
762 x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
763 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
764 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2);
765 y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1);
766 y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
767 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
768 break;
769
770 default:
771 unreachable("Invalid number of samples for IMS layout");
772 }
773
774 return nir_vec2(b, x_out, y_out);
775 }
776
777 default:
778 unreachable("Invalid MSAA layout");
779 }
780 }
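/* Worked example for the 4x IMS case (input values chosen for
 * illustration): encode_msaa(4, IMS, X=0b101, Y=0b10, S=0b11) gives
 * X' = (0b100 << 1) | (0b1 << 1) | 0b1 = 0b1011 and
 * Y' = (0b10 << 1) | 0b10 | 0b0 = 0b110, i.e. (5, 2, 3) -> (11, 6, 0).
 */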
781
782 /**
783 * Emit code to compensate for the difference between MSAA and non-MSAA
784 * surfaces.
785 *
786 * This code modifies the X and Y coordinates according to the formula:
787 *
788 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
789 *
790 * (See brw_blorp_blit_program).
791 */
792 static inline nir_ssa_def *
793 blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos,
794 unsigned num_samples, enum intel_msaa_layout layout)
795 {
796 assert(pos->num_components == 2 || pos->num_components == 3);
797
798 switch (layout) {
799 case INTEL_MSAA_LAYOUT_NONE:
800 /* No translation necessary, and S should already be zero. */
801 assert(pos->num_components == 2);
802 return pos;
803 case INTEL_MSAA_LAYOUT_CMS:
804 /* We can't compensate for compressed layout since at this point in the
805 * program we don't have access to the MCS buffer.
806 */
807 unreachable("Bad layout in encode_msaa");
808 case INTEL_MSAA_LAYOUT_UMS:
809 /* No translation necessary. */
810 return pos;
811 case INTEL_MSAA_LAYOUT_IMS: {
812 assert(pos->num_components == 2);
813
814 nir_ssa_def *x_in = nir_channel(b, pos, 0);
815 nir_ssa_def *y_in = nir_channel(b, pos, 1);
816
817 nir_ssa_def *x_out = nir_imm_int(b, 0);
818 nir_ssa_def *y_out = nir_imm_int(b, 0);
819 nir_ssa_def *s_out = nir_imm_int(b, 0);
820 switch (num_samples) {
821 case 2:
822 case 4:
823 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
824 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
825 * S = (X & 0b10) >> 1
826 *
827 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
828 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
829 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
830 * S = (Y & 0b10) | (X & 0b10) >> 1
831 */
832 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1);
833 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
834 if (num_samples == 2) {
835 y_out = y_in;
836 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
837 } else {
838 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
839 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
840 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
841 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
842 }
843 break;
844
845 case 8:
846 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
847 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
848 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
849 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
850 */
851 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
852 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
853 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
854 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
855 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
856 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
857 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
858 break;
859
860 case 16:
861 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
862 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
863 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
864 * S = (Y & 0b100) << 1 | (X & 0b100) |
865 * (Y & 0b10) | (X & 0b10) >> 1
866 */
867 x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
868 x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
869 y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2);
870 y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
871 s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1);
872 s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
873 s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
874 s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
875 break;
876
877 default:
878 unreachable("Invalid number of samples for IMS layout");
879 }
880
881 return nir_vec3(b, x_out, y_out, s_out);
882 }
883
884 default:
885 unreachable("Invalid MSAA layout");
886 }
887 }
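/* And the inverse of the illustrative example above:
 * decode_msaa(4, IMS, X=0b1011, Y=0b110, 0) recovers
 * X' = (0b1000 >> 1) | 0b1 = 0b101, Y' = (0b100 >> 1) | 0b0 = 0b10 and
 * S = 0b10 | (0b10 >> 1) = 0b11, i.e. (11, 6, 0) -> (5, 2, 3), matching the
 * coordinates that were encoded.
 */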
888
889 /**
890 * Count the number of trailing 1 bits in the given value. For example:
891 *
892 * count_trailing_one_bits(0) == 0
893 * count_trailing_one_bits(7) == 3
894 * count_trailing_one_bits(11) == 2
895 */
896 static inline int count_trailing_one_bits(unsigned value)
897 {
898 #ifdef HAVE___BUILTIN_CTZ
899 return __builtin_ctz(~value);
900 #else
901 return _mesa_bitcount(value & ~(value + 1));
902 #endif
903 }
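/* Both variants agree; as a worked example (added for illustration), for
 * value == 11 (0b1011), ~value ends in two zero bits so
 * __builtin_ctz(~value) == 2, and value & ~(value + 1) == 0b1011 & ~0b1100
 * == 0b0011, whose bit count is also 2.
 */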
904
905 static nir_ssa_def *
906 blorp_nir_manual_blend_average(nir_builder *b, nir_ssa_def *pos,
907 unsigned tex_samples,
908 enum intel_msaa_layout tex_layout,
909 enum brw_reg_type dst_type)
910 {
911 /* If non-null, this is the outer-most if statement */
912 nir_if *outer_if = NULL;
913
914 nir_variable *color =
915 nir_local_variable_create(b->impl, glsl_vec4_type(), "color");
916
917 nir_ssa_def *mcs = NULL;
918 if (tex_layout == INTEL_MSAA_LAYOUT_CMS)
919 mcs = blorp_nir_txf_ms_mcs(b, pos);
920
921 /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
922 *
923 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
924 *
925 * This ensures that when all samples have the same value, no numerical
926 * precision is lost, since each addition operation always adds two equal
927 * values, and summing two equal floating point values does not lose
928 * precision.
929 *
930 * We perform this computation by treating the texture_data array as a
931 * stack and performing the following operations:
932 *
933 * - push sample 0 onto stack
934 * - push sample 1 onto stack
935 * - add top two stack entries
936 * - push sample 2 onto stack
937 * - push sample 3 onto stack
938 * - add top two stack entries
939 * - add top two stack entries
940 * - divide top stack entry by 4
941 *
942 * Note that after pushing sample i onto the stack, the number of add
943 * operations we do is equal to the number of trailing 1 bits in i. This
944 * works provided the total number of samples is a power of two, which it
945 * always is for i965.
946 *
947 * For integer formats, we replace the add operations with average
948 * operations and skip the final division.
949 */
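/* Tracing the 4x case as an example (added for illustration): at the top of
 * iterations i = 0..3 the stack depth is 0, 1, 1, 2 (i.e. _mesa_bitcount(i)),
 * and pushing sample 3 is followed by two adds, leaving a single summed
 * value on the stack before the final divide.
 */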
950 nir_ssa_def *texture_data[5];
951 unsigned stack_depth = 0;
952 for (unsigned i = 0; i < tex_samples; ++i) {
953 assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */
954
955 /* Push sample i onto the stack */
956 assert(stack_depth < ARRAY_SIZE(texture_data));
957
958 nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0),
959 nir_channel(b, pos, 1),
960 nir_imm_int(b, i));
961 texture_data[stack_depth++] = blorp_nir_txf_ms(b, ms_pos, mcs, dst_type);
962
963 if (i == 0 && tex_layout == INTEL_MSAA_LAYOUT_CMS) {
964 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
965 * suggests an optimization:
966 *
967 * "A simple optimization with probable large return in
968 * performance is to compare the MCS value to zero (indicating
969 * all samples are on sample slice 0), and sample only from
970 * sample slice 0 using ld2dss if MCS is zero."
971 *
972 * Note that in the case where the MCS value is zero, sampling from
973 * sample slice 0 using ld2dss and sampling from sample 0 using
974 * ld2dms are equivalent (since all samples are on sample slice 0).
975 * Since we have already sampled from sample 0, all we need to do is
976 * skip the remaining fetches and averaging if MCS is zero.
977 */
978 nir_ssa_def *mcs_zero =
979 nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0));
980 if (tex_samples == 16) {
981 mcs_zero = nir_iand(b, mcs_zero,
982 nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0)));
983 }
984
985 nir_if *if_stmt = nir_if_create(b->shader);
986 if_stmt->condition = nir_src_for_ssa(mcs_zero);
987 nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
988
989 b->cursor = nir_after_cf_list(&if_stmt->then_list);
990 nir_store_var(b, color, texture_data[0], 0xf);
991
992 b->cursor = nir_after_cf_list(&if_stmt->else_list);
993 outer_if = if_stmt;
994 }
995
996 for (int j = 0; j < count_trailing_one_bits(i); j++) {
997 assert(stack_depth >= 2);
998 --stack_depth;
999
1000 assert(dst_type == BRW_REGISTER_TYPE_F);
1001 texture_data[stack_depth - 1] =
1002 nir_fadd(b, texture_data[stack_depth - 1],
1003 texture_data[stack_depth]);
1004 }
1005 }
1006
1007 /* We should have just 1 sample on the stack now. */
1008 assert(stack_depth == 1);
1009
1010 texture_data[0] = nir_fmul(b, texture_data[0],
1011 nir_imm_float(b, 1.0 / tex_samples));
1012
1013 nir_store_var(b, color, texture_data[0], 0xf);
1014
1015 if (outer_if)
1016 b->cursor = nir_after_cf_node(&outer_if->cf_node);
1017
1018 return nir_load_var(b, color);
1019 }
1020
1021 static inline nir_ssa_def *
1022 nir_imm_vec2(nir_builder *build, float x, float y)
1023 {
1024 nir_const_value v;
1025
1026 memset(&v, 0, sizeof(v));
1027 v.f32[0] = x;
1028 v.f32[1] = y;
1029
1030 return nir_build_imm(build, 4, 32, v);
1031 }
1032
1033 static nir_ssa_def *
1034 blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
1035 unsigned tex_samples,
1036 const brw_blorp_blit_prog_key *key,
1037 struct brw_blorp_blit_vars *v)
1038 {
1039 nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3);
1040
1041 nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale);
1042
1043 /* Translate coordinates to lay out the samples in a rectangular grid
1044 * roughly corresponding to sample locations.
1045 */
1046 pos_xy = nir_fmul(b, pos_xy, scale);
1047 /* Adjust coordinates so that integers represent pixel centers rather
1048 * than pixel edges.
1049 */
1050 pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5));
1051 /* Clamp the X, Y texture coordinates to properly handle the sampling of
1052 * texels on texture edges.
1053 */
1054 pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)),
1055 nir_vec2(b, nir_load_var(b, v->u_rect_grid_x1),
1056 nir_load_var(b, v->u_rect_grid_y1)));
1057
1058 /* Store the fractional parts to be used as bilinear interpolation
1059 * coefficients.
1060 */
1061 nir_ssa_def *frac_xy = nir_ffract(b, pos_xy);
1062 /* Round the float coordinates down to nearest integer */
1063 pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale);
1064
1065 nir_ssa_def *tex_data[4];
1066 for (unsigned i = 0; i < 4; ++i) {
1067 float sample_off_x = (float)(i & 0x1) / key->x_scale;
1068 float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale;
1069 nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y);
1070
1071 nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off);
1072 nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords);
1073
1074 /* The MCS value we fetch has to match up with the pixel that we're
1075 * sampling from. Since we sample from different pixels in each
1076 * iteration of this "for" loop, the call to mcs_fetch() should be
1077 * here inside the loop after computing the pixel coordinates.
1078 */
1079 nir_ssa_def *mcs = NULL;
1080 if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
1081 mcs = blorp_nir_txf_ms_mcs(b, sample_coords_int);
1082
1083 /* Compute sample index and map the sample index to a sample number.
1084 * Sample index layout shows the numbering of slots in a rectangular
1085 * grid of samples within a pixel. Sample number layout shows the
1086 * rectangular grid of samples roughly corresponding to the real sample
1087 * locations within a pixel.
1088 * In case of 4x MSAA, the layout of sample indices matches the layout of
1089 * sample numbers:
1090 * ---------
1091 * | 0 | 1 |
1092 * ---------
1093 * | 2 | 3 |
1094 * ---------
1095 *
1096 * In case of 8x MSAA the two layouts don't match.
1097 * sample index layout : --------- sample number layout : ---------
1098 * | 0 | 1 | | 5 | 2 |
1099 * --------- ---------
1100 * | 2 | 3 | | 4 | 6 |
1101 * --------- ---------
1102 * | 4 | 5 | | 0 | 3 |
1103 * --------- ---------
1104 * | 6 | 7 | | 7 | 1 |
1105 * --------- ---------
1106 *
1107 * Fortunately, this can be done fairly easily as:
1108 * S' = (0x17306425 >> (S * 4)) & 0xf
1109 *
1110 * In the case of 16x MSAA the two layouts don't match.
1111 * Sample index layout: Sample number layout:
1112 * --------------------- ---------------------
1113 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 13 |
1114 * --------------------- ---------------------
1115 * | 4 | 5 | 6 | 7 | | 4 | 1 | 7 | 3 |
1116 * --------------------- ---------------------
1117 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 |
1118 * --------------------- ---------------------
1119 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 |
1120 * --------------------- ---------------------
1121 *
1122 * This is equivalent to
1123 * S' = (0xfa9d4173c206b85e >> (S * 4)) & 0xf
1124 */
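/* For example (8x case, illustrative): sample index S == 3 selects nibble 3
 * of the constant, (0x17306425 >> 12) & 0xf == 6, which matches the tables
 * above: the position labelled 3 in the index layout is labelled 6 in the
 * number layout.
 */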
1125 nir_ssa_def *frac = nir_ffract(b, sample_coords);
1126 nir_ssa_def *sample =
1127 nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale,
1128 key->x_scale * key->y_scale));
1129 sample = nir_f2i(b, sample);
1130
1131 if (tex_samples == 8) {
1132 sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x17306425),
1133 nir_ishl(b, sample, nir_imm_int(b, 2))),
1134 nir_imm_int(b, 0xf));
1135 } else if (tex_samples == 16) {
1136 nir_ssa_def *sample_low =
1137 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xc206b85e),
1138 nir_ishl(b, sample, nir_imm_int(b, 2))),
1139 nir_imm_int(b, 0xf));
1140 nir_ssa_def *sample_high =
1141 nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xfa9d4173),
1142 nir_ishl(b, nir_iadd(b, sample,
1143 nir_imm_int(b, -8)),
1144 nir_imm_int(b, 2))),
1145 nir_imm_int(b, 0xf));
1146
1147 sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)),
1148 sample_low, sample_high);
1149 }
1150 nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0),
1151 nir_channel(b, sample_coords_int, 1),
1152 sample);
1153 tex_data[i] = blorp_nir_txf_ms(b, pos_ms, mcs, key->texture_data_type);
1154 }
1155
1156 nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0);
1157 nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1);
1158 return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x),
1159 nir_flrp(b, tex_data[2], tex_data[3], frac_x),
1160 frac_y);
1161 }
1162
1163 /**
1164 * Generator for WM programs used in BLORP blits.
1165 *
1166 * The bulk of the work done by the WM program is to wrap and unwrap the
1167 * coordinate transformations used by the hardware to store surfaces in
1168 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
1169 * sample index for a multisampled surface) to a memory offset by the
1170 * following formulas:
1171 *
1172 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
1173 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
1174 *
1175 * For a single-sampled surface, or for a multisampled surface using
1176 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
1177 * function:
1178 *
1179 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1180 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1181 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1182 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1183 *
1184 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1185 * embeds the sample number into bit 1 of the X and Y coordinates:
1186 *
1187 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1188 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1189 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
1190 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1191 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
1192 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1193 * S = (Y & 0b10) | (X & 0b10) >> 1
1194 *
1195 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1196 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
1197 * the Y coordinate:
1198 *
1199 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1200 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
1201 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1202 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1203 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
1204 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1205 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1206 *
1207 * For X tiling, tile() combines together the low-order bits of the X and Y
1208 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
1209 * bytes wide and 8 rows high:
1210 *
1211 * tile(x_tiled, X, Y, S) = A
1212 * where A = tile_num << 12 | offset
1213 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
1214 * offset = (Y' & 0b111) << 9
1215 * | (X & 0b111111111)
1216 * X' = X * cpp
1217 * Y' = Y + S * qpitch
1218 * detile(x_tiled, A) = (X, Y, S)
1219 * where X = X' / cpp
1220 * Y = Y' % qpitch
1221 * S = Y' / qpitch
1222 * Y' = (tile_num / tile_pitch) << 3
1223 * | (A & 0b111000000000) >> 9
1224 * X' = (tile_num % tile_pitch) << 9
1225 * | (A & 0b111111111)
1226 *
1227 * (In all tiling formulas, cpp is the number of bytes occupied by a single
1228 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
1229 * to fill the width of the surface, and qpitch is the spacing (in rows)
1230 * between array slices).
1231 *
1232 * For Y tiling, tile() combines together the low-order bits of the X and Y
1233 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
1234 * bytes wide and 32 rows high:
1235 *
1236 * tile(y_tiled, X, Y, S) = A
1237 * where A = tile_num << 12 | offset
1238 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
1239 * offset = (X' & 0b1110000) << 5
1240 * | (Y' & 0b11111) << 4
1241 * | (X' & 0b1111)
1242 * X' = X * cpp
1243 * Y' = Y + S * qpitch
1244 * detile(y_tiled, A) = (X, Y, S)
1245 * where X = X' / cpp
1246 * Y = Y' % qpitch
1247 * S = Y' / qpitch
1248 * Y' = (tile_num / tile_pitch) << 5
1249 * | (A & 0b111110000) >> 4
1250 * X' = (tile_num % tile_pitch) << 7
1251 * | (A & 0b111000000000) >> 5
1252 * | (A & 0b1111)
1253 *
1254 * For W tiling, tile() combines together the low-order bits of the X and Y
1255 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
1256 * bytes wide and 64 rows high (note that W tiling is only used for stencil
1257 * buffers, which always have cpp = 1 and S=0):
1258 *
1259 * tile(w_tiled, X, Y, S) = A
1260 * where A = tile_num << 12 | offset
1261 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
1262 * offset = (X' & 0b111000) << 6
1263 * | (Y' & 0b111100) << 3
1264 * | (X' & 0b100) << 2
1265 * | (Y' & 0b10) << 2
1266 * | (X' & 0b10) << 1
1267 * | (Y' & 0b1) << 1
1268 * | (X' & 0b1)
1269 * X' = X * cpp = X
1270 * Y' = Y + S * qpitch
1271 * detile(w_tiled, A) = (X, Y, S)
1272 * where X = X' / cpp = X'
1273 * Y = Y' % qpitch = Y'
1274 * S = Y / qpitch = 0
1275 * Y' = (tile_num / tile_pitch) << 6
1276 * | (A & 0b111100000) >> 3
1277 * | (A & 0b1000) >> 2
1278 * | (A & 0b10) >> 1
1279 * X' = (tile_num % tile_pitch) << 6
1280 * | (A & 0b111000000000) >> 6
1281 * | (A & 0b10000) >> 2
1282 * | (A & 0b100) >> 1
1283 * | (A & 0b1)
1284 *
1285 * Finally, for a non-tiled surface, tile() simply combines together the X and
1286 * Y coordinates in the natural way:
1287 *
1288 * tile(untiled, X, Y, S) = A
1289 * where A = Y * pitch + X'
1290 * X' = X * cpp
1291 * Y' = Y + S * qpitch
1292 * detile(untiled, A) = (X, Y, S)
1293 * where X = X' / cpp
1294 * Y = Y' % qpitch
1295 * S = Y' / qpitch
1296 * X' = A % pitch
1297 * Y' = A / pitch
1298 *
1299 * (In these formulas, pitch is the number of bytes occupied by a single row
1300 * of samples).
1301 */
1302 static nir_shader *
1303 brw_blorp_build_nir_shader(struct brw_context *brw,
1304 const brw_blorp_blit_prog_key *key)
1305 {
1306 nir_ssa_def *src_pos, *dst_pos, *color;
1307
1308 /* Sanity checks */
1309 if (key->dst_tiled_w && key->rt_samples > 0) {
1310 /* If the destination image is W tiled and multisampled, then the thread
1311 * must be dispatched once per sample, not once per pixel. This is
1312 * necessary because after conversion between W and Y tiling, there's no
1313 * guarantee that all samples corresponding to a single pixel will still
1314 * be together.
1315 */
1316 assert(key->persample_msaa_dispatch);
1317 }
1318
1319 if (key->blend) {
1320 /* We are blending, which means we won't have an opportunity to
1321 * translate the tiling and sample count for the texture surface. So
1322 * the surface state for the texture must be configured with the correct
1323 * tiling and sample count.
1324 */
1325 assert(!key->src_tiled_w);
1326 assert(key->tex_samples == key->src_samples);
1327 assert(key->tex_layout == key->src_layout);
1328 assert(key->tex_samples > 0);
1329 }
1330
1331 if (key->persample_msaa_dispatch) {
1332 /* It only makes sense to do persample dispatch if the render target is
1333 * configured as multisampled.
1334 */
1335 assert(key->rt_samples > 0);
1336 }
1337
1338 /* Make sure layout is consistent with sample count */
1339 assert((key->tex_layout == INTEL_MSAA_LAYOUT_NONE) ==
1340 (key->tex_samples == 0));
1341 assert((key->rt_layout == INTEL_MSAA_LAYOUT_NONE) ==
1342 (key->rt_samples == 0));
1343 assert((key->src_layout == INTEL_MSAA_LAYOUT_NONE) ==
1344 (key->src_samples == 0));
1345 assert((key->dst_layout == INTEL_MSAA_LAYOUT_NONE) ==
1346 (key->dst_samples == 0));
1347
1348 nir_builder b;
1349 nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
1350
1351 struct brw_blorp_blit_vars v;
1352 brw_blorp_blit_vars_init(&b, &v, key);
1353
1354 dst_pos = blorp_blit_get_frag_coords(&b, key, &v);
1355
1356 /* Render target and texture hardware don't support W tiling until Gen8. */
1357 const bool rt_tiled_w = false;
1358 const bool tex_tiled_w = brw->gen >= 8 && key->src_tiled_w;
1359
1360 /* The address that data will be written to is determined by the
1361 * coordinates supplied to the WM thread and the tiling and sample count of
1362 * the render target, according to the formula:
1363 *
1364 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
1365 *
1366 * If the actual tiling and sample count of the destination surface are not
1367 * the same as the configuration of the render target, then these
1368 * coordinates are wrong and we have to adjust them to compensate for the
1369 * difference.
1370 */
1371 if (rt_tiled_w != key->dst_tiled_w ||
1372 key->rt_samples != key->dst_samples ||
1373 key->rt_layout != key->dst_layout) {
1374 dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,
1375 key->rt_layout);
1376 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1377 if (rt_tiled_w != key->dst_tiled_w)
1378 dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);
1379 /* Now (X, Y, S) = detile(dst_tiling, offset) */
1380 dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,
1381 key->dst_layout);
1382 }
1383
1384 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
1385 *
1386 * That is: X, Y and S now contain the true coordinates and sample index of
1387 * the data that the WM thread should output.
1388 *
1389 * If we need to kill pixels that are outside the destination rectangle,
1390 * now is the time to do it.
1391 */
1392 if (key->use_kill)
1393 blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);
1394
1395 src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v);
1396 if (dst_pos->num_components == 3) {
1397 /* The sample coordinate is an integer that we want left alone but
1398 * blorp_blit_apply_transform() blindly applies the transform to all
1399 * three coordinates. Grab the original sample index.
1400 */
1401 src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),
1402 nir_channel(&b, src_pos, 1),
1403 nir_channel(&b, dst_pos, 2));
1404 }
1405
1406 /* If the source image is not multisampled, then we want to fetch sample
1407 * number 0, because that's the only sample there is.
1408 */
1409 if (key->src_samples == 0)
1410 src_pos = nir_channels(&b, src_pos, 0x3);
1411
1412 /* X, Y, and S are now the coordinates of the pixel in the source image
1413 * that we want to texture from. Exception: if we are blending, then S is
1414 * irrelevant, because we are going to fetch all samples.
1415 */
1416 if (key->blend && !key->blit_scaled) {
1417 /* Resolves (effectively) use texelFetch, so we need integers and we
1418 * don't care about the sample index if we got one.
1419 */
1420 src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3));
1421
1422 if (brw->gen == 6) {
1423 /* Because gen6 only supports 4x interleaved MSAA, we can do all the
1424 * blending we need with a single linear-interpolated texture lookup
1425 * at the center of the sample. The texture coordinates need to be odd
1426 * integers so that they correspond to the center of a 2x2 block
1427 * representing the four samples that make up a pixel. So we need
1428 * to multiply our X and Y coordinates each by 2 and then add 1.
1429 */
1430 src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1));
1431 src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1));
1432 src_pos = nir_i2f(&b, src_pos);
1433 color = blorp_nir_tex(&b, src_pos, key->texture_data_type);
1434 } else {
1435 /* Gen7+ hardware doesn't automatically blend. */
1436 color = blorp_nir_manual_blend_average(&b, src_pos, key->src_samples,
1437 key->src_layout,
1438 key->texture_data_type);
1439 }
1440 } else if (key->blend && key->blit_scaled) {
1441 color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
1442 } else {
1443 if (key->bilinear_filter) {
1444 color = blorp_nir_tex(&b, src_pos, key->texture_data_type);
1445 } else {
1446 /* We're going to use texelFetch, so we need integers */
1447 if (src_pos->num_components == 2) {
1448 src_pos = nir_f2i(&b, src_pos);
1449 } else {
1450 assert(src_pos->num_components == 3);
1451 src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0),
1452 nir_channel(&b, nir_f2i(&b, src_pos), 1),
1453 nir_channel(&b, src_pos, 2));
1454 }
1455
1456 /* We aren't blending, which means we just want to fetch a single
1457 * sample from the source surface. The address that we want to fetch
1458 * from is related to the X, Y and S values according to the formula:
1459 *
1460 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
1461 *
1462 * If the actual tiling and sample count of the source surface are
1463 * not the same as the configuration of the texture, then we need to
1464 * adjust the coordinates to compensate for the difference.
1465 */
1466 if (tex_tiled_w != key->src_tiled_w ||
1467 key->tex_samples != key->src_samples ||
1468 key->tex_layout != key->src_layout) {
1469 src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
1470 key->src_layout);
1471 /* Now (X, Y, S) = detile(src_tiling, offset) */
1472 if (tex_tiled_w != key->src_tiled_w)
1473 src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
1474 /* Now (X, Y, S) = detile(tex_tiling, offset) */
1475 src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
1476 key->tex_layout);
1477 }
1478
1479 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
1480 *
1481 * In other words: X, Y, and S now contain values which, when passed to
1482 * the texturing unit, will cause data to be read from the correct
1483 * memory location. So we can fetch the texel now.
1484 */
1485 if (key->src_samples == 0) {
1486 color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
1487 } else {
1488 nir_ssa_def *mcs = NULL;
1489 if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
1490 mcs = blorp_nir_txf_ms_mcs(&b, src_pos);
1491
1492 color = blorp_nir_txf_ms(&b, src_pos, mcs, key->texture_data_type);
1493 }
1494 }
1495 }
1496
1497 nir_store_var(&b, v.color_out, color, 0xf);
1498
1499 return b.shader;
1500 }
1501
1502 static void
1503 brw_blorp_get_blit_kernel(struct brw_context *brw,
1504 struct brw_blorp_params *params,
1505 const struct brw_blorp_blit_prog_key *prog_key)
1506 {
1507 if (brw_search_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
1508 prog_key, sizeof(*prog_key),
1509 &params->wm_prog_kernel, &params->wm_prog_data))
1510 return;
1511
1512 const unsigned *program;
1513 unsigned program_size;
1514 struct brw_blorp_prog_data prog_data;
1515
1516 /* Try and compile with NIR first. If that fails, fall back to the old
1517 * method of building shaders manually.
1518 */
1519 nir_shader *nir = brw_blorp_build_nir_shader(brw, prog_key);
1520 struct brw_wm_prog_key wm_key;
1521 brw_blorp_init_wm_prog_key(&wm_key);
1522 wm_key.tex.compressed_multisample_layout_mask =
1523 prog_key->tex_layout == INTEL_MSAA_LAYOUT_CMS;
1524 wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
1525 wm_key.multisample_fbo = prog_key->rt_samples > 1;
1526
1527 program = brw_blorp_compile_nir_shader(brw, nir, &wm_key, false,
1528 &prog_data, &program_size);
1529
1530 brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
1531 prog_key, sizeof(*prog_key),
1532 program, program_size,
1533 &prog_data, sizeof(prog_data),
1534 &params->wm_prog_kernel, &params->wm_prog_data);
1535 }
1536
1537 static void
1538 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform,
1539 GLfloat src0, GLfloat src1,
1540 GLfloat dst0, GLfloat dst1,
1541 bool mirror)
1542 {
1543 float scale = (src1 - src0) / (dst1 - dst0);
1544 if (!mirror) {
1545 /* When not mirroring a coordinate (say, X), we need:
1546 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1547 * Therefore:
1548 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1549 *
1550 * The blorp program uses "round toward zero" to convert the
1551 * transformed floating point coordinates to integer coordinates,
1552 * whereas the behaviour we actually want is "round to nearest",
1553 * so 0.5 provides the necessary correction.
1554 */
1555 xform->multiplier = scale;
1556 xform->offset = src0 + (-dst0 + 0.5f) * scale;
1557 } else {
1558 /* When mirroring X we need:
1559 * src_x - src_x0 = (dst_x1 - dst_x - 0.5) * scale
1560 * Therefore:
1561 * src_x = src_x0 + (dst_x1 - dst_x - 0.5) * scale
1562 */
1563 xform->multiplier = -scale;
1564 xform->offset = src0 + (dst1 - 0.5f) * scale;
1565 }
1566 }
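/* A concrete illustration with assumed numbers (not from the original code):
 * blitting src X range [0, 8) onto dst X range [0, 4) without mirroring gives
 * scale == 2, so multiplier == 2 and offset == 1; dst pixel 0 then samples
 * src_x == 1.0 and dst pixel 3 samples src_x == 7.0. With mirroring the
 * multiplier becomes -2 and the offset 7, so dst pixel 0 samples src_x == 7.0
 * instead.
 */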
1567
1568
1569 /**
1570 * Determine which MSAA layout the GPU pipeline should be configured for,
1571 * based on the chip generation, the number of samples, and the true layout of
1572 * the image in memory.
1573 */
1574 inline intel_msaa_layout
1575 compute_msaa_layout_for_pipeline(struct brw_context *brw, unsigned num_samples,
1576 intel_msaa_layout true_layout)
1577 {
1578 if (num_samples <= 1) {
1579 /* Layout is used to determine if ld2dms is needed for sampling. In the
1580 * single-sampled case a normal ld is enough, which also avoids the need
1581 * to fetch the MCS. Therefore simply set the layout to none.
1582 */
1583 if (brw->gen >= 9 && true_layout == INTEL_MSAA_LAYOUT_CMS) {
1584 return INTEL_MSAA_LAYOUT_NONE;
1585 }
1586
1587 /* When configuring the GPU for non-MSAA, we can still accommodate IMS
1588 * format buffers, by transforming coordinates appropriately.
1589 */
1590 assert(true_layout == INTEL_MSAA_LAYOUT_NONE ||
1591 true_layout == INTEL_MSAA_LAYOUT_IMS);
1592 return INTEL_MSAA_LAYOUT_NONE;
1593 } else {
1594 assert(true_layout != INTEL_MSAA_LAYOUT_NONE);
1595 }
1596
1597 /* Prior to Gen7, all MSAA surfaces use IMS layout. */
1598 if (brw->gen == 6) {
1599 assert(true_layout == INTEL_MSAA_LAYOUT_IMS);
1600 }
1601
1602 return true_layout;
1603 }
1604
1605
1606 /**
1607 * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
1608 * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
1609 * the physical layer holding sample 0. So, for example, if
1610 * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
1611 * 4*n.
1612 */
1613 void
1614 brw_blorp_blit_miptrees(struct brw_context *brw,
1615 struct intel_mipmap_tree *src_mt,
1616 unsigned src_level, unsigned src_layer,
1617 mesa_format src_format, int src_swizzle,
1618 struct intel_mipmap_tree *dst_mt,
1619 unsigned dst_level, unsigned dst_layer,
1620 mesa_format dst_format,
1621 float src_x0, float src_y0,
1622 float src_x1, float src_y1,
1623 float dst_x0, float dst_y0,
1624 float dst_x1, float dst_y1,
1625 GLenum filter, bool mirror_x, bool mirror_y,
1626 bool decode_srgb, bool encode_srgb)
1627 {
1628 /* Get ready to blit. This includes depth resolving the src and dst
1629 * buffers if necessary. Note: it's not necessary to do a color resolve on
1630 * the destination buffer because we use the standard render path to render
1631 * to destination color buffers, and the standard render path is
1632 * fast-color-aware.
1633 */
1634 intel_miptree_resolve_color(brw, src_mt, INTEL_MIPTREE_IGNORE_CCS_E);
1635 intel_miptree_slice_resolve_depth(brw, src_mt, src_level, src_layer);
1636 intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_layer);
1637
1638 intel_miptree_prepare_mcs(brw, dst_mt);
1639
1640 DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f) "
1641 "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
1642 __func__,
1643 src_mt->num_samples, _mesa_get_format_name(src_mt->format), src_mt,
1644 src_level, src_layer, src_x0, src_y0, src_x1, src_y1,
1645 dst_mt->num_samples, _mesa_get_format_name(dst_mt->format), dst_mt,
1646 dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1,
1647 mirror_x, mirror_y);
1648
1649 if (!decode_srgb && _mesa_get_format_color_encoding(src_format) == GL_SRGB)
1650 src_format = _mesa_get_srgb_format_linear(src_format);
1651
1652 if (!encode_srgb && _mesa_get_format_color_encoding(dst_format) == GL_SRGB)
1653 dst_format = _mesa_get_srgb_format_linear(dst_format);
1654
1655 struct brw_blorp_params params;
1656 brw_blorp_params_init(&params);
1657
1658 brw_blorp_surface_info_init(brw, &params.src, src_mt, src_level,
1659 src_layer, src_format, false);
1660 brw_blorp_surface_info_init(brw, &params.dst, dst_mt, dst_level,
1661 dst_layer, dst_format, true);
1662
1663 /* Even though we do multisample resolves at the time of the blit, the OpenGL
1664 * specification defines them as if they happen at the time of rendering,
1665 * which means that the type of averaging we do during the resolve should
1666 * only depend on the source format; the destination format should be
1667 * ignored. But the specification doesn't seem to be strict about it.
1668 *
1669 * It has been observed that multisample resolves produce slightly better
1670 * looking images when the averaging is done using the destination format.
1671 * NVIDIA's proprietary OpenGL driver also follows this approach, so we
1672 * choose to follow it in ours.
1673 *
1674 * When multisampling, if the source and destination formats are equal
1675 * (aside from the color space), we choose to blit in sRGB space to get
1676 * this higher quality image.
1677 */
1678 if (params.src.num_samples > 1 &&
1679 _mesa_get_format_color_encoding(dst_mt->format) == GL_SRGB &&
1680 _mesa_get_srgb_format_linear(src_mt->format) ==
1681 _mesa_get_srgb_format_linear(dst_mt->format)) {
1682 assert(brw->format_supported_as_render_target[dst_mt->format]);
1683 params.dst.brw_surfaceformat = brw->render_target_format[dst_mt->format];
1684 params.src.brw_surfaceformat = brw_format_for_mesa_format(dst_mt->format);
1685 }
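/* Illustrative example (formats are assumptions, not from the original
 * source): resolving a 4x MESA_FORMAT_B8G8R8A8_UNORM source into a
 * MESA_FORMAT_B8G8R8A8_SRGB destination satisfies the check above, since
 * both reduce to the same linear format; both surface formats are then taken
 * from the sRGB destination, so the averaging is done as the comment above
 * describes.
 */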
1686
1687 /* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
1688 * texture, the above code configures the source format for L32_FLOAT or
1689 * I32_FLOAT, and the destination format for R32_FLOAT. On Sandy Bridge,
1690 * the SAMPLE message appears to handle multisampled L32_FLOAT and
1691 * I32_FLOAT textures incorrectly, resulting in blocky artifacts. So work
1692 * around the problem by using a source format of R32_FLOAT. This
1693 * shouldn't affect rendering correctness, since the destination format is
1694 * R32_FLOAT, so only the contents of the red channel matter.
1695 */
1696 if (brw->gen == 6 &&
1697 params.src.num_samples > 1 && params.dst.num_samples <= 1 &&
1698 src_mt->format == dst_mt->format &&
1699 params.dst.brw_surfaceformat == BRW_SURFACEFORMAT_R32_FLOAT) {
1700 params.src.brw_surfaceformat = params.dst.brw_surfaceformat;
1701 }
1702
1703 struct brw_blorp_blit_prog_key wm_prog_key;
1704 memset(&wm_prog_key, 0, sizeof(wm_prog_key));
1705
1706 /* texture_data_type indicates the register type that should be used to
1707 * manipulate texture data.
1708 */
1709 switch (_mesa_get_format_datatype(src_mt->format)) {
1710 case GL_UNSIGNED_NORMALIZED:
1711 case GL_SIGNED_NORMALIZED:
1712 case GL_FLOAT:
1713 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1714 break;
1715 case GL_UNSIGNED_INT:
1716 if (src_mt->format == MESA_FORMAT_S_UINT8) {
1717 /* We process stencil as though it's an unsigned normalized color */
1718 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1719 } else {
1720 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_UD;
1721 }
1722 break;
1723 case GL_INT:
1724 wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_D;
1725 break;
1726 default:
1727 unreachable("Unrecognized blorp format");
1728 }
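/* For instance (illustrative formats): MESA_FORMAT_R8G8B8A8_UNORM has
 * datatype GL_UNSIGNED_NORMALIZED and so is manipulated as
 * BRW_REGISTER_TYPE_F, MESA_FORMAT_R32G32B32A32_UINT has GL_UNSIGNED_INT and
 * uses BRW_REGISTER_TYPE_UD, while stencil (MESA_FORMAT_S_UINT8) is the one
 * unsigned-integer case handled as float, per the special case above.
 */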
1729
1730 if (brw->gen > 6) {
1731 /* Gen7's rendering hardware only supports the IMS layout for depth and
1732 * stencil render targets. Blorp always maps its destination surface as
1733 * a color render target (even if it's actually a depth or stencil
1734 * buffer). So if the destination is IMS, we'll have to map it as a
1735 * single-sampled texture and interleave the samples ourselves.
1736 */
1737 if (dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS)
1738 params.dst.num_samples = 0;
1739 }
1740
1741 if (params.dst.map_stencil_as_y_tiled && params.dst.num_samples > 1) {
1742 /* If the destination surface is a W-tiled multisampled stencil buffer
1743 * that we're mapping as Y tiled, then we need to arrange for the WM
1744 * program to run once per sample rather than once per pixel, because
1745 * the memory layout of related samples doesn't match between W and Y
1746 * tiling.
1747 */
1748 wm_prog_key.persample_msaa_dispatch = true;
1749 }
1750
1751 if (params.src.num_samples > 0 && params.dst.num_samples > 1) {
1752 /* We are blitting from a multisample buffer to a multisample buffer, so
1753 * we must preserve samples within a pixel. This means we have to
1754 * arrange for the WM program to run once per sample rather than once
1755 * per pixel.
1756 */
1757 wm_prog_key.persample_msaa_dispatch = true;
1758 }
1759
1760 /* Determine whether this is a scaled blit. */
1761 wm_prog_key.blit_scaled =
1762 (dst_x1 - dst_x0) != (src_x1 - src_x0) ||
1763 (dst_y1 - dst_y0) != (src_y1 - src_y0);
1764
1765 /* Scaling factors used for bilinear filtering in multisample scaled
1766 * blits.
1767 */
1768 wm_prog_key.x_scale = 2.0f;
1769 wm_prog_key.y_scale = src_mt->num_samples / 2.0f;
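/* For example (illustrative), an 8x MSAA source gives x_scale = 2.0 and
 * y_scale = 8 / 2.0 = 4.0, so the rect_grid_x1/rect_grid_y1 push constants
 * computed below span a grid 2x wider and 4x taller than the source miplevel.
 */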
1770
1771 if (filter == GL_LINEAR &&
1772 params.src.num_samples <= 1 && params.dst.num_samples <= 1)
1773 wm_prog_key.bilinear_filter = true;
1774
1775 GLenum base_format = _mesa_get_format_base_format(src_mt->format);
1776 if (base_format != GL_DEPTH_COMPONENT && /* TODO: what about depth/stencil? */
1777 base_format != GL_STENCIL_INDEX &&
1778 !_mesa_is_format_integer(src_mt->format) &&
1779 src_mt->num_samples > 1 && dst_mt->num_samples <= 1) {
1780 /* We are downsampling a non-integer color buffer, so blend.
1781 *
1782 * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1783 *
1784 * "If the source formats are integer types or stencil values, a
1785 * single sample's value is selected for each pixel."
1786 *
1787 * This implies we should not blend in that case.
1788 */
1789 wm_prog_key.blend = true;
1790 }
1791
1792 /* src_samples and dst_samples are the true sample counts */
1793 wm_prog_key.src_samples = src_mt->num_samples;
1794 wm_prog_key.dst_samples = dst_mt->num_samples;
1795
1796 /* tex_samples and rt_samples are the sample counts that are set up in
1797 * SURFACE_STATE.
1798 */
1799 wm_prog_key.tex_samples = params.src.num_samples;
1800 wm_prog_key.rt_samples = params.dst.num_samples;
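/* These can differ from the true sample counts above: for example, when the
 * destination is an 8x IMS buffer on Gen7+, dst_samples stays 8 while
 * rt_samples is 0, because the destination was mapped as a single-sampled
 * color target earlier in this function.
 */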
1801
1802 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
1803 * use to access the source and destination surfaces.
1804 */
1805 wm_prog_key.tex_layout =
1806 compute_msaa_layout_for_pipeline(brw, params.src.num_samples,
1807 params.src.msaa_layout);
1808 wm_prog_key.rt_layout =
1809 compute_msaa_layout_for_pipeline(brw, params.dst.num_samples,
1810 params.dst.msaa_layout);
1811
1812 /* src_layout and dst_layout indicate the true MSAA layout used by src and
1813 * dst.
1814 */
1815 wm_prog_key.src_layout = src_mt->msaa_layout;
1816 wm_prog_key.dst_layout = dst_mt->msaa_layout;
1817
1818 /* On gen9+, compressed single-sampled buffers carry the same layout type as
1819 * multisampled ones. The difference is that they can be sampled using the
1820 * normal ld message and, as render targets, behave just like non-compressed
1821 * surfaces from the compiler's point of view. Therefore override the type
1822 * in the program key.
1823 */
1824 if (brw->gen >= 9 && params.src.num_samples <= 1 &&
1825 src_mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS)
1826 wm_prog_key.src_layout = INTEL_MSAA_LAYOUT_NONE;
1827 if (brw->gen >= 9 && params.dst.num_samples <= 1 &&
1828 dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS)
1829 wm_prog_key.dst_layout = INTEL_MSAA_LAYOUT_NONE;
1830
1831 wm_prog_key.src_tiled_w = params.src.map_stencil_as_y_tiled;
1832 wm_prog_key.dst_tiled_w = params.dst.map_stencil_as_y_tiled;
1833 /* Round floating point values to the nearest integer to avoid "off by one
1834 * texel" errors when blitting.
1835 */
1836 params.x0 = params.wm_push_consts.dst_x0 = roundf(dst_x0);
1837 params.y0 = params.wm_push_consts.dst_y0 = roundf(dst_y0);
1838 params.x1 = params.wm_push_consts.dst_x1 = roundf(dst_x1);
1839 params.y1 = params.wm_push_consts.dst_y1 = roundf(dst_y1);
1840 params.wm_push_consts.rect_grid_x1 =
1841 minify(src_mt->logical_width0, src_level) * wm_prog_key.x_scale - 1.0f;
1842 params.wm_push_consts.rect_grid_y1 =
1843 minify(src_mt->logical_height0, src_level) * wm_prog_key.y_scale - 1.0f;
1844
1845 brw_blorp_setup_coord_transform(&params.wm_push_consts.x_transform,
1846 src_x0, src_x1, dst_x0, dst_x1, mirror_x);
1847 brw_blorp_setup_coord_transform(&params.wm_push_consts.y_transform,
1848 src_y0, src_y1, dst_y0, dst_y1, mirror_y);
1849
1850 params.wm_push_consts.src_z =
1851 params.src.mt->target == GL_TEXTURE_3D ? params.src.layer : 0;
1852
1853 if (params.dst.num_samples <= 1 && dst_mt->num_samples > 1) {
1854 /* We must expand the rectangle we send through the rendering pipeline,
1855 * to account for the fact that we are mapping the destination region as
1856 * single-sampled when it is in fact multisampled. We must also align
1857 * it to a multiple of the multisampling pattern, because the
1858 * differences between multisampled and single-sampled surface formats
1859 * will mean that pixels are scrambled within the multisampling pattern.
1860 * TODO: what if this makes the coordinates too large?
1861 *
1862 * Note: this only works if the destination surface uses the IMS layout.
1863 * If it's UMS, then we have no choice but to set up the rendering
1864 * pipeline as multisampled.
1865 */
1866 assert(dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS);
1867 switch (dst_mt->num_samples) {
1868 case 2:
1869 params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1870 params.y0 = ROUND_DOWN_TO(params.y0, 4);
1871 params.x1 = ALIGN(params.x1 * 2, 4);
1872 params.y1 = ALIGN(params.y1, 4);
1873 break;
1874 case 4:
1875 params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1876 params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1877 params.x1 = ALIGN(params.x1 * 2, 4);
1878 params.y1 = ALIGN(params.y1 * 2, 4);
1879 break;
1880 case 8:
1881 params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1882 params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1883 params.x1 = ALIGN(params.x1 * 4, 8);
1884 params.y1 = ALIGN(params.y1 * 2, 4);
1885 break;
1886 case 16:
1887 params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1888 params.y0 = ROUND_DOWN_TO(params.y0 * 4, 8);
1889 params.x1 = ALIGN(params.x1 * 4, 8);
1890 params.y1 = ALIGN(params.y1 * 4, 8);
1891 break;
1892 default:
1893 unreachable("Unrecognized sample count in brw_blorp_blit_miptrees");
1894 }
1895 wm_prog_key.use_kill = true;
1896 }
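/* Worked example (illustrative): for a 4x IMS destination, a pixel rectangle
 * from (x0, y0) = (1, 1) to (x1, y1) = (5, 3) becomes, in the interleaved
 * single-sampled coordinate space,
 *
 *    x0 = ROUND_DOWN_TO(1 * 2, 4) = 0     x1 = ALIGN(5 * 2, 4) = 12
 *    y0 = ROUND_DOWN_TO(1 * 2, 4) = 0     y1 = ALIGN(3 * 2, 4) = 8
 *
 * i.e. (0, 0) to (12, 8); use_kill then discards the extra fragments that
 * fall outside the original destination rectangle.
 */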
1897
1898 if (params.dst.map_stencil_as_y_tiled) {
1899 /* We must modify the rectangle we send through the rendering pipeline
1900 * (and the size and x/y offset of the destination surface), to account
1901 * for the fact that we are mapping it as Y-tiled when it is in fact
1902 * W-tiled.
1903 *
1904 * Both Y tiling and W tiling can be understood as organizations of
1905 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1906 * is different, but the layout of the 32-byte sub-tiles within the 4k
1907 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1908 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide
1909 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1910 *
1911 * Therefore, to account for the layout differences within the 32-byte
1912 * sub-tiles, we must expand the rectangle so the X coordinates of its
1913 * edges are multiples of 8 (the W sub-tile width), and its Y
1914 * coordinates of its edges are multiples of 4 (the W sub-tile height).
1915 * Then we need to scale the X and Y coordinates of the rectangle to
1916 * account for the differences in aspect ratio between the Y and W
1917 * sub-tiles. We need to modify the layer width and height similarly.
1918 *
1919 * A correction needs to be applied when MSAA is in use: since
1920 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1921 * we need to align the Y coordinates to multiples of 8, so that when
1922 * they are divided by two they are still multiples of 4.
1923 *
1924 * Note: Since the x/y offset of the surface will be applied using the
1925 * SURFACE_STATE command packet, it will be invisible to the swizzling
1926 * code in the shader; therefore it needs to be a multiple of the
1927 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
1928 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1929 * buffer), and the miplevel alignment used for stencil buffers is 8
1930 * pixels horizontally and either 4 or 8 pixels vertically (see
1931 * intel_horizontal_texture_alignment_unit() and
1932 * intel_vertical_texture_alignment_unit()).
1933 *
1934 * Note: Also, since the SURFACE_STATE command packet can only apply
1935 * offsets that are multiples of 4 pixels horizontally and 2 pixels
1936 * vertically, it is important that the offsets will be multiples of
1937 * these sizes after they are converted into Y-tiled coordinates.
1938 * Fortunately they will be, since we know from above that the offsets
1939 * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1940 * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1941 *
1942 * TODO: what if this makes the coordinates (or the texture size) too
1943 * large?
1944 */
1945 const unsigned x_align = 8, y_align = params.dst.num_samples != 0 ? 8 : 4;
1946 params.x0 = ROUND_DOWN_TO(params.x0, x_align) * 2;
1947 params.y0 = ROUND_DOWN_TO(params.y0, y_align) / 2;
1948 params.x1 = ALIGN(params.x1, x_align) * 2;
1949 params.y1 = ALIGN(params.y1, y_align) / 2;
1950 params.dst.width = ALIGN(params.dst.width, x_align) * 2;
1951 params.dst.height = ALIGN(params.dst.height, y_align) / 2;
1952 params.dst.x_offset *= 2;
1953 params.dst.y_offset /= 2;
1954 wm_prog_key.use_kill = true;
1955 }
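/* Worked example (illustrative, assuming params.dst.num_samples == 0 so
 * y_align = 4): a W-tiled destination rectangle from (x0, y0) = (3, 5) to
 * (x1, y1) = (10, 9) becomes, in the Y-tiled coordinate space,
 *
 *    x0 = ROUND_DOWN_TO(3, 8) * 2 = 0     x1 = ALIGN(10, 8) * 2 = 32
 *    y0 = ROUND_DOWN_TO(5, 4) / 2 = 2     y1 = ALIGN(9, 4) / 2 = 6
 *
 * and the surface size and x/y offsets are rescaled the same way, with
 * use_kill again discarding fragments outside the real destination rectangle.
 */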
1956
1957 if (params.src.map_stencil_as_y_tiled) {
1958 /* We must modify the size and x/y offset of the source surface to
1959 * account for the fact that we are mapping it as Y-tiled when it is in
1960 * fact W tiled.
1961 *
1962 * See the comments above concerning x/y offset alignment for the
1963 * destination surface.
1964 *
1965 * TODO: what if this makes the texture size too large?
1966 */
1967 const unsigned x_align = 8, y_align = params.src.num_samples != 0 ? 8 : 4;
1968 params.src.width = ALIGN(params.src.width, x_align) * 2;
1969 params.src.height = ALIGN(params.src.height, y_align) / 2;
1970 params.src.x_offset *= 2;
1971 params.src.y_offset /= 2;
1972 }
1973
1974 brw_blorp_get_blit_kernel(brw, &params, &wm_prog_key);
1975
1976 params.src.swizzle = src_swizzle;
1977
1978 brw_blorp_exec(brw, &params);
1979
1980 intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_layer);
1981
1982 if (intel_miptree_is_lossless_compressed(brw, dst_mt))
1983 dst_mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_UNRESOLVED;
1984 }