2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 #include "main/context.h"
25 #include "main/teximage.h"
26 #include "main/fbobject.h"
28 #include "compiler/nir/nir_builder.h"
30 #include "intel_fbo.h"
32 #include "brw_blorp.h"
33 #include "brw_context.h"
34 #include "brw_state.h"
35 #include "brw_meta_util.h"
37 #define FILE_DEBUG_FLAG DEBUG_BLORP
39 static struct intel_mipmap_tree
*
40 find_miptree(GLbitfield buffer_bit
, struct intel_renderbuffer
*irb
)
42 struct intel_mipmap_tree
*mt
= irb
->mt
;
43 if (buffer_bit
== GL_STENCIL_BUFFER_BIT
&& mt
->stencil_mt
)
49 blorp_get_texture_swizzle(const struct intel_renderbuffer
*irb
)
51 return irb
->Base
.Base
._BaseFormat
== GL_RGB
?
52 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ONE
) :
57 do_blorp_blit(struct brw_context
*brw
, GLbitfield buffer_bit
,
58 struct intel_renderbuffer
*src_irb
, mesa_format src_format
,
59 struct intel_renderbuffer
*dst_irb
, mesa_format dst_format
,
60 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
61 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
62 GLenum filter
, bool mirror_x
, bool mirror_y
)
64 /* Find source/dst miptrees */
65 struct intel_mipmap_tree
*src_mt
= find_miptree(buffer_bit
, src_irb
);
66 struct intel_mipmap_tree
*dst_mt
= find_miptree(buffer_bit
, dst_irb
);
68 const bool es3
= _mesa_is_gles3(&brw
->ctx
);
70 brw_blorp_blit_miptrees(brw
,
71 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
72 src_format
, blorp_get_texture_swizzle(src_irb
),
73 dst_mt
, dst_irb
->mt_level
, dst_irb
->mt_layer
,
75 srcX0
, srcY0
, srcX1
, srcY1
,
76 dstX0
, dstY0
, dstX1
, dstY1
,
77 filter
, mirror_x
, mirror_y
,
80 dst_irb
->need_downsample
= true;
84 try_blorp_blit(struct brw_context
*brw
,
85 const struct gl_framebuffer
*read_fb
,
86 const struct gl_framebuffer
*draw_fb
,
87 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
88 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
89 GLenum filter
, GLbitfield buffer_bit
)
91 struct gl_context
*ctx
= &brw
->ctx
;
93 /* Sync up the state of window system buffers. We need to do this before
94 * we go looking for the buffers.
96 intel_prepare_render(brw
);
98 bool mirror_x
, mirror_y
;
99 if (brw_meta_mirror_clip_and_scissor(ctx
, read_fb
, draw_fb
,
100 &srcX0
, &srcY0
, &srcX1
, &srcY1
,
101 &dstX0
, &dstY0
, &dstX1
, &dstY1
,
102 &mirror_x
, &mirror_y
))
106 struct intel_renderbuffer
*src_irb
;
107 struct intel_renderbuffer
*dst_irb
;
108 struct intel_mipmap_tree
*src_mt
;
109 struct intel_mipmap_tree
*dst_mt
;
110 switch (buffer_bit
) {
111 case GL_COLOR_BUFFER_BIT
:
112 src_irb
= intel_renderbuffer(read_fb
->_ColorReadBuffer
);
113 for (unsigned i
= 0; i
< draw_fb
->_NumColorDrawBuffers
; ++i
) {
114 dst_irb
= intel_renderbuffer(draw_fb
->_ColorDrawBuffers
[i
]);
116 do_blorp_blit(brw
, buffer_bit
,
117 src_irb
, src_irb
->Base
.Base
.Format
,
118 dst_irb
, dst_irb
->Base
.Base
.Format
,
119 srcX0
, srcY0
, srcX1
, srcY1
,
120 dstX0
, dstY0
, dstX1
, dstY1
,
121 filter
, mirror_x
, mirror_y
);
124 case GL_DEPTH_BUFFER_BIT
:
126 intel_renderbuffer(read_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
128 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
129 src_mt
= find_miptree(buffer_bit
, src_irb
);
130 dst_mt
= find_miptree(buffer_bit
, dst_irb
);
132 /* We can't handle format conversions between Z24 and other formats
133 * since we have to lie about the surface format. See the comments in
134 * brw_blorp_surface_info::set().
136 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
137 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
))
140 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
141 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
142 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
143 filter
, mirror_x
, mirror_y
);
145 case GL_STENCIL_BUFFER_BIT
:
147 intel_renderbuffer(read_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
149 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
150 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
151 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
152 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
153 filter
, mirror_x
, mirror_y
);
156 unreachable("not reached");
163 brw_blorp_copytexsubimage(struct brw_context
*brw
,
164 struct gl_renderbuffer
*src_rb
,
165 struct gl_texture_image
*dst_image
,
167 int srcX0
, int srcY0
,
168 int dstX0
, int dstY0
,
169 int width
, int height
)
171 struct gl_context
*ctx
= &brw
->ctx
;
172 struct intel_renderbuffer
*src_irb
= intel_renderbuffer(src_rb
);
173 struct intel_texture_image
*intel_image
= intel_texture_image(dst_image
);
175 /* No pixel transfer operations (zoom, bias, mapping), just a blit */
176 if (brw
->ctx
._ImageTransferState
)
179 /* Sync up the state of window system buffers. We need to do this before
180 * we go looking at the src renderbuffer's miptree.
182 intel_prepare_render(brw
);
184 struct intel_mipmap_tree
*src_mt
= src_irb
->mt
;
185 struct intel_mipmap_tree
*dst_mt
= intel_image
->mt
;
187 /* There is support for only up to eight samples. */
188 if (src_mt
->num_samples
> 8 || dst_mt
->num_samples
> 8)
191 /* BLORP is only supported from Gen6 onwards. */
195 if (_mesa_get_format_base_format(src_rb
->Format
) !=
196 _mesa_get_format_base_format(dst_image
->TexFormat
)) {
200 /* We can't handle format conversions between Z24 and other formats since
201 * we have to lie about the surface format. See the comments in
202 * brw_blorp_surface_info::set().
204 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
205 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
)) {
209 if (!brw
->format_supported_as_render_target
[dst_image
->TexFormat
])
212 /* Source clipping shouldn't be necessary, since copytexsubimage (in
213 * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
216 * Destination clipping shouldn't be necessary since the restrictions on
217 * glCopyTexSubImage prevent the user from specifying a destination rectangle
218 * that falls outside the bounds of the destination texture.
219 * See error_check_subtexture_dimensions().
222 int srcY1
= srcY0
+ height
;
223 int srcX1
= srcX0
+ width
;
224 int dstX1
= dstX0
+ width
;
225 int dstY1
= dstY0
+ height
;
227 /* Account for the fact that in the system framebuffer, the origin is at
230 bool mirror_y
= false;
231 if (_mesa_is_winsys_fbo(ctx
->ReadBuffer
)) {
232 GLint tmp
= src_rb
->Height
- srcY0
;
233 srcY0
= src_rb
->Height
- srcY1
;
238 /* Account for face selection and texture view MinLayer */
239 int dst_slice
= slice
+ dst_image
->TexObject
->MinLayer
+ dst_image
->Face
;
240 int dst_level
= dst_image
->Level
+ dst_image
->TexObject
->MinLevel
;
242 brw_blorp_blit_miptrees(brw
,
243 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
244 src_rb
->Format
, blorp_get_texture_swizzle(src_irb
),
245 dst_mt
, dst_level
, dst_slice
,
246 dst_image
->TexFormat
,
247 srcX0
, srcY0
, srcX1
, srcY1
,
248 dstX0
, dstY0
, dstX1
, dstY1
,
249 GL_NEAREST
, false, mirror_y
,
252 /* If we're copying to a packed depth stencil texture and the source
253 * framebuffer has separate stencil, we need to also copy the stencil data
256 src_rb
= ctx
->ReadBuffer
->Attachment
[BUFFER_STENCIL
].Renderbuffer
;
257 if (_mesa_get_format_bits(dst_image
->TexFormat
, GL_STENCIL_BITS
) > 0 &&
259 src_irb
= intel_renderbuffer(src_rb
);
260 src_mt
= src_irb
->mt
;
262 if (src_mt
->stencil_mt
)
263 src_mt
= src_mt
->stencil_mt
;
264 if (dst_mt
->stencil_mt
)
265 dst_mt
= dst_mt
->stencil_mt
;
267 if (src_mt
!= dst_mt
) {
268 brw_blorp_blit_miptrees(brw
,
269 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
271 blorp_get_texture_swizzle(src_irb
),
272 dst_mt
, dst_level
, dst_slice
,
274 srcX0
, srcY0
, srcX1
, srcY1
,
275 dstX0
, dstY0
, dstX1
, dstY1
,
276 GL_NEAREST
, false, mirror_y
,
286 brw_blorp_framebuffer(struct brw_context
*brw
,
287 struct gl_framebuffer
*readFb
,
288 struct gl_framebuffer
*drawFb
,
289 GLint srcX0
, GLint srcY0
, GLint srcX1
, GLint srcY1
,
290 GLint dstX0
, GLint dstY0
, GLint dstX1
, GLint dstY1
,
291 GLbitfield mask
, GLenum filter
)
293 /* BLORP is not supported before Gen6. */
297 /* There is support for only up to eight samples. */
298 if (readFb
->Visual
.samples
> 8 || drawFb
->Visual
.samples
> 8)
301 static GLbitfield buffer_bits
[] = {
304 GL_STENCIL_BUFFER_BIT
,
307 for (unsigned int i
= 0; i
< ARRAY_SIZE(buffer_bits
); ++i
) {
308 if ((mask
& buffer_bits
[i
]) &&
309 try_blorp_blit(brw
, readFb
, drawFb
,
310 srcX0
, srcY0
, srcX1
, srcY1
,
311 dstX0
, dstY0
, dstX1
, dstY1
,
312 filter
, buffer_bits
[i
])) {
313 mask
&= ~buffer_bits
[i
];
/**
 * Enum to specify the order of arguments in a sampler message
 */
enum sampler_message_arg
{
   SAMPLER_MESSAGE_ARG_U_FLOAT,
   SAMPLER_MESSAGE_ARG_V_FLOAT,
   SAMPLER_MESSAGE_ARG_U_INT,
   SAMPLER_MESSAGE_ARG_V_INT,
   SAMPLER_MESSAGE_ARG_R_INT,
   SAMPLER_MESSAGE_ARG_SI_INT,
   SAMPLER_MESSAGE_ARG_MCS_INT,
   SAMPLER_MESSAGE_ARG_ZERO_INT,
};
336 struct brw_blorp_blit_vars
{
337 /* Uniforms values from brw_blorp_wm_push_constants */
338 nir_variable
*u_dst_x0
;
339 nir_variable
*u_dst_x1
;
340 nir_variable
*u_dst_y0
;
341 nir_variable
*u_dst_y1
;
342 nir_variable
*u_rect_grid_x1
;
343 nir_variable
*u_rect_grid_y1
;
345 nir_variable
*multiplier
;
346 nir_variable
*offset
;
347 } u_x_transform
, u_y_transform
;
348 nir_variable
*u_src_z
;
351 nir_variable
*frag_coord
;
354 nir_variable
*color_out
;
358 brw_blorp_blit_vars_init(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
359 const struct brw_blorp_blit_prog_key
*key
)
361 #define LOAD_UNIFORM(name, type)\
362 v->u_##name = nir_variable_create(b->shader, nir_var_uniform, type, #name); \
363 v->u_##name->data.location = \
364 offsetof(struct brw_blorp_wm_push_constants, name);
366 LOAD_UNIFORM(dst_x0
, glsl_uint_type())
367 LOAD_UNIFORM(dst_x1
, glsl_uint_type())
368 LOAD_UNIFORM(dst_y0
, glsl_uint_type())
369 LOAD_UNIFORM(dst_y1
, glsl_uint_type())
370 LOAD_UNIFORM(rect_grid_x1
, glsl_float_type())
371 LOAD_UNIFORM(rect_grid_y1
, glsl_float_type())
372 LOAD_UNIFORM(x_transform
.multiplier
, glsl_float_type())
373 LOAD_UNIFORM(x_transform
.offset
, glsl_float_type())
374 LOAD_UNIFORM(y_transform
.multiplier
, glsl_float_type())
375 LOAD_UNIFORM(y_transform
.offset
, glsl_float_type())
376 LOAD_UNIFORM(src_z
, glsl_uint_type())
380 v
->frag_coord
= nir_variable_create(b
->shader
, nir_var_shader_in
,
381 glsl_vec4_type(), "gl_FragCoord");
382 v
->frag_coord
->data
.location
= VARYING_SLOT_POS
;
383 v
->frag_coord
->data
.origin_upper_left
= true;
385 v
->color_out
= nir_variable_create(b
->shader
, nir_var_shader_out
,
386 glsl_vec4_type(), "gl_FragColor");
387 v
->color_out
->data
.location
= FRAG_RESULT_COLOR
;
391 blorp_blit_get_frag_coords(nir_builder
*b
,
392 const struct brw_blorp_blit_prog_key
*key
,
393 struct brw_blorp_blit_vars
*v
)
395 nir_ssa_def
*coord
= nir_f2i(b
, nir_load_var(b
, v
->frag_coord
));
397 if (key
->persample_msaa_dispatch
) {
398 return nir_vec3(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1),
399 nir_load_system_value(b
, nir_intrinsic_load_sample_id
, 0));
401 return nir_vec2(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1));
406 * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
410 blorp_blit_apply_transform(nir_builder
*b
, nir_ssa_def
*src_pos
,
411 struct brw_blorp_blit_vars
*v
)
413 nir_ssa_def
*offset
= nir_vec2(b
, nir_load_var(b
, v
->u_x_transform
.offset
),
414 nir_load_var(b
, v
->u_y_transform
.offset
));
415 nir_ssa_def
*mul
= nir_vec2(b
, nir_load_var(b
, v
->u_x_transform
.multiplier
),
416 nir_load_var(b
, v
->u_y_transform
.multiplier
));
418 return nir_ffma(b
, src_pos
, mul
, offset
);
422 blorp_nir_discard_if_outside_rect(nir_builder
*b
, nir_ssa_def
*pos
,
423 struct brw_blorp_blit_vars
*v
)
425 nir_ssa_def
*c0
, *c1
, *c2
, *c3
;
426 c0
= nir_ult(b
, nir_channel(b
, pos
, 0), nir_load_var(b
, v
->u_dst_x0
));
427 c1
= nir_uge(b
, nir_channel(b
, pos
, 0), nir_load_var(b
, v
->u_dst_x1
));
428 c2
= nir_ult(b
, nir_channel(b
, pos
, 1), nir_load_var(b
, v
->u_dst_y0
));
429 c3
= nir_uge(b
, nir_channel(b
, pos
, 1), nir_load_var(b
, v
->u_dst_y1
));
430 nir_ssa_def
*oob
= nir_ior(b
, nir_ior(b
, c0
, c1
), nir_ior(b
, c2
, c3
));
432 nir_intrinsic_instr
*discard
=
433 nir_intrinsic_instr_create(b
->shader
, nir_intrinsic_discard_if
);
434 discard
->src
[0] = nir_src_for_ssa(oob
);
435 nir_builder_instr_insert(b
, &discard
->instr
);
438 static nir_tex_instr
*
439 blorp_create_nir_tex_instr(nir_shader
*shader
, nir_texop op
,
440 nir_ssa_def
*pos
, unsigned num_srcs
,
441 enum brw_reg_type dst_type
)
443 nir_tex_instr
*tex
= nir_tex_instr_create(shader
, num_srcs
);
448 case BRW_REGISTER_TYPE_F
:
449 tex
->dest_type
= nir_type_float
;
451 case BRW_REGISTER_TYPE_D
:
452 tex
->dest_type
= nir_type_int
;
454 case BRW_REGISTER_TYPE_UD
:
455 tex
->dest_type
= nir_type_uint
;
458 unreachable("Invalid texture return type");
461 tex
->is_array
= false;
462 tex
->is_shadow
= false;
464 /* Blorp only has one texture and it's bound at unit 0 */
467 tex
->texture_index
= 0;
468 tex
->sampler_index
= 0;
470 nir_ssa_dest_init(&tex
->instr
, &tex
->dest
, 4, 32, NULL
);
476 blorp_nir_tex(nir_builder
*b
, nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
479 blorp_create_nir_tex_instr(b
->shader
, nir_texop_tex
, pos
, 2, dst_type
);
481 assert(pos
->num_components
== 2);
482 tex
->sampler_dim
= GLSL_SAMPLER_DIM_2D
;
483 tex
->coord_components
= 2;
484 tex
->src
[0].src_type
= nir_tex_src_coord
;
485 tex
->src
[0].src
= nir_src_for_ssa(pos
);
486 tex
->src
[1].src_type
= nir_tex_src_lod
;
487 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
489 nir_builder_instr_insert(b
, &tex
->instr
);
491 return &tex
->dest
.ssa
;
495 blorp_nir_txf(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
496 nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
499 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf
, pos
, 2, dst_type
);
501 /* In order to properly handle 3-D textures, we pull the Z component from
502 * a uniform. TODO: This is a bit magic; we should probably make this
503 * more explicit in the future.
505 assert(pos
->num_components
== 2);
506 pos
= nir_vec3(b
, nir_channel(b
, pos
, 0), nir_channel(b
, pos
, 1),
507 nir_load_var(b
, v
->u_src_z
));
509 tex
->sampler_dim
= GLSL_SAMPLER_DIM_3D
;
510 tex
->coord_components
= 3;
511 tex
->src
[0].src_type
= nir_tex_src_coord
;
512 tex
->src
[0].src
= nir_src_for_ssa(pos
);
513 tex
->src
[1].src_type
= nir_tex_src_lod
;
514 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
516 nir_builder_instr_insert(b
, &tex
->instr
);
518 return &tex
->dest
.ssa
;
522 blorp_nir_txf_ms(nir_builder
*b
, nir_ssa_def
*pos
, nir_ssa_def
*mcs
,
523 enum brw_reg_type dst_type
)
526 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf_ms
, pos
,
527 mcs
!= NULL
? 3 : 2, dst_type
);
529 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
530 tex
->coord_components
= 2;
531 tex
->src
[0].src_type
= nir_tex_src_coord
;
532 tex
->src
[0].src
= nir_src_for_ssa(pos
);
534 tex
->src
[1].src_type
= nir_tex_src_ms_index
;
535 if (pos
->num_components
== 2) {
536 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
538 assert(pos
->num_components
== 3);
539 tex
->src
[1].src
= nir_src_for_ssa(nir_channel(b
, pos
, 2));
543 tex
->src
[2].src_type
= nir_tex_src_ms_mcs
;
544 tex
->src
[2].src
= nir_src_for_ssa(mcs
);
547 nir_builder_instr_insert(b
, &tex
->instr
);
549 return &tex
->dest
.ssa
;
553 blorp_nir_txf_ms_mcs(nir_builder
*b
, nir_ssa_def
*pos
)
556 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf_ms_mcs
,
557 pos
, 1, BRW_REGISTER_TYPE_D
);
559 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
560 tex
->coord_components
= 2;
561 tex
->src
[0].src_type
= nir_tex_src_coord
;
562 tex
->src
[0].src
= nir_src_for_ssa(pos
);
564 nir_builder_instr_insert(b
, &tex
->instr
);
566 return &tex
->dest
.ssa
;
570 nir_mask_shift_or(struct nir_builder
*b
, nir_ssa_def
*dst
, nir_ssa_def
*src
,
571 uint32_t src_mask
, int src_left_shift
)
573 nir_ssa_def
*masked
= nir_iand(b
, src
, nir_imm_int(b
, src_mask
));
575 nir_ssa_def
*shifted
;
576 if (src_left_shift
> 0) {
577 shifted
= nir_ishl(b
, masked
, nir_imm_int(b
, src_left_shift
));
578 } else if (src_left_shift
< 0) {
579 shifted
= nir_ushr(b
, masked
, nir_imm_int(b
, -src_left_shift
));
581 assert(src_left_shift
== 0);
585 return nir_ior(b
, dst
, shifted
);
589 * Emit code to compensate for the difference between Y and W tiling.
591 * This code modifies the X and Y coordinates according to the formula:
593 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
595 * (See brw_blorp_build_nir_shader).
597 static inline nir_ssa_def
*
598 blorp_nir_retile_y_to_w(nir_builder
*b
, nir_ssa_def
*pos
)
600 assert(pos
->num_components
== 2);
601 nir_ssa_def
*x_Y
= nir_channel(b
, pos
, 0);
602 nir_ssa_def
*y_Y
= nir_channel(b
, pos
, 1);
604 /* Given X and Y coordinates that describe an address using Y tiling,
605 * translate to the X and Y coordinates that describe the same address
608 * If we break down the low order bits of X and Y, using a
609 * single letter to represent each low-order bit:
611 * X = A << 7 | 0bBCDEFGH
612 * Y = J << 5 | 0bKLMNP (1)
614 * Then we can apply the Y tiling formula to see the memory offset being
617 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)
619 * If we apply the W detiling formula to this memory location, that the
620 * corresponding X' and Y' coordinates are:
622 * X' = A << 6 | 0bBCDPFH (3)
623 * Y' = J << 6 | 0bKLMNEG
625 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
626 * we need to make the following computation:
628 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)
629 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
631 nir_ssa_def
*x_W
= nir_imm_int(b
, 0);
632 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0xfffffff4, -1);
633 x_W
= nir_mask_shift_or(b
, x_W
, y_Y
, 0x1, 2);
634 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0x1, 0);
636 nir_ssa_def
*y_W
= nir_imm_int(b
, 0);
637 y_W
= nir_mask_shift_or(b
, y_W
, y_Y
, 0xfffffffe, 1);
638 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x8, -2);
639 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x2, -1);
641 return nir_vec2(b
, x_W
, y_W
);
645 * Emit code to compensate for the difference between Y and W tiling.
647 * This code modifies the X and Y coordinates according to the formula:
649 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
651 * (See brw_blorp_build_nir_shader).
653 static inline nir_ssa_def
*
654 blorp_nir_retile_w_to_y(nir_builder
*b
, nir_ssa_def
*pos
)
656 assert(pos
->num_components
== 2);
657 nir_ssa_def
*x_W
= nir_channel(b
, pos
, 0);
658 nir_ssa_def
*y_W
= nir_channel(b
, pos
, 1);
660 /* Applying the same logic as above, but in reverse, we obtain the
663 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
664 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
666 nir_ssa_def
*x_Y
= nir_imm_int(b
, 0);
667 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0xfffffffa, 1);
668 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x2, 2);
669 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x1, 1);
670 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0x1, 0);
672 nir_ssa_def
*y_Y
= nir_imm_int(b
, 0);
673 y_Y
= nir_mask_shift_or(b
, y_Y
, y_W
, 0xfffffffc, -1);
674 y_Y
= nir_mask_shift_or(b
, y_Y
, x_W
, 0x4, -2);
676 return nir_vec2(b
, x_Y
, y_Y
);
680 * Emit code to compensate for the difference between MSAA and non-MSAA
683 * This code modifies the X and Y coordinates according to the formula:
685 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
687 * (See brw_blorp_blit_program).
689 static inline nir_ssa_def
*
690 blorp_nir_encode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
691 unsigned num_samples
, enum intel_msaa_layout layout
)
693 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
696 case INTEL_MSAA_LAYOUT_NONE
:
697 assert(pos
->num_components
== 2);
699 case INTEL_MSAA_LAYOUT_CMS
:
700 /* We can't compensate for compressed layout since at this point in the
701 * program we haven't read from the MCS buffer.
703 unreachable("Bad layout in encode_msaa");
704 case INTEL_MSAA_LAYOUT_UMS
:
705 /* No translation needed */
707 case INTEL_MSAA_LAYOUT_IMS
: {
708 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
709 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
710 nir_ssa_def
*s_in
= pos
->num_components
== 2 ? nir_imm_int(b
, 0) :
711 nir_channel(b
, pos
, 2);
713 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
714 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
715 switch (num_samples
) {
718 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
719 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
722 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
723 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
724 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
726 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 1);
727 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
728 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
729 if (num_samples
== 2) {
732 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
733 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
734 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
739 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
740 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
742 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
744 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
745 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
746 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
747 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
748 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
749 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
750 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
754 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
755 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
757 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10)
760 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
761 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
762 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
763 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
764 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 2);
765 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x8, -1);
766 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
767 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
771 unreachable("Invalid number of samples for IMS layout");
774 return nir_vec2(b
, x_out
, y_out
);
778 unreachable("Invalid MSAA layout");
783 * Emit code to compensate for the difference between MSAA and non-MSAA
786 * This code modifies the X and Y coordinates according to the formula:
788 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
790 * (See brw_blorp_blit_program).
792 static inline nir_ssa_def
*
793 blorp_nir_decode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
794 unsigned num_samples
, enum intel_msaa_layout layout
)
796 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
799 case INTEL_MSAA_LAYOUT_NONE
:
800 /* No translation necessary, and S should already be zero. */
801 assert(pos
->num_components
== 2);
803 case INTEL_MSAA_LAYOUT_CMS
:
804 /* We can't compensate for compressed layout since at this point in the
805 * program we don't have access to the MCS buffer.
807 unreachable("Bad layout in encode_msaa");
808 case INTEL_MSAA_LAYOUT_UMS
:
809 /* No translation necessary. */
811 case INTEL_MSAA_LAYOUT_IMS
: {
812 assert(pos
->num_components
== 2);
814 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
815 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
817 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
818 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
819 nir_ssa_def
*s_out
= nir_imm_int(b
, 0);
820 switch (num_samples
) {
823 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
824 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
825 * S = (X & 0b10) >> 1
827 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
828 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
829 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
830 * S = (Y & 0b10) | (X & 0b10) >> 1
832 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffc, -1);
833 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
834 if (num_samples
== 2) {
836 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
838 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
839 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
840 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
841 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
846 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
847 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
848 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
849 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
851 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
852 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
853 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
854 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
855 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
856 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
857 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
861 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
862 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
863 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
864 * S = (Y & 0b100) << 1 | (X & 0b100) |
865 * (Y & 0b10) | (X & 0b10) >> 1
867 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
868 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
869 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffff8, -2);
870 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
871 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x4, 1);
872 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
873 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
874 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
878 unreachable("Invalid number of samples for IMS layout");
881 return nir_vec3(b
, x_out
, y_out
, s_out
);
885 unreachable("Invalid MSAA layout");
/**
 * Count the number of trailing 1 bits in the given value.  For example:
 *
 * count_trailing_one_bits(0) == 0
 * count_trailing_one_bits(7) == 3
 * count_trailing_one_bits(11) == 2
 */
static inline int count_trailing_one_bits(unsigned value)
{
#ifdef HAVE___BUILTIN_CTZ
   return __builtin_ctz(~value);
#else
   /* value & ~(value + 1) isolates exactly the trailing run of 1 bits. */
   return _mesa_bitcount(value & ~(value + 1));
#endif
}
906 blorp_nir_manual_blend_average(nir_builder
*b
, nir_ssa_def
*pos
,
907 unsigned tex_samples
,
908 enum intel_msaa_layout tex_layout
,
909 enum brw_reg_type dst_type
)
911 /* If non-null, this is the outer-most if statement */
912 nir_if
*outer_if
= NULL
;
914 nir_variable
*color
=
915 nir_local_variable_create(b
->impl
, glsl_vec4_type(), "color");
917 nir_ssa_def
*mcs
= NULL
;
918 if (tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
919 mcs
= blorp_nir_txf_ms_mcs(b
, pos
);
921 /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
923 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
925 * This ensures that when all samples have the same value, no numerical
926 * precision is lost, since each addition operation always adds two equal
927 * values, and summing two equal floating point values does not lose
930 * We perform this computation by treating the texture_data array as a
931 * stack and performing the following operations:
933 * - push sample 0 onto stack
934 * - push sample 1 onto stack
935 * - add top two stack entries
936 * - push sample 2 onto stack
937 * - push sample 3 onto stack
938 * - add top two stack entries
939 * - add top two stack entries
940 * - divide top stack entry by 4
942 * Note that after pushing sample i onto the stack, the number of add
943 * operations we do is equal to the number of trailing 1 bits in i. This
944 * works provided the total number of samples is a power of two, which it
945 * always is for i965.
947 * For integer formats, we replace the add operations with average
948 * operations and skip the final division.
950 nir_ssa_def
*texture_data
[5];
951 unsigned stack_depth
= 0;
952 for (unsigned i
= 0; i
< tex_samples
; ++i
) {
953 assert(stack_depth
== _mesa_bitcount(i
)); /* Loop invariant */
955 /* Push sample i onto the stack */
956 assert(stack_depth
< ARRAY_SIZE(texture_data
));
958 nir_ssa_def
*ms_pos
= nir_vec3(b
, nir_channel(b
, pos
, 0),
959 nir_channel(b
, pos
, 1),
961 texture_data
[stack_depth
++] = blorp_nir_txf_ms(b
, ms_pos
, mcs
, dst_type
);
963 if (i
== 0 && tex_layout
== INTEL_MSAA_LAYOUT_CMS
) {
964 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
965 * suggests an optimization:
967 * "A simple optimization with probable large return in
968 * performance is to compare the MCS value to zero (indicating
969 * all samples are on sample slice 0), and sample only from
970 * sample slice 0 using ld2dss if MCS is zero."
972 * Note that in the case where the MCS value is zero, sampling from
973 * sample slice 0 using ld2dss and sampling from sample 0 using
974 * ld2dms are equivalent (since all samples are on sample slice 0).
975 * Since we have already sampled from sample 0, all we need to do is
976 * skip the remaining fetches and averaging if MCS is zero.
978 nir_ssa_def
*mcs_zero
=
979 nir_ieq(b
, nir_channel(b
, mcs
, 0), nir_imm_int(b
, 0));
980 if (tex_samples
== 16) {
981 mcs_zero
= nir_iand(b
, mcs_zero
,
982 nir_ieq(b
, nir_channel(b
, mcs
, 1), nir_imm_int(b
, 0)));
985 nir_if
*if_stmt
= nir_if_create(b
->shader
);
986 if_stmt
->condition
= nir_src_for_ssa(mcs_zero
);
987 nir_cf_node_insert(b
->cursor
, &if_stmt
->cf_node
);
989 b
->cursor
= nir_after_cf_list(&if_stmt
->then_list
);
990 nir_store_var(b
, color
, texture_data
[0], 0xf);
992 b
->cursor
= nir_after_cf_list(&if_stmt
->else_list
);
996 for (int j
= 0; j
< count_trailing_one_bits(i
); j
++) {
997 assert(stack_depth
>= 2);
1000 assert(dst_type
== BRW_REGISTER_TYPE_F
);
1001 texture_data
[stack_depth
- 1] =
1002 nir_fadd(b
, texture_data
[stack_depth
- 1],
1003 texture_data
[stack_depth
]);
1007 /* We should have just 1 sample on the stack now. */
1008 assert(stack_depth
== 1);
1010 texture_data
[0] = nir_fmul(b
, texture_data
[0],
1011 nir_imm_float(b
, 1.0 / tex_samples
));
1013 nir_store_var(b
, color
, texture_data
[0], 0xf);
1016 b
->cursor
= nir_after_cf_node(&outer_if
->cf_node
);
1018 return nir_load_var(b
, color
);
1021 static inline nir_ssa_def
*
1022 nir_imm_vec2(nir_builder
*build
, float x
, float y
)
1026 memset(&v
, 0, sizeof(v
));
1030 return nir_build_imm(build
, 4, 32, v
);
1033 static nir_ssa_def
*
1034 blorp_nir_manual_blend_bilinear(nir_builder
*b
, nir_ssa_def
*pos
,
1035 unsigned tex_samples
,
1036 const brw_blorp_blit_prog_key
*key
,
1037 struct brw_blorp_blit_vars
*v
)
1039 nir_ssa_def
*pos_xy
= nir_channels(b
, pos
, 0x3);
1041 nir_ssa_def
*scale
= nir_imm_vec2(b
, key
->x_scale
, key
->y_scale
);
1043 /* Translate coordinates to lay out the samples in a rectangular grid
1044 * roughly corresponding to sample locations.
1046 pos_xy
= nir_fmul(b
, pos_xy
, scale
);
1047 /* Adjust coordinates so that integers represent pixel centers rather
1050 pos_xy
= nir_fadd(b
, pos_xy
, nir_imm_float(b
, -0.5));
1051 /* Clamp the X, Y texture coordinates to properly handle the sampling of
1052 * texels on texture edges.
1054 pos_xy
= nir_fmin(b
, nir_fmax(b
, pos_xy
, nir_imm_float(b
, 0.0)),
1055 nir_vec2(b
, nir_load_var(b
, v
->u_rect_grid_x1
),
1056 nir_load_var(b
, v
->u_rect_grid_y1
)));
1058 /* Store the fractional parts to be used as bilinear interpolation
1061 nir_ssa_def
*frac_xy
= nir_ffract(b
, pos_xy
);
1062 /* Round the float coordinates down to nearest integer */
1063 pos_xy
= nir_fdiv(b
, nir_ftrunc(b
, pos_xy
), scale
);
1065 nir_ssa_def
*tex_data
[4];
1066 for (unsigned i
= 0; i
< 4; ++i
) {
1067 float sample_off_x
= (float)(i
& 0x1) / key
->x_scale
;
1068 float sample_off_y
= (float)((i
>> 1) & 0x1) / key
->y_scale
;
1069 nir_ssa_def
*sample_off
= nir_imm_vec2(b
, sample_off_x
, sample_off_y
);
1071 nir_ssa_def
*sample_coords
= nir_fadd(b
, pos_xy
, sample_off
);
1072 nir_ssa_def
*sample_coords_int
= nir_f2i(b
, sample_coords
);
1074 /* The MCS value we fetch has to match up with the pixel that we're
1075 * sampling from. Since we sample from different pixels in each
1076 * iteration of this "for" loop, the call to mcs_fetch() should be
1077 * here inside the loop after computing the pixel coordinates.
1079 nir_ssa_def
*mcs
= NULL
;
1080 if (key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
1081 mcs
= blorp_nir_txf_ms_mcs(b
, sample_coords_int
);
1083 /* Compute sample index and map the sample index to a sample number.
1084 * Sample index layout shows the numbering of slots in a rectangular
1085 * grid of samples with in a pixel. Sample number layout shows the
1086 * rectangular grid of samples roughly corresponding to the real sample
1087 * locations with in a pixel.
1088 * In case of 4x MSAA, layout of sample indices matches the layout of
1096 * In case of 8x MSAA the two layouts don't match.
1097 * sample index layout : --------- sample number layout : ---------
1098 * | 0 | 1 | | 5 | 2 |
1099 * --------- ---------
1100 * | 2 | 3 | | 4 | 6 |
1101 * --------- ---------
1102 * | 4 | 5 | | 0 | 3 |
1103 * --------- ---------
1104 * | 6 | 7 | | 7 | 1 |
1105 * --------- ---------
1107 * Fortunately, this can be done fairly easily as:
1108 * S' = (0x17306425 >> (S * 4)) & 0xf
1110 * In the case of 16x MSAA the two layouts don't match.
1111 * Sample index layout: Sample number layout:
1112 * --------------------- ---------------------
1113 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 13 |
1114 * --------------------- ---------------------
1115 * | 4 | 5 | 6 | 7 | | 4 | 1 | 7 | 3 |
1116 * --------------------- ---------------------
1117 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 |
1118 * --------------------- ---------------------
1119 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 |
1120 * --------------------- ---------------------
1122 * This is equivalent to
1123 * S' = (0xfa9d4173c206b85e >> (S * 4)) & 0xf
1125 nir_ssa_def
*frac
= nir_ffract(b
, sample_coords
);
1126 nir_ssa_def
*sample
=
1127 nir_fdot2(b
, frac
, nir_imm_vec2(b
, key
->x_scale
,
1128 key
->x_scale
* key
->y_scale
));
1129 sample
= nir_f2i(b
, sample
);
1131 if (tex_samples
== 8) {
1132 sample
= nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0x17306425),
1133 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1134 nir_imm_int(b
, 0xf));
1135 } else if (tex_samples
== 16) {
1136 nir_ssa_def
*sample_low
=
1137 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xc206b85e),
1138 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1139 nir_imm_int(b
, 0xf));
1140 nir_ssa_def
*sample_high
=
1141 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xfa9d4173),
1142 nir_ishl(b
, nir_iadd(b
, sample
,
1143 nir_imm_int(b
, -8)),
1144 nir_imm_int(b
, 2))),
1145 nir_imm_int(b
, 0xf));
1147 sample
= nir_bcsel(b
, nir_ilt(b
, sample
, nir_imm_int(b
, 8)),
1148 sample_low
, sample_high
);
1150 nir_ssa_def
*pos_ms
= nir_vec3(b
, nir_channel(b
, sample_coords_int
, 0),
1151 nir_channel(b
, sample_coords_int
, 1),
1153 tex_data
[i
] = blorp_nir_txf_ms(b
, pos_ms
, mcs
, key
->texture_data_type
);
1156 nir_ssa_def
*frac_x
= nir_channel(b
, frac_xy
, 0);
1157 nir_ssa_def
*frac_y
= nir_channel(b
, frac_xy
, 1);
1158 return nir_flrp(b
, nir_flrp(b
, tex_data
[0], tex_data
[1], frac_x
),
1159 nir_flrp(b
, tex_data
[2], tex_data
[3], frac_x
),
1164 * Generator for WM programs used in BLORP blits.
1166 * The bulk of the work done by the WM program is to wrap and unwrap the
1167 * coordinate transformations used by the hardware to store surfaces in
1168 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
1169 * sample index for a multisampled surface) to a memory offset by the
1170 * following formulas:
1172 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
1173 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
1175 * For a single-sampled surface, or for a multisampled surface using
1176 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
1179 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1180 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1181 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1182 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1184 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1185 * embeds the sample number into bit 1 of the X and Y coordinates:
1187 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1188 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1189 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
1190 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1191 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
1192 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1193 * S = (Y & 0b10) | (X & 0b10) >> 1
1195 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1196 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
1199 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1200 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
1201 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1202 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1203 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
1204 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1205 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1207 * For X tiling, tile() combines together the low-order bits of the X and Y
1208 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
1209 * bytes wide and 8 rows high:
1211 * tile(x_tiled, X, Y, S) = A
1212 * where A = tile_num << 12 | offset
1213 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
1214 * offset = (Y' & 0b111) << 9
1215 * | (X & 0b111111111)
1217 * Y' = Y + S * qpitch
1218 * detile(x_tiled, A) = (X, Y, S)
1219 * where X = X' / cpp
1222 * Y' = (tile_num / tile_pitch) << 3
1223 * | (A & 0b111000000000) >> 9
1224 * X' = (tile_num % tile_pitch) << 9
1225 * | (A & 0b111111111)
1227 * (In all tiling formulas, cpp is the number of bytes occupied by a single
1228 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
1229 * to fill the width of the surface, and qpitch is the spacing (in rows)
1230 * between array slices).
1232 * For Y tiling, tile() combines together the low-order bits of the X and Y
1233 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
1234 * bytes wide and 32 rows high:
1236 * tile(y_tiled, X, Y, S) = A
1237 * where A = tile_num << 12 | offset
1238 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
1239 * offset = (X' & 0b1110000) << 5
1240 * | (Y' & 0b11111) << 4
1243 * Y' = Y + S * qpitch
1244 * detile(y_tiled, A) = (X, Y, S)
1245 * where X = X' / cpp
1248 * Y' = (tile_num / tile_pitch) << 5
1249 * | (A & 0b111110000) >> 4
1250 * X' = (tile_num % tile_pitch) << 7
1251 * | (A & 0b111000000000) >> 5
1254 * For W tiling, tile() combines together the low-order bits of the X and Y
1255 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
1256 * bytes wide and 64 rows high (note that W tiling is only used for stencil
1257 * buffers, which always have cpp = 1 and S=0):
1259 * tile(w_tiled, X, Y, S) = A
1260 * where A = tile_num << 12 | offset
1261 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
1262 * offset = (X' & 0b111000) << 6
1263 * | (Y' & 0b111100) << 3
1264 * | (X' & 0b100) << 2
1265 * | (Y' & 0b10) << 2
1266 * | (X' & 0b10) << 1
1270 * Y' = Y + S * qpitch
1271 * detile(w_tiled, A) = (X, Y, S)
1272 * where X = X' / cpp = X'
1273 * Y = Y' % qpitch = Y'
1274 * S = Y / qpitch = 0
1275 * Y' = (tile_num / tile_pitch) << 6
1276 * | (A & 0b111100000) >> 3
1277 * | (A & 0b1000) >> 2
1279 * X' = (tile_num % tile_pitch) << 6
1280 * | (A & 0b111000000000) >> 6
1281 * | (A & 0b10000) >> 2
1282 * | (A & 0b100) >> 1
1285 * Finally, for a non-tiled surface, tile() simply combines together the X and
1286 * Y coordinates in the natural way:
1288 * tile(untiled, X, Y, S) = A
1289 * where A = Y * pitch + X'
1291 * Y' = Y + S * qpitch
1292 * detile(untiled, A) = (X, Y, S)
1293 * where X = X' / cpp
1299 * (In these formulas, pitch is the number of bytes occupied by a single row
1303 brw_blorp_build_nir_shader(struct brw_context
*brw
,
1304 const brw_blorp_blit_prog_key
*key
)
1306 nir_ssa_def
*src_pos
, *dst_pos
, *color
;
1309 if (key
->dst_tiled_w
&& key
->rt_samples
> 0) {
1310 /* If the destination image is W tiled and multisampled, then the thread
1311 * must be dispatched once per sample, not once per pixel. This is
1312 * necessary because after conversion between W and Y tiling, there's no
1313 * guarantee that all samples corresponding to a single pixel will still
1316 assert(key
->persample_msaa_dispatch
);
1320 /* We are blending, which means we won't have an opportunity to
1321 * translate the tiling and sample count for the texture surface. So
1322 * the surface state for the texture must be configured with the correct
1323 * tiling and sample count.
1325 assert(!key
->src_tiled_w
);
1326 assert(key
->tex_samples
== key
->src_samples
);
1327 assert(key
->tex_layout
== key
->src_layout
);
1328 assert(key
->tex_samples
> 0);
1331 if (key
->persample_msaa_dispatch
) {
1332 /* It only makes sense to do persample dispatch if the render target is
1333 * configured as multisampled.
1335 assert(key
->rt_samples
> 0);
1338 /* Make sure layout is consistent with sample count */
1339 assert((key
->tex_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1340 (key
->tex_samples
== 0));
1341 assert((key
->rt_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1342 (key
->rt_samples
== 0));
1343 assert((key
->src_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1344 (key
->src_samples
== 0));
1345 assert((key
->dst_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1346 (key
->dst_samples
== 0));
1349 nir_builder_init_simple_shader(&b
, NULL
, MESA_SHADER_FRAGMENT
, NULL
);
1351 struct brw_blorp_blit_vars v
;
1352 brw_blorp_blit_vars_init(&b
, &v
, key
);
1354 dst_pos
= blorp_blit_get_frag_coords(&b
, key
, &v
);
1356 /* Render target and texture hardware don't support W tiling until Gen8. */
1357 const bool rt_tiled_w
= false;
1358 const bool tex_tiled_w
= brw
->gen
>= 8 && key
->src_tiled_w
;
1360 /* The address that data will be written to is determined by the
1361 * coordinates supplied to the WM thread and the tiling and sample count of
1362 * the render target, according to the formula:
1364 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
1366 * If the actual tiling and sample count of the destination surface are not
1367 * the same as the configuration of the render target, then these
1368 * coordinates are wrong and we have to adjust them to compensate for the
1371 if (rt_tiled_w
!= key
->dst_tiled_w
||
1372 key
->rt_samples
!= key
->dst_samples
||
1373 key
->rt_layout
!= key
->dst_layout
) {
1374 dst_pos
= blorp_nir_encode_msaa(&b
, dst_pos
, key
->rt_samples
,
1376 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1377 if (rt_tiled_w
!= key
->dst_tiled_w
)
1378 dst_pos
= blorp_nir_retile_y_to_w(&b
, dst_pos
);
1379 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1380 dst_pos
= blorp_nir_decode_msaa(&b
, dst_pos
, key
->dst_samples
,
1384 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
1386 * That is: X, Y and S now contain the true coordinates and sample index of
1387 * the data that the WM thread should output.
1389 * If we need to kill pixels that are outside the destination rectangle,
1390 * now is the time to do it.
1393 blorp_nir_discard_if_outside_rect(&b
, dst_pos
, &v
);
1395 src_pos
= blorp_blit_apply_transform(&b
, nir_i2f(&b
, dst_pos
), &v
);
1396 if (dst_pos
->num_components
== 3) {
1397 /* The sample coordinate is an integer that we want left alone but
1398 * blorp_blit_apply_transform() blindly applies the transform to all
1399 * three coordinates. Grab the original sample index.
1401 src_pos
= nir_vec3(&b
, nir_channel(&b
, src_pos
, 0),
1402 nir_channel(&b
, src_pos
, 1),
1403 nir_channel(&b
, dst_pos
, 2));
1406 /* If the source image is not multisampled, then we want to fetch sample
1407 * number 0, because that's the only sample there is.
1409 if (key
->src_samples
== 0)
1410 src_pos
= nir_channels(&b
, src_pos
, 0x3);
1412 /* X, Y, and S are now the coordinates of the pixel in the source image
1413 * that we want to texture from. Exception: if we are blending, then S is
1414 * irrelevant, because we are going to fetch all samples.
1416 if (key
->blend
&& !key
->blit_scaled
) {
1417 /* Resolves (effecively) use texelFetch, so we need integers and we
1418 * don't care about the sample index if we got one.
1420 src_pos
= nir_f2i(&b
, nir_channels(&b
, src_pos
, 0x3));
1422 if (brw
->gen
== 6) {
1423 /* Because gen6 only supports 4x interleved MSAA, we can do all the
1424 * blending we need with a single linear-interpolated texture lookup
1425 * at the center of the sample. The texture coordinates to be odd
1426 * integers so that they correspond to the center of a 2x2 block
1427 * representing the four samples that maxe up a pixel. So we need
1428 * to multiply our X and Y coordinates each by 2 and then add 1.
1430 src_pos
= nir_ishl(&b
, src_pos
, nir_imm_int(&b
, 1));
1431 src_pos
= nir_iadd(&b
, src_pos
, nir_imm_int(&b
, 1));
1432 src_pos
= nir_i2f(&b
, src_pos
);
1433 color
= blorp_nir_tex(&b
, src_pos
, key
->texture_data_type
);
1435 /* Gen7+ hardware doesn't automaticaly blend. */
1436 color
= blorp_nir_manual_blend_average(&b
, src_pos
, key
->src_samples
,
1438 key
->texture_data_type
);
1440 } else if (key
->blend
&& key
->blit_scaled
) {
1441 color
= blorp_nir_manual_blend_bilinear(&b
, src_pos
, key
->src_samples
, key
, &v
);
1443 if (key
->bilinear_filter
) {
1444 color
= blorp_nir_tex(&b
, src_pos
, key
->texture_data_type
);
1446 /* We're going to use texelFetch, so we need integers */
1447 if (src_pos
->num_components
== 2) {
1448 src_pos
= nir_f2i(&b
, src_pos
);
1450 assert(src_pos
->num_components
== 3);
1451 src_pos
= nir_vec3(&b
, nir_channel(&b
, nir_f2i(&b
, src_pos
), 0),
1452 nir_channel(&b
, nir_f2i(&b
, src_pos
), 1),
1453 nir_channel(&b
, src_pos
, 2));
1456 /* We aren't blending, which means we just want to fetch a single
1457 * sample from the source surface. The address that we want to fetch
1458 * from is related to the X, Y and S values according to the formula:
1460 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
1462 * If the actual tiling and sample count of the source surface are
1463 * not the same as the configuration of the texture, then we need to
1464 * adjust the coordinates to compensate for the difference.
1466 if (tex_tiled_w
!= key
->src_tiled_w
||
1467 key
->tex_samples
!= key
->src_samples
||
1468 key
->tex_layout
!= key
->src_layout
) {
1469 src_pos
= blorp_nir_encode_msaa(&b
, src_pos
, key
->src_samples
,
1471 /* Now (X, Y, S) = detile(src_tiling, offset) */
1472 if (tex_tiled_w
!= key
->src_tiled_w
)
1473 src_pos
= blorp_nir_retile_w_to_y(&b
, src_pos
);
1474 /* Now (X, Y, S) = detile(tex_tiling, offset) */
1475 src_pos
= blorp_nir_decode_msaa(&b
, src_pos
, key
->tex_samples
,
1479 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
1481 * In other words: X, Y, and S now contain values which, when passed to
1482 * the texturing unit, will cause data to be read from the correct
1483 * memory location. So we can fetch the texel now.
1485 if (key
->src_samples
== 0) {
1486 color
= blorp_nir_txf(&b
, &v
, src_pos
, key
->texture_data_type
);
1488 nir_ssa_def
*mcs
= NULL
;
1489 if (key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
1490 mcs
= blorp_nir_txf_ms_mcs(&b
, src_pos
);
1492 color
= blorp_nir_txf_ms(&b
, src_pos
, mcs
, key
->texture_data_type
);
1497 nir_store_var(&b
, v
.color_out
, color
, 0xf);
1503 brw_blorp_get_blit_kernel(struct brw_context
*brw
,
1504 struct brw_blorp_params
*params
,
1505 const struct brw_blorp_blit_prog_key
*prog_key
)
1507 if (brw_search_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1508 prog_key
, sizeof(*prog_key
),
1509 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
))
1512 const unsigned *program
;
1513 unsigned program_size
;
1514 struct brw_blorp_prog_data prog_data
;
1516 /* Try and compile with NIR first. If that fails, fall back to the old
1517 * method of building shaders manually.
1519 nir_shader
*nir
= brw_blorp_build_nir_shader(brw
, prog_key
);
1520 struct brw_wm_prog_key wm_key
;
1521 brw_blorp_init_wm_prog_key(&wm_key
);
1522 wm_key
.tex
.compressed_multisample_layout_mask
=
1523 prog_key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
;
1524 wm_key
.tex
.msaa_16
= prog_key
->tex_samples
== 16;
1525 wm_key
.multisample_fbo
= prog_key
->rt_samples
> 1;
1527 program
= brw_blorp_compile_nir_shader(brw
, nir
, &wm_key
, false,
1528 &prog_data
, &program_size
);
1530 brw_upload_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1531 prog_key
, sizeof(*prog_key
),
1532 program
, program_size
,
1533 &prog_data
, sizeof(prog_data
),
1534 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
);
1538 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform
*xform
,
1539 GLfloat src0
, GLfloat src1
,
1540 GLfloat dst0
, GLfloat dst1
,
1543 float scale
= (src1
- src0
) / (dst1
- dst0
);
1545 /* When not mirroring a coordinate (say, X), we need:
1546 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1548 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1550 * blorp program uses "round toward zero" to convert the
1551 * transformed floating point coordinates to integer coordinates,
1552 * whereas the behaviour we actually want is "round to nearest",
1553 * so 0.5 provides the necessary correction.
1555 xform
->multiplier
= scale
;
1556 xform
->offset
= src0
+ (-dst0
+ 0.5f
) * scale
;
1558 /* When mirroring X we need:
1559 * src_x - src_x0 = dst_x1 - dst_x - 0.5
1561 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
1563 xform
->multiplier
= -scale
;
1564 xform
->offset
= src0
+ (dst1
- 0.5f
) * scale
;
1570 * Determine which MSAA layout the GPU pipeline should be configured for,
1571 * based on the chip generation, the number of samples, and the true layout of
1572 * the image in memory.
1574 inline intel_msaa_layout
1575 compute_msaa_layout_for_pipeline(struct brw_context
*brw
, unsigned num_samples
,
1576 intel_msaa_layout true_layout
)
1578 if (num_samples
<= 1) {
1579 /* Layout is used to determine if ld2dms is needed for sampling. In
1580 * single sampled case normal ld is enough avoiding also the need to
1581 * fetch mcs. Therefore simply set the layout to none.
1583 if (brw
->gen
>= 9 && true_layout
== INTEL_MSAA_LAYOUT_CMS
) {
1584 return INTEL_MSAA_LAYOUT_NONE
;
1587 /* When configuring the GPU for non-MSAA, we can still accommodate IMS
1588 * format buffers, by transforming coordinates appropriately.
1590 assert(true_layout
== INTEL_MSAA_LAYOUT_NONE
||
1591 true_layout
== INTEL_MSAA_LAYOUT_IMS
);
1592 return INTEL_MSAA_LAYOUT_NONE
;
1594 assert(true_layout
!= INTEL_MSAA_LAYOUT_NONE
);
1597 /* Prior to Gen7, all MSAA surfaces use IMS layout. */
1598 if (brw
->gen
== 6) {
1599 assert(true_layout
== INTEL_MSAA_LAYOUT_IMS
);
1607 * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
1608 * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
1609 * the physical layer holding sample 0. So, for example, if
1610 * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
1614 brw_blorp_blit_miptrees(struct brw_context
*brw
,
1615 struct intel_mipmap_tree
*src_mt
,
1616 unsigned src_level
, unsigned src_layer
,
1617 mesa_format src_format
, int src_swizzle
,
1618 struct intel_mipmap_tree
*dst_mt
,
1619 unsigned dst_level
, unsigned dst_layer
,
1620 mesa_format dst_format
,
1621 float src_x0
, float src_y0
,
1622 float src_x1
, float src_y1
,
1623 float dst_x0
, float dst_y0
,
1624 float dst_x1
, float dst_y1
,
1625 GLenum filter
, bool mirror_x
, bool mirror_y
,
1626 bool decode_srgb
, bool encode_srgb
)
1628 /* Get ready to blit. This includes depth resolving the src and dst
1629 * buffers if necessary. Note: it's not necessary to do a color resolve on
1630 * the destination buffer because we use the standard render path to render
1631 * to destination color buffers, and the standard render path is
1634 intel_miptree_resolve_color(brw
, src_mt
, INTEL_MIPTREE_IGNORE_CCS_E
);
1635 intel_miptree_slice_resolve_depth(brw
, src_mt
, src_level
, src_layer
);
1636 intel_miptree_slice_resolve_depth(brw
, dst_mt
, dst_level
, dst_layer
);
1638 intel_miptree_prepare_mcs(brw
, dst_mt
);
1640 DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
1641 "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
1643 src_mt
->num_samples
, _mesa_get_format_name(src_mt
->format
), src_mt
,
1644 src_level
, src_layer
, src_x0
, src_y0
, src_x1
, src_y1
,
1645 dst_mt
->num_samples
, _mesa_get_format_name(dst_mt
->format
), dst_mt
,
1646 dst_level
, dst_layer
, dst_x0
, dst_y0
, dst_x1
, dst_y1
,
1647 mirror_x
, mirror_y
);
1649 if (!decode_srgb
&& _mesa_get_format_color_encoding(src_format
) == GL_SRGB
)
1650 src_format
= _mesa_get_srgb_format_linear(src_format
);
1652 if (!encode_srgb
&& _mesa_get_format_color_encoding(dst_format
) == GL_SRGB
)
1653 dst_format
= _mesa_get_srgb_format_linear(dst_format
);
1655 struct brw_blorp_params params
;
1656 brw_blorp_params_init(¶ms
);
1658 brw_blorp_surface_info_init(brw
, ¶ms
.src
, src_mt
, src_level
,
1659 src_layer
, src_format
, false);
1660 brw_blorp_surface_info_init(brw
, ¶ms
.dst
, dst_mt
, dst_level
,
1661 dst_layer
, dst_format
, true);
1663 /* Even though we do multisample resolves at the time of the blit, OpenGL
1664 * specification defines them as if they happen at the time of rendering,
1665 * which means that the type of averaging we do during the resolve should
1666 * only depend on the source format; the destination format should be
1667 * ignored. But, specification doesn't seem to be strict about it.
1669 * It has been observed that mulitisample resolves produce slightly better
1670 * looking images when averaging is done using destination format. NVIDIA's
1671 * proprietary OpenGL driver also follow this approach. So, we choose to
1672 * follow it in our driver.
1674 * When multisampling, if the source and destination formats are equal
1675 * (aside from the color space), we choose to blit in sRGB space to get
1676 * this higher quality image.
1678 if (params
.src
.num_samples
> 1 &&
1679 _mesa_get_format_color_encoding(dst_mt
->format
) == GL_SRGB
&&
1680 _mesa_get_srgb_format_linear(src_mt
->format
) ==
1681 _mesa_get_srgb_format_linear(dst_mt
->format
)) {
1682 assert(brw
->format_supported_as_render_target
[dst_mt
->format
]);
1683 params
.dst
.brw_surfaceformat
= brw
->render_target_format
[dst_mt
->format
];
1684 params
.src
.brw_surfaceformat
= brw_format_for_mesa_format(dst_mt
->format
);
1687 /* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
1688 * texture, the above code configures the source format for L32_FLOAT or
1689 * I32_FLOAT, and the destination format for R32_FLOAT. On Sandy Bridge,
1690 * the SAMPLE message appears to handle multisampled L32_FLOAT and
1691 * I32_FLOAT textures incorrectly, resulting in blocky artifacts. So work
1692 * around the problem by using a source format of R32_FLOAT. This
1693 * shouldn't affect rendering correctness, since the destination format is
1694 * R32_FLOAT, so only the contents of the red channel matters.
1696 if (brw
->gen
== 6 &&
1697 params
.src
.num_samples
> 1 && params
.dst
.num_samples
<= 1 &&
1698 src_mt
->format
== dst_mt
->format
&&
1699 params
.dst
.brw_surfaceformat
== BRW_SURFACEFORMAT_R32_FLOAT
) {
1700 params
.src
.brw_surfaceformat
= params
.dst
.brw_surfaceformat
;
1703 struct brw_blorp_blit_prog_key wm_prog_key
;
1704 memset(&wm_prog_key
, 0, sizeof(wm_prog_key
));
1706 /* texture_data_type indicates the register type that should be used to
1707 * manipulate texture data.
1709 switch (_mesa_get_format_datatype(src_mt
->format
)) {
1710 case GL_UNSIGNED_NORMALIZED
:
1711 case GL_SIGNED_NORMALIZED
:
1713 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1715 case GL_UNSIGNED_INT
:
1716 if (src_mt
->format
== MESA_FORMAT_S_UINT8
) {
1717 /* We process stencil as though it's an unsigned normalized color */
1718 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1720 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_UD
;
1724 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_D
;
1727 unreachable("Unrecognized blorp format");
1731 /* Gen7's rendering hardware only supports the IMS layout for depth and
1732 * stencil render targets. Blorp always maps its destination surface as
1733 * a color render target (even if it's actually a depth or stencil
1734 * buffer). So if the destination is IMS, we'll have to map it as a
1735 * single-sampled texture and interleave the samples ourselves.
1737 if (dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_IMS
)
1738 params
.dst
.num_samples
= 0;
1741 if (params
.dst
.map_stencil_as_y_tiled
&& params
.dst
.num_samples
> 1) {
1742 /* If the destination surface is a W-tiled multisampled stencil buffer
1743 * that we're mapping as Y tiled, then we need to arrange for the WM
1744 * program to run once per sample rather than once per pixel, because
1745 * the memory layout of related samples doesn't match between W and Y
1748 wm_prog_key
.persample_msaa_dispatch
= true;
1751 if (params
.src
.num_samples
> 0 && params
.dst
.num_samples
> 1) {
1752 /* We are blitting from a multisample buffer to a multisample buffer, so
1753 * we must preserve samples within a pixel. This means we have to
1754 * arrange for the WM program to run once per sample rather than once
1757 wm_prog_key
.persample_msaa_dispatch
= true;
1760 /* Scaled blitting or not. */
1761 wm_prog_key
.blit_scaled
=
1762 ((dst_x1
- dst_x0
) == (src_x1
- src_x0
) &&
1763 (dst_y1
- dst_y0
) == (src_y1
- src_y0
)) ? false : true;
1765 /* Scaling factors used for bilinear filtering in multisample scaled
1768 wm_prog_key
.x_scale
= 2.0f
;
1769 wm_prog_key
.y_scale
= src_mt
->num_samples
/ 2.0f
;
1771 if (filter
== GL_LINEAR
&&
1772 params
.src
.num_samples
<= 1 && params
.dst
.num_samples
<= 1)
1773 wm_prog_key
.bilinear_filter
= true;
1775 GLenum base_format
= _mesa_get_format_base_format(src_mt
->format
);
1776 if (base_format
!= GL_DEPTH_COMPONENT
&& /* TODO: what about depth/stencil? */
1777 base_format
!= GL_STENCIL_INDEX
&&
1778 !_mesa_is_format_integer(src_mt
->format
) &&
1779 src_mt
->num_samples
> 1 && dst_mt
->num_samples
<= 1) {
1780 /* We are downsampling a non-integer color buffer, so blend.
1782 * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1784 * "If the source formats are integer types or stencil values, a
1785 * single sample's value is selected for each pixel."
1787 * This implies we should not blend in that case.
1789 wm_prog_key
.blend
= true;
1792 /* src_samples and dst_samples are the true sample counts */
1793 wm_prog_key
.src_samples
= src_mt
->num_samples
;
1794 wm_prog_key
.dst_samples
= dst_mt
->num_samples
;
1796 /* tex_samples and rt_samples are the sample counts that are set up in
1799 wm_prog_key
.tex_samples
= params
.src
.num_samples
;
1800 wm_prog_key
.rt_samples
= params
.dst
.num_samples
;
1802 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
1803 * use to access the source and destination surfaces.
1805 wm_prog_key
.tex_layout
=
1806 compute_msaa_layout_for_pipeline(brw
, params
.src
.num_samples
,
1807 params
.src
.msaa_layout
);
1808 wm_prog_key
.rt_layout
=
1809 compute_msaa_layout_for_pipeline(brw
, params
.dst
.num_samples
,
1810 params
.dst
.msaa_layout
);
1812 /* src_layout and dst_layout indicate the true MSAA layout used by src and
1815 wm_prog_key
.src_layout
= src_mt
->msaa_layout
;
1816 wm_prog_key
.dst_layout
= dst_mt
->msaa_layout
;
1818 /* On gen9+ compressed single sampled buffers carry the same layout type as
1819 * multisampled. The difference is that they can be sampled using normal
1820 * ld message and as render target behave just like non-compressed surface
1821 * from compiler point of view. Therefore override the type in the program
1824 if (brw
->gen
>= 9 && params
.src
.num_samples
<= 1 &&
1825 src_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_CMS
)
1826 wm_prog_key
.src_layout
= INTEL_MSAA_LAYOUT_NONE
;
1827 if (brw
->gen
>= 9 && params
.dst
.num_samples
<= 1 &&
1828 dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_CMS
)
1829 wm_prog_key
.dst_layout
= INTEL_MSAA_LAYOUT_NONE
;
1831 wm_prog_key
.src_tiled_w
= params
.src
.map_stencil_as_y_tiled
;
1832 wm_prog_key
.dst_tiled_w
= params
.dst
.map_stencil_as_y_tiled
;
1833 /* Round floating point values to nearest integer to avoid "off by one texel"
1834 * kind of errors when blitting.
1836 params
.x0
= params
.wm_push_consts
.dst_x0
= roundf(dst_x0
);
1837 params
.y0
= params
.wm_push_consts
.dst_y0
= roundf(dst_y0
);
1838 params
.x1
= params
.wm_push_consts
.dst_x1
= roundf(dst_x1
);
1839 params
.y1
= params
.wm_push_consts
.dst_y1
= roundf(dst_y1
);
1840 params
.wm_push_consts
.rect_grid_x1
=
1841 minify(src_mt
->logical_width0
, src_level
) * wm_prog_key
.x_scale
- 1.0f
;
1842 params
.wm_push_consts
.rect_grid_y1
=
1843 minify(src_mt
->logical_height0
, src_level
) * wm_prog_key
.y_scale
- 1.0f
;
1845 brw_blorp_setup_coord_transform(¶ms
.wm_push_consts
.x_transform
,
1846 src_x0
, src_x1
, dst_x0
, dst_x1
, mirror_x
);
1847 brw_blorp_setup_coord_transform(¶ms
.wm_push_consts
.y_transform
,
1848 src_y0
, src_y1
, dst_y0
, dst_y1
, mirror_y
);
1850 params
.wm_push_consts
.src_z
=
1851 params
.src
.mt
->target
== GL_TEXTURE_3D
? params
.src
.layer
: 0;
1853 if (params
.dst
.num_samples
<= 1 && dst_mt
->num_samples
> 1) {
1854 /* We must expand the rectangle we send through the rendering pipeline,
1855 * to account for the fact that we are mapping the destination region as
1856 * single-sampled when it is in fact multisampled. We must also align
1857 * it to a multiple of the multisampling pattern, because the
1858 * differences between multisampled and single-sampled surface formats
1859 * will mean that pixels are scrambled within the multisampling pattern.
1860 * TODO: what if this makes the coordinates too large?
1862 * Note: this only works if the destination surface uses the IMS layout.
1863 * If it's UMS, then we have no choice but to set up the rendering
1864 * pipeline as multisampled.
1866 assert(dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_IMS
);
1867 switch (dst_mt
->num_samples
) {
1869 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1870 params
.y0
= ROUND_DOWN_TO(params
.y0
, 4);
1871 params
.x1
= ALIGN(params
.x1
* 2, 4);
1872 params
.y1
= ALIGN(params
.y1
, 4);
1875 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1876 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1877 params
.x1
= ALIGN(params
.x1
* 2, 4);
1878 params
.y1
= ALIGN(params
.y1
* 2, 4);
1881 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1882 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1883 params
.x1
= ALIGN(params
.x1
* 4, 8);
1884 params
.y1
= ALIGN(params
.y1
* 2, 4);
1887 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1888 params
.y0
= ROUND_DOWN_TO(params
.y0
* 4, 8);
1889 params
.x1
= ALIGN(params
.x1
* 4, 8);
1890 params
.y1
= ALIGN(params
.y1
* 4, 8);
1893 unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
1895 wm_prog_key
.use_kill
= true;
1898 if (params
.dst
.map_stencil_as_y_tiled
) {
1899 /* We must modify the rectangle we send through the rendering pipeline
1900 * (and the size and x/y offset of the destination surface), to account
1901 * for the fact that we are mapping it as Y-tiled when it is in fact
1904 * Both Y tiling and W tiling can be understood as organizations of
1905 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1906 * is different, but the layout of the 32-byte sub-tiles within the 4k
1907 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1908 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide
1909 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1911 * Therefore, to account for the layout differences within the 32-byte
1912 * sub-tiles, we must expand the rectangle so the X coordinates of its
1913 * edges are multiples of 8 (the W sub-tile width), and its Y
1914 * coordinates of its edges are multiples of 4 (the W sub-tile height).
1915 * Then we need to scale the X and Y coordinates of the rectangle to
1916 * account for the differences in aspect ratio between the Y and W
1917 * sub-tiles. We need to modify the layer width and height similarly.
1919 * A correction needs to be applied when MSAA is in use: since
1920 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1921 * we need to align the Y coordinates to multiples of 8, so that when
1922 * they are divided by two they are still multiples of 4.
1924 * Note: Since the x/y offset of the surface will be applied using the
1925 * SURFACE_STATE command packet, it will be invisible to the swizzling
1926 * code in the shader; therefore it needs to be in a multiple of the
1927 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
1928 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1929 * buffer), and the miplevel alignment used for stencil buffers is 8
1930 * pixels horizontally and either 4 or 8 pixels vertically (see
1931 * intel_horizontal_texture_alignment_unit() and
1932 * intel_vertical_texture_alignment_unit()).
1934 * Note: Also, since the SURFACE_STATE command packet can only apply
1935 * offsets that are multiples of 4 pixels horizontally and 2 pixels
1936 * vertically, it is important that the offsets will be multiples of
1937 * these sizes after they are converted into Y-tiled coordinates.
1938 * Fortunately they will be, since we know from above that the offsets
1939 * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1940 * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1942 * TODO: what if this makes the coordinates (or the texture size) too
1945 const unsigned x_align
= 8, y_align
= params
.dst
.num_samples
!= 0 ? 8 : 4;
1946 params
.x0
= ROUND_DOWN_TO(params
.x0
, x_align
) * 2;
1947 params
.y0
= ROUND_DOWN_TO(params
.y0
, y_align
) / 2;
1948 params
.x1
= ALIGN(params
.x1
, x_align
) * 2;
1949 params
.y1
= ALIGN(params
.y1
, y_align
) / 2;
1950 params
.dst
.width
= ALIGN(params
.dst
.width
, x_align
) * 2;
1951 params
.dst
.height
= ALIGN(params
.dst
.height
, y_align
) / 2;
1952 params
.dst
.x_offset
*= 2;
1953 params
.dst
.y_offset
/= 2;
1954 wm_prog_key
.use_kill
= true;
1957 if (params
.src
.map_stencil_as_y_tiled
) {
1958 /* We must modify the size and x/y offset of the source surface to
1959 * account for the fact that we are mapping it as Y-tiled when it is in
1962 * See the comments above concerning x/y offset alignment for the
1963 * destination surface.
1965 * TODO: what if this makes the texture size too large?
1967 const unsigned x_align
= 8, y_align
= params
.src
.num_samples
!= 0 ? 8 : 4;
1968 params
.src
.width
= ALIGN(params
.src
.width
, x_align
) * 2;
1969 params
.src
.height
= ALIGN(params
.src
.height
, y_align
) / 2;
1970 params
.src
.x_offset
*= 2;
1971 params
.src
.y_offset
/= 2;
1974 brw_blorp_get_blit_kernel(brw
, ¶ms
, &wm_prog_key
);
1976 params
.src
.swizzle
= src_swizzle
;
1978 brw_blorp_exec(brw
, ¶ms
);
1980 intel_miptree_slice_set_needs_hiz_resolve(dst_mt
, dst_level
, dst_layer
);
1982 if (intel_miptree_is_lossless_compressed(brw
, dst_mt
))
1983 dst_mt
->fast_clear_state
= INTEL_FAST_CLEAR_STATE_UNRESOLVED
;