2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 #include "main/context.h"
25 #include "main/teximage.h"
26 #include "main/fbobject.h"
28 #include "compiler/nir/nir_builder.h"
30 #include "intel_fbo.h"
32 #include "brw_blorp.h"
33 #include "brw_context.h"
34 #include "brw_state.h"
35 #include "brw_meta_util.h"
37 #define FILE_DEBUG_FLAG DEBUG_BLORP
39 static struct intel_mipmap_tree
*
40 find_miptree(GLbitfield buffer_bit
, struct intel_renderbuffer
*irb
)
42 struct intel_mipmap_tree
*mt
= irb
->mt
;
43 if (buffer_bit
== GL_STENCIL_BUFFER_BIT
&& mt
->stencil_mt
)
49 blorp_get_texture_swizzle(const struct intel_renderbuffer
*irb
)
51 return irb
->Base
.Base
._BaseFormat
== GL_RGB
?
52 MAKE_SWIZZLE4(SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_ONE
) :
57 do_blorp_blit(struct brw_context
*brw
, GLbitfield buffer_bit
,
58 struct intel_renderbuffer
*src_irb
, mesa_format src_format
,
59 struct intel_renderbuffer
*dst_irb
, mesa_format dst_format
,
60 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
61 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
62 GLenum filter
, bool mirror_x
, bool mirror_y
)
64 /* Find source/dst miptrees */
65 struct intel_mipmap_tree
*src_mt
= find_miptree(buffer_bit
, src_irb
);
66 struct intel_mipmap_tree
*dst_mt
= find_miptree(buffer_bit
, dst_irb
);
68 const bool es3
= _mesa_is_gles3(&brw
->ctx
);
70 brw_blorp_blit_miptrees(brw
,
71 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
72 src_format
, blorp_get_texture_swizzle(src_irb
),
73 dst_mt
, dst_irb
->mt_level
, dst_irb
->mt_layer
,
75 srcX0
, srcY0
, srcX1
, srcY1
,
76 dstX0
, dstY0
, dstX1
, dstY1
,
77 filter
, mirror_x
, mirror_y
,
80 dst_irb
->need_downsample
= true;
84 try_blorp_blit(struct brw_context
*brw
,
85 const struct gl_framebuffer
*read_fb
,
86 const struct gl_framebuffer
*draw_fb
,
87 GLfloat srcX0
, GLfloat srcY0
, GLfloat srcX1
, GLfloat srcY1
,
88 GLfloat dstX0
, GLfloat dstY0
, GLfloat dstX1
, GLfloat dstY1
,
89 GLenum filter
, GLbitfield buffer_bit
)
91 struct gl_context
*ctx
= &brw
->ctx
;
93 /* Sync up the state of window system buffers. We need to do this before
94 * we go looking for the buffers.
96 intel_prepare_render(brw
);
98 bool mirror_x
, mirror_y
;
99 if (brw_meta_mirror_clip_and_scissor(ctx
, read_fb
, draw_fb
,
100 &srcX0
, &srcY0
, &srcX1
, &srcY1
,
101 &dstX0
, &dstY0
, &dstX1
, &dstY1
,
102 &mirror_x
, &mirror_y
))
106 struct intel_renderbuffer
*src_irb
;
107 struct intel_renderbuffer
*dst_irb
;
108 struct intel_mipmap_tree
*src_mt
;
109 struct intel_mipmap_tree
*dst_mt
;
110 switch (buffer_bit
) {
111 case GL_COLOR_BUFFER_BIT
:
112 src_irb
= intel_renderbuffer(read_fb
->_ColorReadBuffer
);
113 for (unsigned i
= 0; i
< draw_fb
->_NumColorDrawBuffers
; ++i
) {
114 dst_irb
= intel_renderbuffer(draw_fb
->_ColorDrawBuffers
[i
]);
116 do_blorp_blit(brw
, buffer_bit
,
117 src_irb
, src_irb
->Base
.Base
.Format
,
118 dst_irb
, dst_irb
->Base
.Base
.Format
,
119 srcX0
, srcY0
, srcX1
, srcY1
,
120 dstX0
, dstY0
, dstX1
, dstY1
,
121 filter
, mirror_x
, mirror_y
);
124 case GL_DEPTH_BUFFER_BIT
:
126 intel_renderbuffer(read_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
128 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_DEPTH
].Renderbuffer
);
129 src_mt
= find_miptree(buffer_bit
, src_irb
);
130 dst_mt
= find_miptree(buffer_bit
, dst_irb
);
132 /* We can't handle format conversions between Z24 and other formats
133 * since we have to lie about the surface format. See the comments in
134 * brw_blorp_surface_info::set().
136 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
137 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
))
140 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
141 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
142 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
143 filter
, mirror_x
, mirror_y
);
145 case GL_STENCIL_BUFFER_BIT
:
147 intel_renderbuffer(read_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
149 intel_renderbuffer(draw_fb
->Attachment
[BUFFER_STENCIL
].Renderbuffer
);
150 do_blorp_blit(brw
, buffer_bit
, src_irb
, MESA_FORMAT_NONE
,
151 dst_irb
, MESA_FORMAT_NONE
, srcX0
, srcY0
,
152 srcX1
, srcY1
, dstX0
, dstY0
, dstX1
, dstY1
,
153 filter
, mirror_x
, mirror_y
);
156 unreachable("not reached");
163 brw_blorp_copytexsubimage(struct brw_context
*brw
,
164 struct gl_renderbuffer
*src_rb
,
165 struct gl_texture_image
*dst_image
,
167 int srcX0
, int srcY0
,
168 int dstX0
, int dstY0
,
169 int width
, int height
)
171 struct gl_context
*ctx
= &brw
->ctx
;
172 struct intel_renderbuffer
*src_irb
= intel_renderbuffer(src_rb
);
173 struct intel_texture_image
*intel_image
= intel_texture_image(dst_image
);
175 /* No pixel transfer operations (zoom, bias, mapping), just a blit */
176 if (brw
->ctx
._ImageTransferState
)
179 /* Sync up the state of window system buffers. We need to do this before
180 * we go looking at the src renderbuffer's miptree.
182 intel_prepare_render(brw
);
184 struct intel_mipmap_tree
*src_mt
= src_irb
->mt
;
185 struct intel_mipmap_tree
*dst_mt
= intel_image
->mt
;
187 /* There is support for only up to eight samples. */
188 if (src_mt
->num_samples
> 8 || dst_mt
->num_samples
> 8)
191 /* BLORP is only supported from Gen6 onwards. */
195 if (_mesa_get_format_base_format(src_rb
->Format
) !=
196 _mesa_get_format_base_format(dst_image
->TexFormat
)) {
200 /* We can't handle format conversions between Z24 and other formats since
201 * we have to lie about the surface format. See the comments in
202 * brw_blorp_surface_info::set().
204 if ((src_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
) !=
205 (dst_mt
->format
== MESA_FORMAT_Z24_UNORM_X8_UINT
)) {
209 if (!brw
->format_supported_as_render_target
[dst_image
->TexFormat
])
212 /* Source clipping shouldn't be necessary, since copytexsubimage (in
213 * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
216 * Destination clipping shouldn't be necessary since the restrictions on
217 * glCopyTexSubImage prevent the user from specifying a destination rectangle
218 * that falls outside the bounds of the destination texture.
219 * See error_check_subtexture_dimensions().
222 int srcY1
= srcY0
+ height
;
223 int srcX1
= srcX0
+ width
;
224 int dstX1
= dstX0
+ width
;
225 int dstY1
= dstY0
+ height
;
227 /* Account for the fact that in the system framebuffer, the origin is at
230 bool mirror_y
= false;
231 if (_mesa_is_winsys_fbo(ctx
->ReadBuffer
)) {
232 GLint tmp
= src_rb
->Height
- srcY0
;
233 srcY0
= src_rb
->Height
- srcY1
;
238 /* Account for face selection and texture view MinLayer */
239 int dst_slice
= slice
+ dst_image
->TexObject
->MinLayer
+ dst_image
->Face
;
240 int dst_level
= dst_image
->Level
+ dst_image
->TexObject
->MinLevel
;
242 brw_blorp_blit_miptrees(brw
,
243 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
244 src_rb
->Format
, blorp_get_texture_swizzle(src_irb
),
245 dst_mt
, dst_level
, dst_slice
,
246 dst_image
->TexFormat
,
247 srcX0
, srcY0
, srcX1
, srcY1
,
248 dstX0
, dstY0
, dstX1
, dstY1
,
249 GL_NEAREST
, false, mirror_y
,
252 /* If we're copying to a packed depth stencil texture and the source
253 * framebuffer has separate stencil, we need to also copy the stencil data
256 src_rb
= ctx
->ReadBuffer
->Attachment
[BUFFER_STENCIL
].Renderbuffer
;
257 if (_mesa_get_format_bits(dst_image
->TexFormat
, GL_STENCIL_BITS
) > 0 &&
259 src_irb
= intel_renderbuffer(src_rb
);
260 src_mt
= src_irb
->mt
;
262 if (src_mt
->stencil_mt
)
263 src_mt
= src_mt
->stencil_mt
;
264 if (dst_mt
->stencil_mt
)
265 dst_mt
= dst_mt
->stencil_mt
;
267 if (src_mt
!= dst_mt
) {
268 brw_blorp_blit_miptrees(brw
,
269 src_mt
, src_irb
->mt_level
, src_irb
->mt_layer
,
271 blorp_get_texture_swizzle(src_irb
),
272 dst_mt
, dst_level
, dst_slice
,
274 srcX0
, srcY0
, srcX1
, srcY1
,
275 dstX0
, dstY0
, dstX1
, dstY1
,
276 GL_NEAREST
, false, mirror_y
,
286 brw_blorp_framebuffer(struct brw_context
*brw
,
287 struct gl_framebuffer
*readFb
,
288 struct gl_framebuffer
*drawFb
,
289 GLint srcX0
, GLint srcY0
, GLint srcX1
, GLint srcY1
,
290 GLint dstX0
, GLint dstY0
, GLint dstX1
, GLint dstY1
,
291 GLbitfield mask
, GLenum filter
)
293 /* BLORP is not supported before Gen6. */
297 static GLbitfield buffer_bits
[] = {
300 GL_STENCIL_BUFFER_BIT
,
303 for (unsigned int i
= 0; i
< ARRAY_SIZE(buffer_bits
); ++i
) {
304 if ((mask
& buffer_bits
[i
]) &&
305 try_blorp_blit(brw
, readFb
, drawFb
,
306 srcX0
, srcY0
, srcX1
, srcY1
,
307 dstX0
, dstY0
, dstX1
, dstY1
,
308 filter
, buffer_bits
[i
])) {
309 mask
&= ~buffer_bits
[i
];
318 * Enum to specify the order of arguments in a sampler message
320 enum sampler_message_arg
322 SAMPLER_MESSAGE_ARG_U_FLOAT
,
323 SAMPLER_MESSAGE_ARG_V_FLOAT
,
324 SAMPLER_MESSAGE_ARG_U_INT
,
325 SAMPLER_MESSAGE_ARG_V_INT
,
326 SAMPLER_MESSAGE_ARG_R_INT
,
327 SAMPLER_MESSAGE_ARG_SI_INT
,
328 SAMPLER_MESSAGE_ARG_MCS_INT
,
329 SAMPLER_MESSAGE_ARG_ZERO_INT
,
332 struct brw_blorp_blit_vars
{
333 /* Uniform values from brw_blorp_wm_push_constants */
334 nir_variable
*u_dst_x0
;
335 nir_variable
*u_dst_x1
;
336 nir_variable
*u_dst_y0
;
337 nir_variable
*u_dst_y1
;
338 nir_variable
*u_rect_grid_x1
;
339 nir_variable
*u_rect_grid_y1
;
341 nir_variable
*multiplier
;
342 nir_variable
*offset
;
343 } u_x_transform
, u_y_transform
;
344 nir_variable
*u_src_z
;
347 nir_variable
*frag_coord
;
350 nir_variable
*color_out
;
354 brw_blorp_blit_vars_init(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
355 const struct brw_blorp_blit_prog_key
*key
)
357 #define LOAD_UNIFORM(name, type)\
358 v->u_##name = nir_variable_create(b->shader, nir_var_uniform, type, #name); \
359 v->u_##name->data.location = \
360 offsetof(struct brw_blorp_wm_push_constants, name);
362 LOAD_UNIFORM(dst_x0
, glsl_uint_type())
363 LOAD_UNIFORM(dst_x1
, glsl_uint_type())
364 LOAD_UNIFORM(dst_y0
, glsl_uint_type())
365 LOAD_UNIFORM(dst_y1
, glsl_uint_type())
366 LOAD_UNIFORM(rect_grid_x1
, glsl_float_type())
367 LOAD_UNIFORM(rect_grid_y1
, glsl_float_type())
368 LOAD_UNIFORM(x_transform
.multiplier
, glsl_float_type())
369 LOAD_UNIFORM(x_transform
.offset
, glsl_float_type())
370 LOAD_UNIFORM(y_transform
.multiplier
, glsl_float_type())
371 LOAD_UNIFORM(y_transform
.offset
, glsl_float_type())
372 LOAD_UNIFORM(src_z
, glsl_uint_type())
376 v
->frag_coord
= nir_variable_create(b
->shader
, nir_var_shader_in
,
377 glsl_vec4_type(), "gl_FragCoord");
378 v
->frag_coord
->data
.location
= VARYING_SLOT_POS
;
379 v
->frag_coord
->data
.origin_upper_left
= true;
381 v
->color_out
= nir_variable_create(b
->shader
, nir_var_shader_out
,
382 glsl_vec4_type(), "gl_FragColor");
383 v
->color_out
->data
.location
= FRAG_RESULT_COLOR
;
387 blorp_blit_get_frag_coords(nir_builder
*b
,
388 const struct brw_blorp_blit_prog_key
*key
,
389 struct brw_blorp_blit_vars
*v
)
391 nir_ssa_def
*coord
= nir_f2i(b
, nir_load_var(b
, v
->frag_coord
));
393 if (key
->persample_msaa_dispatch
) {
394 return nir_vec3(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1),
395 nir_load_system_value(b
, nir_intrinsic_load_sample_id
, 0));
397 return nir_vec2(b
, nir_channel(b
, coord
, 0), nir_channel(b
, coord
, 1));
402 * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
406 blorp_blit_apply_transform(nir_builder
*b
, nir_ssa_def
*src_pos
,
407 struct brw_blorp_blit_vars
*v
)
409 nir_ssa_def
*offset
= nir_vec2(b
, nir_load_var(b
, v
->u_x_transform
.offset
),
410 nir_load_var(b
, v
->u_y_transform
.offset
));
411 nir_ssa_def
*mul
= nir_vec2(b
, nir_load_var(b
, v
->u_x_transform
.multiplier
),
412 nir_load_var(b
, v
->u_y_transform
.multiplier
));
414 return nir_ffma(b
, src_pos
, mul
, offset
);
418 blorp_nir_discard_if_outside_rect(nir_builder
*b
, nir_ssa_def
*pos
,
419 struct brw_blorp_blit_vars
*v
)
421 nir_ssa_def
*c0
, *c1
, *c2
, *c3
;
422 c0
= nir_ult(b
, nir_channel(b
, pos
, 0), nir_load_var(b
, v
->u_dst_x0
));
423 c1
= nir_uge(b
, nir_channel(b
, pos
, 0), nir_load_var(b
, v
->u_dst_x1
));
424 c2
= nir_ult(b
, nir_channel(b
, pos
, 1), nir_load_var(b
, v
->u_dst_y0
));
425 c3
= nir_uge(b
, nir_channel(b
, pos
, 1), nir_load_var(b
, v
->u_dst_y1
));
426 nir_ssa_def
*oob
= nir_ior(b
, nir_ior(b
, c0
, c1
), nir_ior(b
, c2
, c3
));
428 nir_intrinsic_instr
*discard
=
429 nir_intrinsic_instr_create(b
->shader
, nir_intrinsic_discard_if
);
430 discard
->src
[0] = nir_src_for_ssa(oob
);
431 nir_builder_instr_insert(b
, &discard
->instr
);
434 static nir_tex_instr
*
435 blorp_create_nir_tex_instr(nir_shader
*shader
, nir_texop op
,
436 nir_ssa_def
*pos
, unsigned num_srcs
,
437 enum brw_reg_type dst_type
)
439 nir_tex_instr
*tex
= nir_tex_instr_create(shader
, num_srcs
);
444 case BRW_REGISTER_TYPE_F
:
445 tex
->dest_type
= nir_type_float
;
447 case BRW_REGISTER_TYPE_D
:
448 tex
->dest_type
= nir_type_int
;
450 case BRW_REGISTER_TYPE_UD
:
451 tex
->dest_type
= nir_type_uint
;
454 unreachable("Invalid texture return type");
457 tex
->is_array
= false;
458 tex
->is_shadow
= false;
460 /* Blorp only has one texture and it's bound at unit 0 */
463 tex
->texture_index
= 0;
464 tex
->sampler_index
= 0;
466 nir_ssa_dest_init(&tex
->instr
, &tex
->dest
, 4, 32, NULL
);
472 blorp_nir_tex(nir_builder
*b
, nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
475 blorp_create_nir_tex_instr(b
->shader
, nir_texop_tex
, pos
, 2, dst_type
);
477 assert(pos
->num_components
== 2);
478 tex
->sampler_dim
= GLSL_SAMPLER_DIM_2D
;
479 tex
->coord_components
= 2;
480 tex
->src
[0].src_type
= nir_tex_src_coord
;
481 tex
->src
[0].src
= nir_src_for_ssa(pos
);
482 tex
->src
[1].src_type
= nir_tex_src_lod
;
483 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
485 nir_builder_instr_insert(b
, &tex
->instr
);
487 return &tex
->dest
.ssa
;
491 blorp_nir_txf(nir_builder
*b
, struct brw_blorp_blit_vars
*v
,
492 nir_ssa_def
*pos
, enum brw_reg_type dst_type
)
495 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf
, pos
, 2, dst_type
);
497 /* In order to properly handle 3-D textures, we pull the Z component from
498 * a uniform. TODO: This is a bit magic; we should probably make this
499 * more explicit in the future.
501 assert(pos
->num_components
== 2);
502 pos
= nir_vec3(b
, nir_channel(b
, pos
, 0), nir_channel(b
, pos
, 1),
503 nir_load_var(b
, v
->u_src_z
));
505 tex
->sampler_dim
= GLSL_SAMPLER_DIM_3D
;
506 tex
->coord_components
= 3;
507 tex
->src
[0].src_type
= nir_tex_src_coord
;
508 tex
->src
[0].src
= nir_src_for_ssa(pos
);
509 tex
->src
[1].src_type
= nir_tex_src_lod
;
510 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
512 nir_builder_instr_insert(b
, &tex
->instr
);
514 return &tex
->dest
.ssa
;
518 blorp_nir_txf_ms(nir_builder
*b
, nir_ssa_def
*pos
, nir_ssa_def
*mcs
,
519 enum brw_reg_type dst_type
)
522 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf_ms
, pos
,
523 mcs
!= NULL
? 3 : 2, dst_type
);
525 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
526 tex
->coord_components
= 2;
527 tex
->src
[0].src_type
= nir_tex_src_coord
;
528 tex
->src
[0].src
= nir_src_for_ssa(pos
);
530 tex
->src
[1].src_type
= nir_tex_src_ms_index
;
531 if (pos
->num_components
== 2) {
532 tex
->src
[1].src
= nir_src_for_ssa(nir_imm_int(b
, 0));
534 assert(pos
->num_components
== 3);
535 tex
->src
[1].src
= nir_src_for_ssa(nir_channel(b
, pos
, 2));
539 tex
->src
[2].src_type
= nir_tex_src_ms_mcs
;
540 tex
->src
[2].src
= nir_src_for_ssa(mcs
);
543 nir_builder_instr_insert(b
, &tex
->instr
);
545 return &tex
->dest
.ssa
;
549 blorp_nir_txf_ms_mcs(nir_builder
*b
, nir_ssa_def
*pos
)
552 blorp_create_nir_tex_instr(b
->shader
, nir_texop_txf_ms_mcs
,
553 pos
, 1, BRW_REGISTER_TYPE_D
);
555 tex
->sampler_dim
= GLSL_SAMPLER_DIM_MS
;
556 tex
->coord_components
= 2;
557 tex
->src
[0].src_type
= nir_tex_src_coord
;
558 tex
->src
[0].src
= nir_src_for_ssa(pos
);
560 nir_builder_instr_insert(b
, &tex
->instr
);
562 return &tex
->dest
.ssa
;
566 nir_mask_shift_or(struct nir_builder
*b
, nir_ssa_def
*dst
, nir_ssa_def
*src
,
567 uint32_t src_mask
, int src_left_shift
)
569 nir_ssa_def
*masked
= nir_iand(b
, src
, nir_imm_int(b
, src_mask
));
571 nir_ssa_def
*shifted
;
572 if (src_left_shift
> 0) {
573 shifted
= nir_ishl(b
, masked
, nir_imm_int(b
, src_left_shift
));
574 } else if (src_left_shift
< 0) {
575 shifted
= nir_ushr(b
, masked
, nir_imm_int(b
, -src_left_shift
));
577 assert(src_left_shift
== 0);
581 return nir_ior(b
, dst
, shifted
);
585 * Emit code to compensate for the difference between Y and W tiling.
587 * This code modifies the X and Y coordinates according to the formula:
589 * (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
591 * (See brw_blorp_build_nir_shader).
593 static inline nir_ssa_def
*
594 blorp_nir_retile_y_to_w(nir_builder
*b
, nir_ssa_def
*pos
)
596 assert(pos
->num_components
== 2);
597 nir_ssa_def
*x_Y
= nir_channel(b
, pos
, 0);
598 nir_ssa_def
*y_Y
= nir_channel(b
, pos
, 1);
600 /* Given X and Y coordinates that describe an address using Y tiling,
601 * translate to the X and Y coordinates that describe the same address
604 * If we break down the low order bits of X and Y, using a
605 * single letter to represent each low-order bit:
607 * X = A << 7 | 0bBCDEFGH
608 * Y = J << 5 | 0bKLMNP (1)
610 * Then we can apply the Y tiling formula to see the memory offset being
613 * offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH (2)
615 * If we apply the W detiling formula to this memory location, we see that
616 * the corresponding X' and Y' coordinates are:
618 * X' = A << 6 | 0bBCDPFH (3)
619 * Y' = J << 6 | 0bKLMNEG
621 * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
622 * we need to make the following computation:
624 * X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1 (4)
625 * Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
627 nir_ssa_def
*x_W
= nir_imm_int(b
, 0);
628 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0xfffffff4, -1);
629 x_W
= nir_mask_shift_or(b
, x_W
, y_Y
, 0x1, 2);
630 x_W
= nir_mask_shift_or(b
, x_W
, x_Y
, 0x1, 0);
632 nir_ssa_def
*y_W
= nir_imm_int(b
, 0);
633 y_W
= nir_mask_shift_or(b
, y_W
, y_Y
, 0xfffffffe, 1);
634 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x8, -2);
635 y_W
= nir_mask_shift_or(b
, y_W
, x_Y
, 0x2, -1);
637 return nir_vec2(b
, x_W
, y_W
);
641 * Emit code to compensate for the difference between Y and W tiling.
643 * This code modifies the X and Y coordinates according to the formula:
645 * (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
647 * (See brw_blorp_build_nir_shader).
649 static inline nir_ssa_def
*
650 blorp_nir_retile_w_to_y(nir_builder
*b
, nir_ssa_def
*pos
)
652 assert(pos
->num_components
== 2);
653 nir_ssa_def
*x_W
= nir_channel(b
, pos
, 0);
654 nir_ssa_def
*y_W
= nir_channel(b
, pos
, 1);
656 /* Applying the same logic as above, but in reverse, we obtain the
659 * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
660 * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
662 nir_ssa_def
*x_Y
= nir_imm_int(b
, 0);
663 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0xfffffffa, 1);
664 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x2, 2);
665 x_Y
= nir_mask_shift_or(b
, x_Y
, y_W
, 0x1, 1);
666 x_Y
= nir_mask_shift_or(b
, x_Y
, x_W
, 0x1, 0);
668 nir_ssa_def
*y_Y
= nir_imm_int(b
, 0);
669 y_Y
= nir_mask_shift_or(b
, y_Y
, y_W
, 0xfffffffc, -1);
670 y_Y
= nir_mask_shift_or(b
, y_Y
, x_W
, 0x4, -2);
672 return nir_vec2(b
, x_Y
, y_Y
);
676 * Emit code to compensate for the difference between MSAA and non-MSAA
679 * This code modifies the X and Y coordinates according to the formula:
681 * (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
683 * (See brw_blorp_blit_program).
685 static inline nir_ssa_def
*
686 blorp_nir_encode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
687 unsigned num_samples
, enum intel_msaa_layout layout
)
689 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
692 case INTEL_MSAA_LAYOUT_NONE
:
693 assert(pos
->num_components
== 2);
695 case INTEL_MSAA_LAYOUT_CMS
:
696 /* We can't compensate for compressed layout since at this point in the
697 * program we haven't read from the MCS buffer.
699 unreachable("Bad layout in encode_msaa");
700 case INTEL_MSAA_LAYOUT_UMS
:
701 /* No translation needed */
703 case INTEL_MSAA_LAYOUT_IMS
: {
704 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
705 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
706 nir_ssa_def
*s_in
= pos
->num_components
== 2 ? nir_imm_int(b
, 0) :
707 nir_channel(b
, pos
, 2);
709 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
710 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
711 switch (num_samples
) {
714 /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
715 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
718 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
719 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
720 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
722 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 1);
723 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
724 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
725 if (num_samples
== 2) {
728 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
729 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
730 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
735 /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
736 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
738 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
740 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
741 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
742 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
743 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
744 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 1);
745 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
746 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
750 /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
751 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
753 * Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 | (S & 0b10)
756 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffe, 2);
757 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x4, 0);
758 x_out
= nir_mask_shift_or(b
, x_out
, s_in
, 0x1, 1);
759 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
760 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffe, 2);
761 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x8, -1);
762 y_out
= nir_mask_shift_or(b
, y_out
, s_in
, 0x2, 0);
763 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
767 unreachable("Invalid number of samples for IMS layout");
770 return nir_vec2(b
, x_out
, y_out
);
774 unreachable("Invalid MSAA layout");
779 * Emit code to compensate for the difference between MSAA and non-MSAA
782 * This code modifies the X and Y coordinates according to the formula:
784 * (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
786 * (See brw_blorp_blit_program).
788 static inline nir_ssa_def
*
789 blorp_nir_decode_msaa(nir_builder
*b
, nir_ssa_def
*pos
,
790 unsigned num_samples
, enum intel_msaa_layout layout
)
792 assert(pos
->num_components
== 2 || pos
->num_components
== 3);
795 case INTEL_MSAA_LAYOUT_NONE
:
796 /* No translation necessary, and S should already be zero. */
797 assert(pos
->num_components
== 2);
799 case INTEL_MSAA_LAYOUT_CMS
:
800 /* We can't compensate for compressed layout since at this point in the
801 * program we don't have access to the MCS buffer.
803 unreachable("Bad layout in encode_msaa");
804 case INTEL_MSAA_LAYOUT_UMS
:
805 /* No translation necessary. */
807 case INTEL_MSAA_LAYOUT_IMS
: {
808 assert(pos
->num_components
== 2);
810 nir_ssa_def
*x_in
= nir_channel(b
, pos
, 0);
811 nir_ssa_def
*y_in
= nir_channel(b
, pos
, 1);
813 nir_ssa_def
*x_out
= nir_imm_int(b
, 0);
814 nir_ssa_def
*y_out
= nir_imm_int(b
, 0);
815 nir_ssa_def
*s_out
= nir_imm_int(b
, 0);
816 switch (num_samples
) {
819 /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
820 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
821 * S = (X & 0b10) >> 1
823 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
824 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
825 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
826 * S = (Y & 0b10) | (X & 0b10) >> 1
828 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffffc, -1);
829 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
830 if (num_samples
== 2) {
832 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
834 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
835 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
836 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
837 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
842 /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
843 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
844 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
845 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
847 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
848 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
849 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffffc, -1);
850 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
851 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
852 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
853 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
857 /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
858 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
859 * Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
860 * S = (Y & 0b100) << 1 | (X & 0b100) |
861 * (Y & 0b10) | (X & 0b10) >> 1
863 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0xfffffff8, -2);
864 x_out
= nir_mask_shift_or(b
, x_out
, x_in
, 0x1, 0);
865 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0xfffffff8, -2);
866 y_out
= nir_mask_shift_or(b
, y_out
, y_in
, 0x1, 0);
867 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x4, 1);
868 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x4, 0);
869 s_out
= nir_mask_shift_or(b
, s_out
, y_in
, 0x2, 0);
870 s_out
= nir_mask_shift_or(b
, s_out
, x_in
, 0x2, -1);
874 unreachable("Invalid number of samples for IMS layout");
877 return nir_vec3(b
, x_out
, y_out
, s_out
);
881 unreachable("Invalid MSAA layout");
/**
 * Count the number of trailing 1 bits in the given value.  For example:
 *
 * count_trailing_one_bits(0) == 0
 * count_trailing_one_bits(7) == 3
 * count_trailing_one_bits(11) == 2
 *
 * A value with every bit set yields the bit width of unsigned.
 */
static inline int count_trailing_one_bits(unsigned value)
{
#ifdef HAVE___BUILTIN_CTZ
   /* ~value is zero when every bit of value is set, and __builtin_ctz(0)
    * is undefined, so that input is answered before reaching the builtin.
    */
   if (value == ~0u)
      return (int) (sizeof(value) * 8);

   return __builtin_ctz(~value);
#else
   /* Portable fallback: strip 1 bits off the low end until a 0 bit (or an
    * exhausted value) is reached.
    */
   int count = 0;
   while (value & 1u) {
      value >>= 1;
      count++;
   }
   return count;
#endif
}
902 blorp_nir_manual_blend_average(nir_builder
*b
, nir_ssa_def
*pos
,
903 unsigned tex_samples
,
904 enum intel_msaa_layout tex_layout
,
905 enum brw_reg_type dst_type
)
907 /* If non-null, this is the outer-most if statement */
908 nir_if
*outer_if
= NULL
;
910 nir_variable
*color
=
911 nir_local_variable_create(b
->impl
, glsl_vec4_type(), "color");
913 nir_ssa_def
*mcs
= NULL
;
914 if (tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
915 mcs
= blorp_nir_txf_ms_mcs(b
, pos
);
917 /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
919 * result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
921 * This ensures that when all samples have the same value, no numerical
922 * precision is lost, since each addition operation always adds two equal
923 * values, and summing two equal floating point values does not lose
926 * We perform this computation by treating the texture_data array as a
927 * stack and performing the following operations:
929 * - push sample 0 onto stack
930 * - push sample 1 onto stack
931 * - add top two stack entries
932 * - push sample 2 onto stack
933 * - push sample 3 onto stack
934 * - add top two stack entries
935 * - add top two stack entries
936 * - divide top stack entry by 4
938 * Note that after pushing sample i onto the stack, the number of add
939 * operations we do is equal to the number of trailing 1 bits in i. This
940 * works provided the total number of samples is a power of two, which it
941 * always is for i965.
943 * For integer formats, we replace the add operations with average
944 * operations and skip the final division.
946 nir_ssa_def
*texture_data
[5];
947 unsigned stack_depth
= 0;
948 for (unsigned i
= 0; i
< tex_samples
; ++i
) {
949 assert(stack_depth
== _mesa_bitcount(i
)); /* Loop invariant */
951 /* Push sample i onto the stack */
952 assert(stack_depth
< ARRAY_SIZE(texture_data
));
954 nir_ssa_def
*ms_pos
= nir_vec3(b
, nir_channel(b
, pos
, 0),
955 nir_channel(b
, pos
, 1),
957 texture_data
[stack_depth
++] = blorp_nir_txf_ms(b
, ms_pos
, mcs
, dst_type
);
959 if (i
== 0 && tex_layout
== INTEL_MSAA_LAYOUT_CMS
) {
960 /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
961 * suggests an optimization:
963 * "A simple optimization with probable large return in
964 * performance is to compare the MCS value to zero (indicating
965 * all samples are on sample slice 0), and sample only from
966 * sample slice 0 using ld2dss if MCS is zero."
968 * Note that in the case where the MCS value is zero, sampling from
969 * sample slice 0 using ld2dss and sampling from sample 0 using
970 * ld2dms are equivalent (since all samples are on sample slice 0).
971 * Since we have already sampled from sample 0, all we need to do is
972 * skip the remaining fetches and averaging if MCS is zero.
974 nir_ssa_def
*mcs_zero
=
975 nir_ieq(b
, nir_channel(b
, mcs
, 0), nir_imm_int(b
, 0));
976 if (tex_samples
== 16) {
977 mcs_zero
= nir_iand(b
, mcs_zero
,
978 nir_ieq(b
, nir_channel(b
, mcs
, 1), nir_imm_int(b
, 0)));
981 nir_if
*if_stmt
= nir_if_create(b
->shader
);
982 if_stmt
->condition
= nir_src_for_ssa(mcs_zero
);
983 nir_cf_node_insert(b
->cursor
, &if_stmt
->cf_node
);
985 b
->cursor
= nir_after_cf_list(&if_stmt
->then_list
);
986 nir_store_var(b
, color
, texture_data
[0], 0xf);
988 b
->cursor
= nir_after_cf_list(&if_stmt
->else_list
);
992 for (int j
= 0; j
< count_trailing_one_bits(i
); j
++) {
993 assert(stack_depth
>= 2);
996 assert(dst_type
== BRW_REGISTER_TYPE_F
);
997 texture_data
[stack_depth
- 1] =
998 nir_fadd(b
, texture_data
[stack_depth
- 1],
999 texture_data
[stack_depth
]);
1003 /* We should have just 1 sample on the stack now. */
1004 assert(stack_depth
== 1);
1006 texture_data
[0] = nir_fmul(b
, texture_data
[0],
1007 nir_imm_float(b
, 1.0 / tex_samples
));
1009 nir_store_var(b
, color
, texture_data
[0], 0xf);
1012 b
->cursor
= nir_after_cf_node(&outer_if
->cf_node
);
1014 return nir_load_var(b
, color
);
1017 static inline nir_ssa_def
*
1018 nir_imm_vec2(nir_builder
*build
, float x
, float y
)
1022 memset(&v
, 0, sizeof(v
));
1026 return nir_build_imm(build
, 4, 32, v
);
1029 static nir_ssa_def
*
1030 blorp_nir_manual_blend_bilinear(nir_builder
*b
, nir_ssa_def
*pos
,
1031 unsigned tex_samples
,
1032 const brw_blorp_blit_prog_key
*key
,
1033 struct brw_blorp_blit_vars
*v
)
1035 nir_ssa_def
*pos_xy
= nir_channels(b
, pos
, 0x3);
1037 nir_ssa_def
*scale
= nir_imm_vec2(b
, key
->x_scale
, key
->y_scale
);
1039 /* Translate coordinates to lay out the samples in a rectangular grid
1040 * roughly corresponding to sample locations.
1042 pos_xy
= nir_fmul(b
, pos_xy
, scale
);
1043 /* Adjust coordinates so that integers represent pixel centers rather
1046 pos_xy
= nir_fadd(b
, pos_xy
, nir_imm_float(b
, -0.5));
1047 /* Clamp the X, Y texture coordinates to properly handle the sampling of
1048 * texels on texture edges.
1050 pos_xy
= nir_fmin(b
, nir_fmax(b
, pos_xy
, nir_imm_float(b
, 0.0)),
1051 nir_vec2(b
, nir_load_var(b
, v
->u_rect_grid_x1
),
1052 nir_load_var(b
, v
->u_rect_grid_y1
)));
1054 /* Store the fractional parts to be used as bilinear interpolation
1057 nir_ssa_def
*frac_xy
= nir_ffract(b
, pos_xy
);
1058 /* Round the float coordinates down to nearest integer */
1059 pos_xy
= nir_fdiv(b
, nir_ftrunc(b
, pos_xy
), scale
);
1061 nir_ssa_def
*tex_data
[4];
1062 for (unsigned i
= 0; i
< 4; ++i
) {
1063 float sample_off_x
= (float)(i
& 0x1) / key
->x_scale
;
1064 float sample_off_y
= (float)((i
>> 1) & 0x1) / key
->y_scale
;
1065 nir_ssa_def
*sample_off
= nir_imm_vec2(b
, sample_off_x
, sample_off_y
);
1067 nir_ssa_def
*sample_coords
= nir_fadd(b
, pos_xy
, sample_off
);
1068 nir_ssa_def
*sample_coords_int
= nir_f2i(b
, sample_coords
);
1070 /* The MCS value we fetch has to match up with the pixel that we're
1071 * sampling from. Since we sample from different pixels in each
1072 * iteration of this "for" loop, the call to mcs_fetch() should be
1073 * here inside the loop after computing the pixel coordinates.
1075 nir_ssa_def
*mcs
= NULL
;
1076 if (key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
1077 mcs
= blorp_nir_txf_ms_mcs(b
, sample_coords_int
);
1079 /* Compute sample index and map the sample index to a sample number.
1080 * Sample index layout shows the numbering of slots in a rectangular
1081 * grid of samples with in a pixel. Sample number layout shows the
1082 * rectangular grid of samples roughly corresponding to the real sample
1083 * locations with in a pixel.
1084 * In case of 4x MSAA, layout of sample indices matches the layout of
1092 * In case of 8x MSAA the two layouts don't match.
1093 * sample index layout : --------- sample number layout : ---------
1094 * | 0 | 1 | | 5 | 2 |
1095 * --------- ---------
1096 * | 2 | 3 | | 4 | 6 |
1097 * --------- ---------
1098 * | 4 | 5 | | 0 | 3 |
1099 * --------- ---------
1100 * | 6 | 7 | | 7 | 1 |
1101 * --------- ---------
1103 * Fortunately, this can be done fairly easily as:
1104 * S' = (0x17306425 >> (S * 4)) & 0xf
1106 * In the case of 16x MSAA the two layouts don't match.
1107 * Sample index layout: Sample number layout:
1108 * --------------------- ---------------------
1109 * | 0 | 1 | 2 | 3 | | 15 | 10 | 9 | 13 |
1110 * --------------------- ---------------------
1111 * | 4 | 5 | 6 | 7 | | 4 | 1 | 7 | 3 |
1112 * --------------------- ---------------------
1113 * | 8 | 9 | 10 | 11 | | 12 | 2 | 0 | 6 |
1114 * --------------------- ---------------------
1115 * | 12 | 13 | 14 | 15 | | 11 | 8 | 5 | 14 |
1116 * --------------------- ---------------------
1118 * This is equivalent to
1119 * S' = (0xfa9d4173c206b85e >> (S * 4)) & 0xf
1121 nir_ssa_def
*frac
= nir_ffract(b
, sample_coords
);
1122 nir_ssa_def
*sample
=
1123 nir_fdot2(b
, frac
, nir_imm_vec2(b
, key
->x_scale
,
1124 key
->x_scale
* key
->y_scale
));
1125 sample
= nir_f2i(b
, sample
);
1127 if (tex_samples
== 8) {
1128 sample
= nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0x17306425),
1129 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1130 nir_imm_int(b
, 0xf));
1131 } else if (tex_samples
== 16) {
1132 nir_ssa_def
*sample_low
=
1133 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xc206b85e),
1134 nir_ishl(b
, sample
, nir_imm_int(b
, 2))),
1135 nir_imm_int(b
, 0xf));
1136 nir_ssa_def
*sample_high
=
1137 nir_iand(b
, nir_ishr(b
, nir_imm_int(b
, 0xfa9d4173),
1138 nir_ishl(b
, nir_iadd(b
, sample
,
1139 nir_imm_int(b
, -8)),
1140 nir_imm_int(b
, 2))),
1141 nir_imm_int(b
, 0xf));
1143 sample
= nir_bcsel(b
, nir_ilt(b
, sample
, nir_imm_int(b
, 8)),
1144 sample_low
, sample_high
);
1146 nir_ssa_def
*pos_ms
= nir_vec3(b
, nir_channel(b
, sample_coords_int
, 0),
1147 nir_channel(b
, sample_coords_int
, 1),
1149 tex_data
[i
] = blorp_nir_txf_ms(b
, pos_ms
, mcs
, key
->texture_data_type
);
1152 nir_ssa_def
*frac_x
= nir_channel(b
, frac_xy
, 0);
1153 nir_ssa_def
*frac_y
= nir_channel(b
, frac_xy
, 1);
1154 return nir_flrp(b
, nir_flrp(b
, tex_data
[0], tex_data
[1], frac_x
),
1155 nir_flrp(b
, tex_data
[2], tex_data
[3], frac_x
),
1160 * Generator for WM programs used in BLORP blits.
1162 * The bulk of the work done by the WM program is to wrap and unwrap the
1163 * coordinate transformations used by the hardware to store surfaces in
1164 * memory. The hardware transforms a pixel location (X, Y, S) (where S is the
1165 * sample index for a multisampled surface) to a memory offset by the
1166 * following formulas:
1168 * offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
1169 * (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
1171 * For a single-sampled surface, or for a multisampled surface using
1172 * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
1175 * encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1176 * decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1177 * encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1178 * decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1180 * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1181 * embeds the sample number into bit 1 of the X and Y coordinates:
1183 * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1184 * where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1185 * Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
1186 * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1187 * where X' = (X & ~0b11) >> 1 | (X & 0b1)
1188 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1189 * S = (Y & 0b10) | (X & 0b10) >> 1
1191 * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1192 * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
1195 * encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1196 * where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
1197 * Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1198 * decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1199 * where X' = (X & ~0b111) >> 2 | (X & 0b1)
1200 * Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1201 * S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1203 * For X tiling, tile() combines together the low-order bits of the X and Y
1204 * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
1205 * bytes wide and 8 rows high:
1207 * tile(x_tiled, X, Y, S) = A
1208 * where A = tile_num << 12 | offset
1209 * tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
1210 * offset = (Y' & 0b111) << 9
1211 * | (X' & 0b111111111)
1213 * Y' = Y + S * qpitch
1214 * detile(x_tiled, A) = (X, Y, S)
1215 * where X = X' / cpp
1218 * Y' = (tile_num / tile_pitch) << 3
1219 * | (A & 0b111000000000) >> 9
1220 * X' = (tile_num % tile_pitch) << 9
1221 * | (A & 0b111111111)
1223 * (In all tiling formulas, cpp is the number of bytes occupied by a single
1224 * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
1225 * to fill the width of the surface, and qpitch is the spacing (in rows)
1226 * between array slices).
1228 * For Y tiling, tile() combines together the low-order bits of the X and Y
1229 * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
1230 * bytes wide and 32 rows high:
1232 * tile(y_tiled, X, Y, S) = A
1233 * where A = tile_num << 12 | offset
1234 * tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
1235 * offset = (X' & 0b1110000) << 5
1236 * | (Y' & 0b11111) << 4
1239 * Y' = Y + S * qpitch
1240 * detile(y_tiled, A) = (X, Y, S)
1241 * where X = X' / cpp
1244 * Y' = (tile_num / tile_pitch) << 5
1245 * | (A & 0b111110000) >> 4
1246 * X' = (tile_num % tile_pitch) << 7
1247 * | (A & 0b111000000000) >> 5
1250 * For W tiling, tile() combines together the low-order bits of the X and Y
1251 * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
1252 * bytes wide and 64 rows high (note that W tiling is only used for stencil
1253 * buffers, which always have cpp = 1 and S=0):
1255 * tile(w_tiled, X, Y, S) = A
1256 * where A = tile_num << 12 | offset
1257 * tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
1258 * offset = (X' & 0b111000) << 6
1259 * | (Y' & 0b111100) << 3
1260 * | (X' & 0b100) << 2
1261 * | (Y' & 0b10) << 2
1262 * | (X' & 0b10) << 1
1266 * Y' = Y + S * qpitch
1267 * detile(w_tiled, A) = (X, Y, S)
1268 * where X = X' / cpp = X'
1269 * Y = Y' % qpitch = Y'
1270 * S = Y' / qpitch = 0
1271 * Y' = (tile_num / tile_pitch) << 6
1272 * | (A & 0b111100000) >> 3
1273 * | (A & 0b1000) >> 2
1275 * X' = (tile_num % tile_pitch) << 6
1276 * | (A & 0b111000000000) >> 6
1277 * | (A & 0b10000) >> 2
1278 * | (A & 0b100) >> 1
1281 * Finally, for a non-tiled surface, tile() simply combines together the X and
1282 * Y coordinates in the natural way:
1284 * tile(untiled, X, Y, S) = A
1285 * where A = Y' * pitch + X'
1287 * Y' = Y + S * qpitch
1288 * detile(untiled, A) = (X, Y, S)
1289 * where X = X' / cpp
1295 * (In these formulas, pitch is the number of bytes occupied by a single row
1299 brw_blorp_build_nir_shader(struct brw_context
*brw
,
1300 const brw_blorp_blit_prog_key
*key
)
1302 nir_ssa_def
*src_pos
, *dst_pos
, *color
;
1305 if (key
->dst_tiled_w
&& key
->rt_samples
> 0) {
1306 /* If the destination image is W tiled and multisampled, then the thread
1307 * must be dispatched once per sample, not once per pixel. This is
1308 * necessary because after conversion between W and Y tiling, there's no
1309 * guarantee that all samples corresponding to a single pixel will still
1312 assert(key
->persample_msaa_dispatch
);
1316 /* We are blending, which means we won't have an opportunity to
1317 * translate the tiling and sample count for the texture surface. So
1318 * the surface state for the texture must be configured with the correct
1319 * tiling and sample count.
1321 assert(!key
->src_tiled_w
);
1322 assert(key
->tex_samples
== key
->src_samples
);
1323 assert(key
->tex_layout
== key
->src_layout
);
1324 assert(key
->tex_samples
> 0);
1327 if (key
->persample_msaa_dispatch
) {
1328 /* It only makes sense to do persample dispatch if the render target is
1329 * configured as multisampled.
1331 assert(key
->rt_samples
> 0);
1334 /* Make sure layout is consistent with sample count */
1335 assert((key
->tex_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1336 (key
->tex_samples
== 0));
1337 assert((key
->rt_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1338 (key
->rt_samples
== 0));
1339 assert((key
->src_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1340 (key
->src_samples
== 0));
1341 assert((key
->dst_layout
== INTEL_MSAA_LAYOUT_NONE
) ==
1342 (key
->dst_samples
== 0));
1345 nir_builder_init_simple_shader(&b
, NULL
, MESA_SHADER_FRAGMENT
, NULL
);
1347 struct brw_blorp_blit_vars v
;
1348 brw_blorp_blit_vars_init(&b
, &v
, key
);
1350 dst_pos
= blorp_blit_get_frag_coords(&b
, key
, &v
);
1352 /* Render target and texture hardware don't support W tiling until Gen8. */
1353 const bool rt_tiled_w
= false;
1354 const bool tex_tiled_w
= brw
->gen
>= 8 && key
->src_tiled_w
;
1356 /* The address that data will be written to is determined by the
1357 * coordinates supplied to the WM thread and the tiling and sample count of
1358 * the render target, according to the formula:
1360 * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
1362 * If the actual tiling and sample count of the destination surface are not
1363 * the same as the configuration of the render target, then these
1364 * coordinates are wrong and we have to adjust them to compensate for the
1367 if (rt_tiled_w
!= key
->dst_tiled_w
||
1368 key
->rt_samples
!= key
->dst_samples
||
1369 key
->rt_layout
!= key
->dst_layout
) {
1370 dst_pos
= blorp_nir_encode_msaa(&b
, dst_pos
, key
->rt_samples
,
1372 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1373 if (rt_tiled_w
!= key
->dst_tiled_w
)
1374 dst_pos
= blorp_nir_retile_y_to_w(&b
, dst_pos
);
1375 /* Now (X, Y, S) = detile(rt_tiling, offset) */
1376 dst_pos
= blorp_nir_decode_msaa(&b
, dst_pos
, key
->dst_samples
,
1380 /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
1382 * That is: X, Y and S now contain the true coordinates and sample index of
1383 * the data that the WM thread should output.
1385 * If we need to kill pixels that are outside the destination rectangle,
1386 * now is the time to do it.
1389 blorp_nir_discard_if_outside_rect(&b
, dst_pos
, &v
);
1391 src_pos
= blorp_blit_apply_transform(&b
, nir_i2f(&b
, dst_pos
), &v
);
1392 if (dst_pos
->num_components
== 3) {
1393 /* The sample coordinate is an integer that we want left alone but
1394 * blorp_blit_apply_transform() blindly applies the transform to all
1395 * three coordinates. Grab the original sample index.
1397 src_pos
= nir_vec3(&b
, nir_channel(&b
, src_pos
, 0),
1398 nir_channel(&b
, src_pos
, 1),
1399 nir_channel(&b
, dst_pos
, 2));
1402 /* If the source image is not multisampled, then we want to fetch sample
1403 * number 0, because that's the only sample there is.
1405 if (key
->src_samples
== 0)
1406 src_pos
= nir_channels(&b
, src_pos
, 0x3);
1408 /* X, Y, and S are now the coordinates of the pixel in the source image
1409 * that we want to texture from. Exception: if we are blending, then S is
1410 * irrelevant, because we are going to fetch all samples.
1412 if (key
->blend
&& !key
->blit_scaled
) {
1413 /* Resolves (effecively) use texelFetch, so we need integers and we
1414 * don't care about the sample index if we got one.
1416 src_pos
= nir_f2i(&b
, nir_channels(&b
, src_pos
, 0x3));
1418 if (brw
->gen
== 6) {
1419 /* Because gen6 only supports 4x interleved MSAA, we can do all the
1420 * blending we need with a single linear-interpolated texture lookup
1421 * at the center of the sample. The texture coordinates to be odd
1422 * integers so that they correspond to the center of a 2x2 block
1423 * representing the four samples that maxe up a pixel. So we need
1424 * to multiply our X and Y coordinates each by 2 and then add 1.
1426 src_pos
= nir_ishl(&b
, src_pos
, nir_imm_int(&b
, 1));
1427 src_pos
= nir_iadd(&b
, src_pos
, nir_imm_int(&b
, 1));
1428 src_pos
= nir_i2f(&b
, src_pos
);
1429 color
= blorp_nir_tex(&b
, src_pos
, key
->texture_data_type
);
1431 /* Gen7+ hardware doesn't automaticaly blend. */
1432 color
= blorp_nir_manual_blend_average(&b
, src_pos
, key
->src_samples
,
1434 key
->texture_data_type
);
1436 } else if (key
->blend
&& key
->blit_scaled
) {
1437 color
= blorp_nir_manual_blend_bilinear(&b
, src_pos
, key
->src_samples
, key
, &v
);
1439 if (key
->bilinear_filter
) {
1440 color
= blorp_nir_tex(&b
, src_pos
, key
->texture_data_type
);
1442 /* We're going to use texelFetch, so we need integers */
1443 if (src_pos
->num_components
== 2) {
1444 src_pos
= nir_f2i(&b
, src_pos
);
1446 assert(src_pos
->num_components
== 3);
1447 src_pos
= nir_vec3(&b
, nir_channel(&b
, nir_f2i(&b
, src_pos
), 0),
1448 nir_channel(&b
, nir_f2i(&b
, src_pos
), 1),
1449 nir_channel(&b
, src_pos
, 2));
1452 /* We aren't blending, which means we just want to fetch a single
1453 * sample from the source surface. The address that we want to fetch
1454 * from is related to the X, Y and S values according to the formula:
1456 * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
1458 * If the actual tiling and sample count of the source surface are
1459 * not the same as the configuration of the texture, then we need to
1460 * adjust the coordinates to compensate for the difference.
1462 if (tex_tiled_w
!= key
->src_tiled_w
||
1463 key
->tex_samples
!= key
->src_samples
||
1464 key
->tex_layout
!= key
->src_layout
) {
1465 src_pos
= blorp_nir_encode_msaa(&b
, src_pos
, key
->src_samples
,
1467 /* Now (X, Y, S) = detile(src_tiling, offset) */
1468 if (tex_tiled_w
!= key
->src_tiled_w
)
1469 src_pos
= blorp_nir_retile_w_to_y(&b
, src_pos
);
1470 /* Now (X, Y, S) = detile(tex_tiling, offset) */
1471 src_pos
= blorp_nir_decode_msaa(&b
, src_pos
, key
->tex_samples
,
1475 /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
1477 * In other words: X, Y, and S now contain values which, when passed to
1478 * the texturing unit, will cause data to be read from the correct
1479 * memory location. So we can fetch the texel now.
1481 if (key
->src_samples
== 0) {
1482 color
= blorp_nir_txf(&b
, &v
, src_pos
, key
->texture_data_type
);
1484 nir_ssa_def
*mcs
= NULL
;
1485 if (key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
)
1486 mcs
= blorp_nir_txf_ms_mcs(&b
, src_pos
);
1488 color
= blorp_nir_txf_ms(&b
, src_pos
, mcs
, key
->texture_data_type
);
1493 nir_store_var(&b
, v
.color_out
, color
, 0xf);
1499 brw_blorp_get_blit_kernel(struct brw_context
*brw
,
1500 struct brw_blorp_params
*params
,
1501 const struct brw_blorp_blit_prog_key
*prog_key
)
1503 if (brw_search_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1504 prog_key
, sizeof(*prog_key
),
1505 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
))
1508 const unsigned *program
;
1509 unsigned program_size
;
1510 struct brw_blorp_prog_data prog_data
;
1512 /* Try and compile with NIR first. If that fails, fall back to the old
1513 * method of building shaders manually.
1515 nir_shader
*nir
= brw_blorp_build_nir_shader(brw
, prog_key
);
1516 struct brw_wm_prog_key wm_key
;
1517 brw_blorp_init_wm_prog_key(&wm_key
);
1518 wm_key
.tex
.compressed_multisample_layout_mask
=
1519 prog_key
->tex_layout
== INTEL_MSAA_LAYOUT_CMS
;
1520 wm_key
.tex
.msaa_16
= prog_key
->tex_samples
== 16;
1521 wm_key
.multisample_fbo
= prog_key
->rt_samples
> 1;
1523 program
= brw_blorp_compile_nir_shader(brw
, nir
, &wm_key
, false,
1524 &prog_data
, &program_size
);
1526 brw_upload_cache(&brw
->cache
, BRW_CACHE_BLORP_PROG
,
1527 prog_key
, sizeof(*prog_key
),
1528 program
, program_size
,
1529 &prog_data
, sizeof(prog_data
),
1530 ¶ms
->wm_prog_kernel
, ¶ms
->wm_prog_data
);
1534 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform
*xform
,
1535 GLfloat src0
, GLfloat src1
,
1536 GLfloat dst0
, GLfloat dst1
,
1539 float scale
= (src1
- src0
) / (dst1
- dst0
);
1541 /* When not mirroring a coordinate (say, X), we need:
1542 * src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1544 * src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1546 * blorp program uses "round toward zero" to convert the
1547 * transformed floating point coordinates to integer coordinates,
1548 * whereas the behaviour we actually want is "round to nearest",
1549 * so 0.5 provides the necessary correction.
1551 xform
->multiplier
= scale
;
1552 xform
->offset
= src0
+ (-dst0
+ 0.5f
) * scale
;
1554 /* When mirroring X we need:
1555 * src_x - src_x0 = dst_x1 - dst_x - 0.5
1557 * src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
1559 xform
->multiplier
= -scale
;
1560 xform
->offset
= src0
+ (dst1
- 0.5f
) * scale
;
1566 * Determine which MSAA layout the GPU pipeline should be configured for,
1567 * based on the chip generation, the number of samples, and the true layout of
1568 * the image in memory.
1570 inline intel_msaa_layout
1571 compute_msaa_layout_for_pipeline(struct brw_context
*brw
, unsigned num_samples
,
1572 intel_msaa_layout true_layout
)
1574 if (num_samples
<= 1) {
1575 /* Layout is used to determine if ld2dms is needed for sampling. In
1576 * single sampled case normal ld is enough avoiding also the need to
1577 * fetch mcs. Therefore simply set the layout to none.
1579 if (brw
->gen
>= 9 && true_layout
== INTEL_MSAA_LAYOUT_CMS
) {
1580 return INTEL_MSAA_LAYOUT_NONE
;
1583 /* When configuring the GPU for non-MSAA, we can still accommodate IMS
1584 * format buffers, by transforming coordinates appropriately.
1586 assert(true_layout
== INTEL_MSAA_LAYOUT_NONE
||
1587 true_layout
== INTEL_MSAA_LAYOUT_IMS
);
1588 return INTEL_MSAA_LAYOUT_NONE
;
1590 assert(true_layout
!= INTEL_MSAA_LAYOUT_NONE
);
1593 /* Prior to Gen7, all MSAA surfaces use IMS layout. */
1594 if (brw
->gen
== 6) {
1595 assert(true_layout
== INTEL_MSAA_LAYOUT_IMS
);
1603 * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
1604 * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
1605 * the physical layer holding sample 0. So, for example, if
1606 * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
1610 brw_blorp_blit_miptrees(struct brw_context
*brw
,
1611 struct intel_mipmap_tree
*src_mt
,
1612 unsigned src_level
, unsigned src_layer
,
1613 mesa_format src_format
, int src_swizzle
,
1614 struct intel_mipmap_tree
*dst_mt
,
1615 unsigned dst_level
, unsigned dst_layer
,
1616 mesa_format dst_format
,
1617 float src_x0
, float src_y0
,
1618 float src_x1
, float src_y1
,
1619 float dst_x0
, float dst_y0
,
1620 float dst_x1
, float dst_y1
,
1621 GLenum filter
, bool mirror_x
, bool mirror_y
,
1622 bool decode_srgb
, bool encode_srgb
)
1624 /* Get ready to blit. This includes depth resolving the src and dst
1625 * buffers if necessary. Note: it's not necessary to do a color resolve on
1626 * the destination buffer because we use the standard render path to render
1627 * to destination color buffers, and the standard render path is
1630 intel_miptree_resolve_color(brw
, src_mt
, INTEL_MIPTREE_IGNORE_CCS_E
);
1631 intel_miptree_slice_resolve_depth(brw
, src_mt
, src_level
, src_layer
);
1632 intel_miptree_slice_resolve_depth(brw
, dst_mt
, dst_level
, dst_layer
);
1634 intel_miptree_prepare_mcs(brw
, dst_mt
);
1636 DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
1637 "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
1639 src_mt
->num_samples
, _mesa_get_format_name(src_mt
->format
), src_mt
,
1640 src_level
, src_layer
, src_x0
, src_y0
, src_x1
, src_y1
,
1641 dst_mt
->num_samples
, _mesa_get_format_name(dst_mt
->format
), dst_mt
,
1642 dst_level
, dst_layer
, dst_x0
, dst_y0
, dst_x1
, dst_y1
,
1643 mirror_x
, mirror_y
);
1645 if (!decode_srgb
&& _mesa_get_format_color_encoding(src_format
) == GL_SRGB
)
1646 src_format
= _mesa_get_srgb_format_linear(src_format
);
1648 if (!encode_srgb
&& _mesa_get_format_color_encoding(dst_format
) == GL_SRGB
)
1649 dst_format
= _mesa_get_srgb_format_linear(dst_format
);
1651 struct brw_blorp_params params
;
1652 brw_blorp_params_init(¶ms
);
1654 brw_blorp_surface_info_init(brw
, ¶ms
.src
, src_mt
, src_level
,
1655 src_layer
, src_format
, false);
1656 brw_blorp_surface_info_init(brw
, ¶ms
.dst
, dst_mt
, dst_level
,
1657 dst_layer
, dst_format
, true);
1659 /* Even though we do multisample resolves at the time of the blit, OpenGL
1660 * specification defines them as if they happen at the time of rendering,
1661 * which means that the type of averaging we do during the resolve should
1662 * only depend on the source format; the destination format should be
1663 * ignored. But, specification doesn't seem to be strict about it.
1665 * It has been observed that multisample resolves produce slightly better
1666 * looking images when averaging is done using destination format. NVIDIA's
1667 * proprietary OpenGL driver also follow this approach. So, we choose to
1668 * follow it in our driver.
1670 * When multisampling, if the source and destination formats are equal
1671 * (aside from the color space), we choose to blit in sRGB space to get
1672 * this higher quality image.
1674 if (params
.src
.num_samples
> 1 &&
1675 _mesa_get_format_color_encoding(dst_mt
->format
) == GL_SRGB
&&
1676 _mesa_get_srgb_format_linear(src_mt
->format
) ==
1677 _mesa_get_srgb_format_linear(dst_mt
->format
)) {
1678 assert(brw
->format_supported_as_render_target
[dst_mt
->format
]);
1679 params
.dst
.brw_surfaceformat
= brw
->render_target_format
[dst_mt
->format
];
1680 params
.src
.brw_surfaceformat
= brw_format_for_mesa_format(dst_mt
->format
);
1683 /* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
1684 * texture, the above code configures the source format for L32_FLOAT or
1685 * I32_FLOAT, and the destination format for R32_FLOAT. On Sandy Bridge,
1686 * the SAMPLE message appears to handle multisampled L32_FLOAT and
1687 * I32_FLOAT textures incorrectly, resulting in blocky artifacts. So work
1688 * around the problem by using a source format of R32_FLOAT. This
1689 * shouldn't affect rendering correctness, since the destination format is
1690 * R32_FLOAT, so only the contents of the red channel matters.
1692 if (brw
->gen
== 6 &&
1693 params
.src
.num_samples
> 1 && params
.dst
.num_samples
<= 1 &&
1694 src_mt
->format
== dst_mt
->format
&&
1695 params
.dst
.brw_surfaceformat
== BRW_SURFACEFORMAT_R32_FLOAT
) {
1696 params
.src
.brw_surfaceformat
= params
.dst
.brw_surfaceformat
;
1699 struct brw_blorp_blit_prog_key wm_prog_key
;
1700 memset(&wm_prog_key
, 0, sizeof(wm_prog_key
));
1702 /* texture_data_type indicates the register type that should be used to
1703 * manipulate texture data.
1705 switch (_mesa_get_format_datatype(src_mt
->format
)) {
1706 case GL_UNSIGNED_NORMALIZED
:
1707 case GL_SIGNED_NORMALIZED
:
1709 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1711 case GL_UNSIGNED_INT
:
1712 if (src_mt
->format
== MESA_FORMAT_S_UINT8
) {
1713 /* We process stencil as though it's an unsigned normalized color */
1714 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_F
;
1716 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_UD
;
1720 wm_prog_key
.texture_data_type
= BRW_REGISTER_TYPE_D
;
1723 unreachable("Unrecognized blorp format");
1727 /* Gen7's rendering hardware only supports the IMS layout for depth and
1728 * stencil render targets. Blorp always maps its destination surface as
1729 * a color render target (even if it's actually a depth or stencil
1730 * buffer). So if the destination is IMS, we'll have to map it as a
1731 * single-sampled texture and interleave the samples ourselves.
1733 if (dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_IMS
)
1734 params
.dst
.num_samples
= 0;
1737 if (params
.dst
.map_stencil_as_y_tiled
&& params
.dst
.num_samples
> 1) {
1738 /* If the destination surface is a W-tiled multisampled stencil buffer
1739 * that we're mapping as Y tiled, then we need to arrange for the WM
1740 * program to run once per sample rather than once per pixel, because
1741 * the memory layout of related samples doesn't match between W and Y
1744 wm_prog_key
.persample_msaa_dispatch
= true;
1747 if (params
.src
.num_samples
> 0 && params
.dst
.num_samples
> 1) {
1748 /* We are blitting from a multisample buffer to a multisample buffer, so
1749 * we must preserve samples within a pixel. This means we have to
1750 * arrange for the WM program to run once per sample rather than once
1753 wm_prog_key
.persample_msaa_dispatch
= true;
1756 /* Scaled blitting or not. */
1757 wm_prog_key
.blit_scaled
=
1758 ((dst_x1
- dst_x0
) == (src_x1
- src_x0
) &&
1759 (dst_y1
- dst_y0
) == (src_y1
- src_y0
)) ? false : true;
1761 /* Scaling factors used for bilinear filtering in multisample scaled
1764 wm_prog_key
.x_scale
= 2.0f
;
1765 wm_prog_key
.y_scale
= src_mt
->num_samples
/ 2.0f
;
1767 if (filter
== GL_LINEAR
&&
1768 params
.src
.num_samples
<= 1 && params
.dst
.num_samples
<= 1)
1769 wm_prog_key
.bilinear_filter
= true;
1771 GLenum base_format
= _mesa_get_format_base_format(src_mt
->format
);
1772 if (base_format
!= GL_DEPTH_COMPONENT
&& /* TODO: what about depth/stencil? */
1773 base_format
!= GL_STENCIL_INDEX
&&
1774 !_mesa_is_format_integer(src_mt
->format
) &&
1775 src_mt
->num_samples
> 1 && dst_mt
->num_samples
<= 1) {
1776 /* We are downsampling a non-integer color buffer, so blend.
1778 * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1780 * "If the source formats are integer types or stencil values, a
1781 * single sample's value is selected for each pixel."
1783 * This implies we should not blend in that case.
1785 wm_prog_key
.blend
= true;
1788 /* src_samples and dst_samples are the true sample counts */
1789 wm_prog_key
.src_samples
= src_mt
->num_samples
;
1790 wm_prog_key
.dst_samples
= dst_mt
->num_samples
;
1792 /* tex_samples and rt_samples are the sample counts that are set up in
1795 wm_prog_key
.tex_samples
= params
.src
.num_samples
;
1796 wm_prog_key
.rt_samples
= params
.dst
.num_samples
;
1798 /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
1799 * use to access the source and destination surfaces.
1801 wm_prog_key
.tex_layout
=
1802 compute_msaa_layout_for_pipeline(brw
, params
.src
.num_samples
,
1803 params
.src
.msaa_layout
);
1804 wm_prog_key
.rt_layout
=
1805 compute_msaa_layout_for_pipeline(brw
, params
.dst
.num_samples
,
1806 params
.dst
.msaa_layout
);
1808 /* src_layout and dst_layout indicate the true MSAA layout used by src and
1811 wm_prog_key
.src_layout
= src_mt
->msaa_layout
;
1812 wm_prog_key
.dst_layout
= dst_mt
->msaa_layout
;
1814 /* On gen9+ compressed single sampled buffers carry the same layout type as
1815 * multisampled. The difference is that they can be sampled using normal
1816 * ld message and as render target behave just like non-compressed surface
1817 * from compiler point of view. Therefore override the type in the program
1820 if (brw
->gen
>= 9 && params
.src
.num_samples
<= 1 &&
1821 src_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_CMS
)
1822 wm_prog_key
.src_layout
= INTEL_MSAA_LAYOUT_NONE
;
1823 if (brw
->gen
>= 9 && params
.dst
.num_samples
<= 1 &&
1824 dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_CMS
)
1825 wm_prog_key
.dst_layout
= INTEL_MSAA_LAYOUT_NONE
;
1827 wm_prog_key
.src_tiled_w
= params
.src
.map_stencil_as_y_tiled
;
1828 wm_prog_key
.dst_tiled_w
= params
.dst
.map_stencil_as_y_tiled
;
1829 /* Round floating point values to nearest integer to avoid "off by one texel"
1830 * kind of errors when blitting.
1832 params
.x0
= params
.wm_push_consts
.dst_x0
= roundf(dst_x0
);
1833 params
.y0
= params
.wm_push_consts
.dst_y0
= roundf(dst_y0
);
1834 params
.x1
= params
.wm_push_consts
.dst_x1
= roundf(dst_x1
);
1835 params
.y1
= params
.wm_push_consts
.dst_y1
= roundf(dst_y1
);
1836 params
.wm_push_consts
.rect_grid_x1
=
1837 minify(src_mt
->logical_width0
, src_level
) * wm_prog_key
.x_scale
- 1.0f
;
1838 params
.wm_push_consts
.rect_grid_y1
=
1839 minify(src_mt
->logical_height0
, src_level
) * wm_prog_key
.y_scale
- 1.0f
;
1841 brw_blorp_setup_coord_transform(¶ms
.wm_push_consts
.x_transform
,
1842 src_x0
, src_x1
, dst_x0
, dst_x1
, mirror_x
);
1843 brw_blorp_setup_coord_transform(¶ms
.wm_push_consts
.y_transform
,
1844 src_y0
, src_y1
, dst_y0
, dst_y1
, mirror_y
);
1846 params
.wm_push_consts
.src_z
=
1847 params
.src
.mt
->target
== GL_TEXTURE_3D
? params
.src
.layer
: 0;
1849 if (params
.dst
.num_samples
<= 1 && dst_mt
->num_samples
> 1) {
1850 /* We must expand the rectangle we send through the rendering pipeline,
1851 * to account for the fact that we are mapping the destination region as
1852 * single-sampled when it is in fact multisampled. We must also align
1853 * it to a multiple of the multisampling pattern, because the
1854 * differences between multisampled and single-sampled surface formats
1855 * will mean that pixels are scrambled within the multisampling pattern.
1856 * TODO: what if this makes the coordinates too large?
1858 * Note: this only works if the destination surface uses the IMS layout.
1859 * If it's UMS, then we have no choice but to set up the rendering
1860 * pipeline as multisampled.
1862 assert(dst_mt
->msaa_layout
== INTEL_MSAA_LAYOUT_IMS
);
1863 switch (dst_mt
->num_samples
) {
1865 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1866 params
.y0
= ROUND_DOWN_TO(params
.y0
, 4);
1867 params
.x1
= ALIGN(params
.x1
* 2, 4);
1868 params
.y1
= ALIGN(params
.y1
, 4);
1871 params
.x0
= ROUND_DOWN_TO(params
.x0
* 2, 4);
1872 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1873 params
.x1
= ALIGN(params
.x1
* 2, 4);
1874 params
.y1
= ALIGN(params
.y1
* 2, 4);
1877 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1878 params
.y0
= ROUND_DOWN_TO(params
.y0
* 2, 4);
1879 params
.x1
= ALIGN(params
.x1
* 4, 8);
1880 params
.y1
= ALIGN(params
.y1
* 2, 4);
1883 params
.x0
= ROUND_DOWN_TO(params
.x0
* 4, 8);
1884 params
.y0
= ROUND_DOWN_TO(params
.y0
* 4, 8);
1885 params
.x1
= ALIGN(params
.x1
* 4, 8);
1886 params
.y1
= ALIGN(params
.y1
* 4, 8);
1889 unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
1891 wm_prog_key
.use_kill
= true;
1894 if (params
.dst
.map_stencil_as_y_tiled
) {
1895 /* We must modify the rectangle we send through the rendering pipeline
1896 * (and the size and x/y offset of the destination surface), to account
1897 * for the fact that we are mapping it as Y-tiled when it is in fact
1900 * Both Y tiling and W tiling can be understood as organizations of
1901 * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1902 * is different, but the layout of the 32-byte sub-tiles within the 4k
1903 * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1904 * column-major order). In Y tiling, the sub-tiles are 16 bytes wide
1905 * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1907 * Therefore, to account for the layout differences within the 32-byte
1908 * sub-tiles, we must expand the rectangle so the X coordinates of its
1909 * edges are multiples of 8 (the W sub-tile width), and the Y
1910 * coordinates of its edges are multiples of 4 (the W sub-tile height).
1911 * Then we need to scale the X and Y coordinates of the rectangle to
1912 * account for the differences in aspect ratio between the Y and W
1913 * sub-tiles. We need to modify the layer width and height similarly.
1915 * A correction needs to be applied when MSAA is in use: since
1916 * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1917 * we need to align the Y coordinates to multiples of 8, so that when
1918 * they are divided by two they are still multiples of 4.
1920 * Note: Since the x/y offset of the surface will be applied using the
1921 * SURFACE_STATE command packet, it will be invisible to the swizzling
1922 * code in the shader; therefore it needs to be in a multiple of the
1923 * 32-byte sub-tile size. Fortunately it is, since the sub-tile is 8
1924 * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1925 * buffer), and the miplevel alignment used for stencil buffers is 8
1926 * pixels horizontally and either 4 or 8 pixels vertically (see
1927 * intel_horizontal_texture_alignment_unit() and
1928 * intel_vertical_texture_alignment_unit()).
1930 * Note: Also, since the SURFACE_STATE command packet can only apply
1931 * offsets that are multiples of 4 pixels horizontally and 2 pixels
1932 * vertically, it is important that the offsets will be multiples of
1933 * these sizes after they are converted into Y-tiled coordinates.
1934 * Fortunately they will be, since we know from above that the offsets
1935 * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1936 * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1938 * TODO: what if this makes the coordinates (or the texture size) too
1941 const unsigned x_align
= 8, y_align
= params
.dst
.num_samples
!= 0 ? 8 : 4;
1942 params
.x0
= ROUND_DOWN_TO(params
.x0
, x_align
) * 2;
1943 params
.y0
= ROUND_DOWN_TO(params
.y0
, y_align
) / 2;
1944 params
.x1
= ALIGN(params
.x1
, x_align
) * 2;
1945 params
.y1
= ALIGN(params
.y1
, y_align
) / 2;
1946 params
.dst
.width
= ALIGN(params
.dst
.width
, x_align
) * 2;
1947 params
.dst
.height
= ALIGN(params
.dst
.height
, y_align
) / 2;
1948 params
.dst
.x_offset
*= 2;
1949 params
.dst
.y_offset
/= 2;
1950 wm_prog_key
.use_kill
= true;
1953 if (params
.src
.map_stencil_as_y_tiled
) {
1954 /* We must modify the size and x/y offset of the source surface to
1955 * account for the fact that we are mapping it as Y-tiled when it is in
1958 * See the comments above concerning x/y offset alignment for the
1959 * destination surface.
1961 * TODO: what if this makes the texture size too large?
1963 const unsigned x_align
= 8, y_align
= params
.src
.num_samples
!= 0 ? 8 : 4;
1964 params
.src
.width
= ALIGN(params
.src
.width
, x_align
) * 2;
1965 params
.src
.height
= ALIGN(params
.src
.height
, y_align
) / 2;
1966 params
.src
.x_offset
*= 2;
1967 params
.src
.y_offset
/= 2;
1970 brw_blorp_get_blit_kernel(brw
, ¶ms
, &wm_prog_key
);
1972 params
.src
.swizzle
= src_swizzle
;
1974 brw_blorp_exec(brw
, ¶ms
);
1976 intel_miptree_slice_set_needs_hiz_resolve(dst_mt
, dst_level
, dst_layer
);
1978 if (intel_miptree_is_lossless_compressed(brw
, dst_mt
))
1979 dst_mt
->fast_clear_state
= INTEL_FAST_CLEAR_STATE_UNRESOLVED
;