src/mesa/drivers/dri/i965/brw_blorp_blit.cpp

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "main/context.h"
  25 #include "main/teximage.h"
  26 #include "main/fbobject.h"
  27
  28 #include "compiler/nir/nir_builder.h"
  29
  30 #include "intel_fbo.h"
  31
  32 #include "brw_blorp.h"
  33 #include "brw_context.h"
  34 #include "brw_state.h"
  35 #include "brw_meta_util.h"
  36
  37 #define FILE_DEBUG_FLAG DEBUG_BLORP
  38
  39 static struct intel_mipmap_tree *
  40 find_miptree(GLbitfield buffer_bit, struct intel_renderbuffer *irb)
  41 {
  42    struct intel_mipmap_tree *mt = irb->mt;
  43    if (buffer_bit == GL_STENCIL_BUFFER_BIT && mt->stencil_mt)
  44       mt = mt->stencil_mt;
  45    return mt;
  46 }
  47
  48 static int
  49 blorp_get_texture_swizzle(const struct intel_renderbuffer *irb)
  50 {
  51    return irb->Base.Base._BaseFormat == GL_RGB ?
  52       MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE) :
  53       SWIZZLE_XYZW;
  54 }
  55
  56 static void
  57 do_blorp_blit(struct brw_context *brw, GLbitfield buffer_bit,
  58               struct intel_renderbuffer *src_irb, mesa_format src_format,
  59               struct intel_renderbuffer *dst_irb, mesa_format dst_format,
  60               GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
  61               GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
  62               GLenum filter, bool mirror_x, bool mirror_y)
  63 {
  64    const struct gl_context *ctx = &brw->ctx;
  65
  66    /* Find source/dst miptrees */
  67    struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_irb);
  68    struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_irb);
  69
  70    const bool do_srgb = ctx->Color.sRGBEnabled;
  71
  72    /* Do the blit */
  73    brw_blorp_blit_miptrees(brw,
  74                            src_mt, src_irb->mt_level, src_irb->mt_layer,
  75                            src_format, blorp_get_texture_swizzle(src_irb),
  76                            dst_mt, dst_irb->mt_level, dst_irb->mt_layer,
  77                            dst_format,
  78                            srcX0, srcY0, srcX1, srcY1,
  79                            dstX0, dstY0, dstX1, dstY1,
  80                            filter, mirror_x, mirror_y,
  81                            do_srgb, do_srgb);
  82
  83    dst_irb->need_downsample = true;
  84 }
  85
  86 static bool
  87 try_blorp_blit(struct brw_context *brw,
  88                const struct gl_framebuffer *read_fb,
  89                const struct gl_framebuffer *draw_fb,
  90                GLfloat srcX0, GLfloat srcY0, GLfloat srcX1, GLfloat srcY1,
  91                GLfloat dstX0, GLfloat dstY0, GLfloat dstX1, GLfloat dstY1,
  92                GLenum filter, GLbitfield buffer_bit)
  93 {
  94    struct gl_context *ctx = &brw->ctx;
  95
  96    /* Sync up the state of window system buffers.  We need to do this before
  97     * we go looking for the buffers.
  98     */
  99    intel_prepare_render(brw);
 100
 101    bool mirror_x, mirror_y;
 102    if (brw_meta_mirror_clip_and_scissor(ctx, read_fb, draw_fb,
 103                                         &srcX0, &srcY0, &srcX1, &srcY1,
 104                                         &dstX0, &dstY0, &dstX1, &dstY1,
 105                                         &mirror_x, &mirror_y))
 106       return true;
 107
 108    /* Find buffers */
 109    struct intel_renderbuffer *src_irb;
 110    struct intel_renderbuffer *dst_irb;
 111    struct intel_mipmap_tree *src_mt;
 112    struct intel_mipmap_tree *dst_mt;
 113    switch (buffer_bit) {
 114    case GL_COLOR_BUFFER_BIT:
 115       src_irb = intel_renderbuffer(read_fb->_ColorReadBuffer);
 116       for (unsigned i = 0; i < draw_fb->_NumColorDrawBuffers; ++i) {
 117          dst_irb = intel_renderbuffer(draw_fb->_ColorDrawBuffers[i]);
 118          if (dst_irb)
 119             do_blorp_blit(brw, buffer_bit,
 120                           src_irb, src_irb->Base.Base.Format,
 121                           dst_irb, dst_irb->Base.Base.Format,
 122                           srcX0, srcY0, srcX1, srcY1,
 123                           dstX0, dstY0, dstX1, dstY1,
 124                           filter, mirror_x, mirror_y);
 125       }
 126       break;
 127    case GL_DEPTH_BUFFER_BIT:
 128       src_irb =
 129          intel_renderbuffer(read_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
 130       dst_irb =
 131          intel_renderbuffer(draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer);
 132       src_mt = find_miptree(buffer_bit, src_irb);
 133       dst_mt = find_miptree(buffer_bit, dst_irb);
 134
 135       /* We can't handle format conversions between Z24 and other formats
 136        * since we have to lie about the surface format. See the comments in
 137        * brw_blorp_surface_info::set().
 138        */
 139       if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
 140           (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT))
 141          return false;
 142
 143       do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
 144                     dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
 145                     srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
 146                     filter, mirror_x, mirror_y);
 147       break;
 148    case GL_STENCIL_BUFFER_BIT:
 149       src_irb =
 150          intel_renderbuffer(read_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
 151       dst_irb =
 152          intel_renderbuffer(draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer);
 153       do_blorp_blit(brw, buffer_bit, src_irb, MESA_FORMAT_NONE,
 154                     dst_irb, MESA_FORMAT_NONE, srcX0, srcY0,
 155                     srcX1, srcY1, dstX0, dstY0, dstX1, dstY1,
 156                     filter, mirror_x, mirror_y);
 157       break;
 158    default:
 159       unreachable("not reached");
 160    }
 161
 162    return true;
 163 }
 164
 165 bool
 166 brw_blorp_copytexsubimage(struct brw_context *brw,
 167                           struct gl_renderbuffer *src_rb,
 168                           struct gl_texture_image *dst_image,
 169                           int slice,
 170                           int srcX0, int srcY0,
 171                           int dstX0, int dstY0,
 172                           int width, int height)
 173 {
 174    struct gl_context *ctx = &brw->ctx;
 175    struct intel_renderbuffer *src_irb = intel_renderbuffer(src_rb);
 176    struct intel_texture_image *intel_image = intel_texture_image(dst_image);
 177
 178    /* No pixel transfer operations (zoom, bias, mapping), just a blit */
 179    if (brw->ctx._ImageTransferState)
 180       return false;
 181
 182    /* Sync up the state of window system buffers.  We need to do this before
 183     * we go looking at the src renderbuffer's miptree.
 184     */
 185    intel_prepare_render(brw);
 186
 187    struct intel_mipmap_tree *src_mt = src_irb->mt;
 188    struct intel_mipmap_tree *dst_mt = intel_image->mt;
 189
 190    /* There is support for only up to eight samples. */
 191    if (src_mt->num_samples > 8 || dst_mt->num_samples > 8)
 192       return false;
 193
 194    /* BLORP is only supported from Gen6 onwards. */
 195    if (brw->gen < 6)
 196       return false;
 197
 198    if (_mesa_get_format_base_format(src_rb->Format) !=
 199        _mesa_get_format_base_format(dst_image->TexFormat)) {
 200       return false;
 201    }
 202
 203    /* We can't handle format conversions between Z24 and other formats since
 204     * we have to lie about the surface format.  See the comments in
 205     * brw_blorp_surface_info::set().
 206     */
 207    if ((src_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT) !=
 208        (dst_mt->format == MESA_FORMAT_Z24_UNORM_X8_UINT)) {
 209       return false;
 210    }
 211
 212    if (!brw->format_supported_as_render_target[dst_image->TexFormat])
 213       return false;
 214
 215    /* Source clipping shouldn't be necessary, since copytexsubimage (in
 216     * src/mesa/main/teximage.c) calls _mesa_clip_copytexsubimage() which
 217     * takes care of it.
 218     *
 219     * Destination clipping shouldn't be necessary since the restrictions on
 220     * glCopyTexSubImage prevent the user from specifying a destination rectangle
 221     * that falls outside the bounds of the destination texture.
 222     * See error_check_subtexture_dimensions().
 223     */
 224
 225    int srcY1 = srcY0 + height;
 226    int srcX1 = srcX0 + width;
 227    int dstX1 = dstX0 + width;
 228    int dstY1 = dstY0 + height;
 229
 230    /* Account for the fact that in the system framebuffer, the origin is at
 231     * the lower left.
 232     */
 233    bool mirror_y = false;
 234    if (_mesa_is_winsys_fbo(ctx->ReadBuffer)) {
 235       GLint tmp = src_rb->Height - srcY0;
 236       srcY0 = src_rb->Height - srcY1;
 237       srcY1 = tmp;
 238       mirror_y = true;
 239    }
 240
 241    /* Account for face selection and texture view MinLayer */
 242    int dst_slice = slice + dst_image->TexObject->MinLayer + dst_image->Face;
 243    int dst_level = dst_image->Level + dst_image->TexObject->MinLevel;
 244
 245    brw_blorp_blit_miptrees(brw,
 246                            src_mt, src_irb->mt_level, src_irb->mt_layer,
 247                            src_rb->Format, blorp_get_texture_swizzle(src_irb),
 248                            dst_mt, dst_level, dst_slice,
 249                            dst_image->TexFormat,
 250                            srcX0, srcY0, srcX1, srcY1,
 251                            dstX0, dstY0, dstX1, dstY1,
 252                            GL_NEAREST, false, mirror_y,
 253                            false, false);
 254
 255    /* If we're copying to a packed depth stencil texture and the source
 256     * framebuffer has separate stencil, we need to also copy the stencil data
 257     * over.
 258     */
 259    src_rb = ctx->ReadBuffer->Attachment[BUFFER_STENCIL].Renderbuffer;
 260    if (_mesa_get_format_bits(dst_image->TexFormat, GL_STENCIL_BITS) > 0 &&
 261        src_rb != NULL) {
 262       src_irb = intel_renderbuffer(src_rb);
 263       src_mt = src_irb->mt;
 264
 265       if (src_mt->stencil_mt)
 266          src_mt = src_mt->stencil_mt;
 267       if (dst_mt->stencil_mt)
 268          dst_mt = dst_mt->stencil_mt;
 269
 270       if (src_mt != dst_mt) {
 271          brw_blorp_blit_miptrees(brw,
 272                                  src_mt, src_irb->mt_level, src_irb->mt_layer,
 273                                  src_mt->format,
 274                                  blorp_get_texture_swizzle(src_irb),
 275                                  dst_mt, dst_level, dst_slice,
 276                                  dst_mt->format,
 277                                  srcX0, srcY0, srcX1, srcY1,
 278                                  dstX0, dstY0, dstX1, dstY1,
 279                                  GL_NEAREST, false, mirror_y,
 280                                  false, false);
 281       }
 282    }
 283
 284    return true;
 285 }
 286
 287
 288 GLbitfield
 289 brw_blorp_framebuffer(struct brw_context *brw,
 290                       struct gl_framebuffer *readFb,
 291                       struct gl_framebuffer *drawFb,
 292                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
 293                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
 294                       GLbitfield mask, GLenum filter)
 295 {
 296    /* BLORP is not supported before Gen6. */
 297    if (brw->gen < 6)
 298       return mask;
 299
 300    static GLbitfield buffer_bits[] = {
 301       GL_COLOR_BUFFER_BIT,
 302       GL_DEPTH_BUFFER_BIT,
 303       GL_STENCIL_BUFFER_BIT,
 304    };
 305
 306    for (unsigned int i = 0; i < ARRAY_SIZE(buffer_bits); ++i) {
 307       if ((mask & buffer_bits[i]) &&
 308        try_blorp_blit(brw, readFb, drawFb,
 309                       srcX0, srcY0, srcX1, srcY1,
 310                       dstX0, dstY0, dstX1, dstY1,
 311                       filter, buffer_bits[i])) {
 312          mask &= ~buffer_bits[i];
 313       }
 314    }
 315
 316    return mask;
 317 }
 318
 319
 320 /**
 321  * Enum to specify the order of arguments in a sampler message
 322  */
 323 enum sampler_message_arg
 324 {
 325    SAMPLER_MESSAGE_ARG_U_FLOAT,
 326    SAMPLER_MESSAGE_ARG_V_FLOAT,
 327    SAMPLER_MESSAGE_ARG_U_INT,
 328    SAMPLER_MESSAGE_ARG_V_INT,
 329    SAMPLER_MESSAGE_ARG_R_INT,
 330    SAMPLER_MESSAGE_ARG_SI_INT,
 331    SAMPLER_MESSAGE_ARG_MCS_INT,
 332    SAMPLER_MESSAGE_ARG_ZERO_INT,
 333 };
 334
 335 struct brw_blorp_blit_vars {
 336    /* Input values from brw_blorp_wm_inputs */
 337    nir_variable *v_discard_rect;
 338    nir_variable *v_rect_grid;
 339    nir_variable *v_coord_transform;
 340    nir_variable *v_src_z;
 341
 342    /* gl_FragCoord */
 343    nir_variable *frag_coord;
 344
 345    /* gl_FragColor */
 346    nir_variable *color_out;
 347 };
 348
 349 static void
 350 brw_blorp_blit_vars_init(nir_builder *b, struct brw_blorp_blit_vars *v,
 351                          const struct brw_blorp_blit_prog_key *key)
 352 {
 353     /* Blended and scaled blits never use pixel discard. */
 354     assert(!key->use_kill || !(key->blend && key->blit_scaled));
 355
 356 #define LOAD_INPUT(name, type)\
 357    v->v_##name = nir_variable_create(b->shader, nir_var_shader_in, \
 358                                      type, #name); \
 359    v->v_##name->data.interpolation = INTERP_MODE_FLAT; \
 360    v->v_##name->data.location = VARYING_SLOT_VAR0 + \
 361       offsetof(struct brw_blorp_wm_inputs, name) / (4 * sizeof(float));
 362
 363    LOAD_INPUT(discard_rect, glsl_vec4_type())
 364    LOAD_INPUT(rect_grid, glsl_vec4_type())
 365    LOAD_INPUT(coord_transform, glsl_vec4_type())
 366    LOAD_INPUT(src_z, glsl_uint_type())
 367
 368 #undef LOAD_INPUT
 369
 370    v->frag_coord = nir_variable_create(b->shader, nir_var_shader_in,
 371                                        glsl_vec4_type(), "gl_FragCoord");
 372    v->frag_coord->data.location = VARYING_SLOT_POS;
 373    v->frag_coord->data.origin_upper_left = true;
 374
 375    v->color_out = nir_variable_create(b->shader, nir_var_shader_out,
 376                                       glsl_vec4_type(), "gl_FragColor");
 377    v->color_out->data.location = FRAG_RESULT_COLOR;
 378 }
 379
 380 nir_ssa_def *
 381 blorp_blit_get_frag_coords(nir_builder *b,
 382                            const struct brw_blorp_blit_prog_key *key,
 383                            struct brw_blorp_blit_vars *v)
 384 {
 385    nir_ssa_def *coord = nir_f2i(b, nir_load_var(b, v->frag_coord));
 386
 387    if (key->persample_msaa_dispatch) {
 388       return nir_vec3(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1),
 389          nir_load_system_value(b, nir_intrinsic_load_sample_id, 0));
 390    } else {
 391       return nir_vec2(b, nir_channel(b, coord, 0), nir_channel(b, coord, 1));
 392    }
 393 }
 394
 395 /**
 396  * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
 397  * coordinates.
 398  */
 399 nir_ssa_def *
 400 blorp_blit_apply_transform(nir_builder *b, nir_ssa_def *src_pos,
 401                            struct brw_blorp_blit_vars *v)
 402 {
 403    nir_ssa_def *coord_transform = nir_load_var(b, v->v_coord_transform);
 404
 405    nir_ssa_def *offset = nir_vec2(b, nir_channel(b, coord_transform, 1),
 406                                      nir_channel(b, coord_transform, 3));
 407    nir_ssa_def *mul = nir_vec2(b, nir_channel(b, coord_transform, 0),
 408                                   nir_channel(b, coord_transform, 2));
 409
 410    return nir_ffma(b, src_pos, mul, offset);
 411 }
 412
 413 static inline void
 414 blorp_nir_discard_if_outside_rect(nir_builder *b, nir_ssa_def *pos,
 415                                   struct brw_blorp_blit_vars *v)
 416 {
 417    nir_ssa_def *c0, *c1, *c2, *c3;
 418    nir_ssa_def *discard_rect = nir_load_var(b, v->v_discard_rect);
 419    nir_ssa_def *dst_x0 = nir_channel(b, discard_rect, 0);
 420    nir_ssa_def *dst_x1 = nir_channel(b, discard_rect, 1);
 421    nir_ssa_def *dst_y0 = nir_channel(b, discard_rect, 2);
 422    nir_ssa_def *dst_y1 = nir_channel(b, discard_rect, 3);
 423
 424    c0 = nir_ult(b, nir_channel(b, pos, 0), dst_x0);
 425    c1 = nir_uge(b, nir_channel(b, pos, 0), dst_x1);
 426    c2 = nir_ult(b, nir_channel(b, pos, 1), dst_y0);
 427    c3 = nir_uge(b, nir_channel(b, pos, 1), dst_y1);
 428
 429    nir_ssa_def *oob = nir_ior(b, nir_ior(b, c0, c1), nir_ior(b, c2, c3));
 430
 431    nir_intrinsic_instr *discard =
 432       nir_intrinsic_instr_create(b->shader, nir_intrinsic_discard_if);
 433    discard->src[0] = nir_src_for_ssa(oob);
 434    nir_builder_instr_insert(b, &discard->instr);
 435 }
 436
 437 static nir_tex_instr *
 438 blorp_create_nir_tex_instr(nir_builder *b, struct brw_blorp_blit_vars *v,
 439                            nir_texop op, nir_ssa_def *pos, unsigned num_srcs,
 440                            enum brw_reg_type dst_type)
 441 {
 442    nir_tex_instr *tex = nir_tex_instr_create(b->shader, num_srcs);
 443
 444    tex->op = op;
 445
 446    switch (dst_type) {
 447    case BRW_REGISTER_TYPE_F:
 448       tex->dest_type = nir_type_float;
 449       break;
 450    case BRW_REGISTER_TYPE_D:
 451       tex->dest_type = nir_type_int;
 452       break;
 453    case BRW_REGISTER_TYPE_UD:
 454       tex->dest_type = nir_type_uint;
 455       break;
 456    default:
 457       unreachable("Invalid texture return type");
 458    }
 459
 460    tex->is_array = false;
 461    tex->is_shadow = false;
 462
 463    /* Blorp only has one texture and it's bound at unit 0 */
 464    tex->texture = NULL;
 465    tex->sampler = NULL;
 466    tex->texture_index = 0;
 467    tex->sampler_index = 0;
 468
 469    /* To properly handle 3-D and 2-D array textures, we pull the Z component
 470     * from an input.  TODO: This is a bit magic; we should probably make this
 471     * more explicit in the future.
 472     */
 473    assert(pos->num_components >= 2);
 474    pos = nir_vec3(b, nir_channel(b, pos, 0), nir_channel(b, pos, 1),
 475                      nir_load_var(b, v->v_src_z));
 476
 477    tex->src[0].src_type = nir_tex_src_coord;
 478    tex->src[0].src = nir_src_for_ssa(pos);
 479    tex->coord_components = 3;
 480
 481    nir_ssa_dest_init(&tex->instr, &tex->dest, 4, 32, NULL);
 482
 483    return tex;
 484 }
 485
 486 static nir_ssa_def *
 487 blorp_nir_tex(nir_builder *b, struct brw_blorp_blit_vars *v,
 488               nir_ssa_def *pos, enum brw_reg_type dst_type)
 489 {
 490    nir_tex_instr *tex =
 491       blorp_create_nir_tex_instr(b, v, nir_texop_tex, pos, 2, dst_type);
 492
 493    assert(pos->num_components == 2);
 494    tex->sampler_dim = GLSL_SAMPLER_DIM_2D;
 495    tex->src[1].src_type = nir_tex_src_lod;
 496    tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
 497
 498    nir_builder_instr_insert(b, &tex->instr);
 499
 500    return &tex->dest.ssa;
 501 }
 502
 503 static nir_ssa_def *
 504 blorp_nir_txf(nir_builder *b, struct brw_blorp_blit_vars *v,
 505               nir_ssa_def *pos, enum brw_reg_type dst_type)
 506 {
 507    nir_tex_instr *tex =
 508       blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 2, dst_type);
 509
 510    tex->sampler_dim = GLSL_SAMPLER_DIM_3D;
 511    tex->src[1].src_type = nir_tex_src_lod;
 512    tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
 513
 514    nir_builder_instr_insert(b, &tex->instr);
 515
 516    return &tex->dest.ssa;
 517 }
 518
 519 static nir_ssa_def *
 520 blorp_nir_txf_ms(nir_builder *b, struct brw_blorp_blit_vars *v,
 521                  nir_ssa_def *pos, nir_ssa_def *mcs, enum brw_reg_type dst_type)
 522 {
 523    nir_tex_instr *tex =
 524       blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms, pos,
 525                                  mcs != NULL ? 3 : 2, dst_type);
 526
 527    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
 528
 529    tex->src[1].src_type = nir_tex_src_ms_index;
 530    if (pos->num_components == 2) {
 531       tex->src[1].src = nir_src_for_ssa(nir_imm_int(b, 0));
 532    } else {
 533       assert(pos->num_components == 3);
 534       tex->src[1].src = nir_src_for_ssa(nir_channel(b, pos, 2));
 535    }
 536
 537    if (mcs) {
 538       tex->src[2].src_type = nir_tex_src_ms_mcs;
 539       tex->src[2].src = nir_src_for_ssa(mcs);
 540    }
 541
 542    nir_builder_instr_insert(b, &tex->instr);
 543
 544    return &tex->dest.ssa;
 545 }
 546
 547 static nir_ssa_def *
 548 blorp_nir_txf_ms_mcs(nir_builder *b, struct brw_blorp_blit_vars *v, nir_ssa_def *pos)
 549 {
 550    nir_tex_instr *tex =
 551       blorp_create_nir_tex_instr(b, v, nir_texop_txf_ms_mcs,
 552                                  pos, 1, BRW_REGISTER_TYPE_D);
 553
 554    tex->sampler_dim = GLSL_SAMPLER_DIM_MS;
 555
 556    nir_builder_instr_insert(b, &tex->instr);
 557
 558    return &tex->dest.ssa;
 559 }
 560
 561 static nir_ssa_def *
 562 nir_mask_shift_or(struct nir_builder *b, nir_ssa_def *dst, nir_ssa_def *src,
 563                   uint32_t src_mask, int src_left_shift)
 564 {
 565    nir_ssa_def *masked = nir_iand(b, src, nir_imm_int(b, src_mask));
 566
 567    nir_ssa_def *shifted;
 568    if (src_left_shift > 0) {
 569       shifted = nir_ishl(b, masked, nir_imm_int(b, src_left_shift));
 570    } else if (src_left_shift < 0) {
 571       shifted = nir_ushr(b, masked, nir_imm_int(b, -src_left_shift));
 572    } else {
 573       assert(src_left_shift == 0);
 574       shifted = masked;
 575    }
 576
 577    return nir_ior(b, dst, shifted);
 578 }
 579
 580 /**
 581  * Emit code to compensate for the difference between Y and W tiling.
 582  *
 583  * This code modifies the X and Y coordinates according to the formula:
 584  *
 585  *   (X', Y', S') = detile(W-MAJOR, tile(Y-MAJOR, X, Y, S))
 586  *
 587  * (See brw_blorp_build_nir_shader).
 588  */
 589 static inline nir_ssa_def *
 590 blorp_nir_retile_y_to_w(nir_builder *b, nir_ssa_def *pos)
 591 {
 592    assert(pos->num_components == 2);
 593    nir_ssa_def *x_Y = nir_channel(b, pos, 0);
 594    nir_ssa_def *y_Y = nir_channel(b, pos, 1);
 595
 596    /* Given X and Y coordinates that describe an address using Y tiling,
 597     * translate to the X and Y coordinates that describe the same address
 598     * using W tiling.
 599     *
 600     * If we break down the low order bits of X and Y, using a
 601     * single letter to represent each low-order bit:
 602     *
 603     *   X = A << 7 | 0bBCDEFGH
 604     *   Y = J << 5 | 0bKLMNP                                       (1)
 605     *
 606     * Then we can apply the Y tiling formula to see the memory offset being
 607     * addressed:
 608     *
 609     *   offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH       (2)
 610     *
 611     * If we apply the W detiling formula to this memory location, that the
 612     * corresponding X' and Y' coordinates are:
 613     *
 614     *   X' = A << 6 | 0bBCDPFH                                     (3)
 615     *   Y' = J << 6 | 0bKLMNEG
 616     *
 617     * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
 618     * we need to make the following computation:
 619     *
 620     *   X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1         (4)
 621     *   Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
 622     */
 623    nir_ssa_def *x_W = nir_imm_int(b, 0);
 624    x_W = nir_mask_shift_or(b, x_W, x_Y, 0xfffffff4, -1);
 625    x_W = nir_mask_shift_or(b, x_W, y_Y, 0x1, 2);
 626    x_W = nir_mask_shift_or(b, x_W, x_Y, 0x1, 0);
 627
 628    nir_ssa_def *y_W = nir_imm_int(b, 0);
 629    y_W = nir_mask_shift_or(b, y_W, y_Y, 0xfffffffe, 1);
 630    y_W = nir_mask_shift_or(b, y_W, x_Y, 0x8, -2);
 631    y_W = nir_mask_shift_or(b, y_W, x_Y, 0x2, -1);
 632
 633    return nir_vec2(b, x_W, y_W);
 634 }
 635
 636 /**
 637  * Emit code to compensate for the difference between Y and W tiling.
 638  *
 639  * This code modifies the X and Y coordinates according to the formula:
 640  *
 641  *   (X', Y', S') = detile(Y-MAJOR, tile(W-MAJOR, X, Y, S))
 642  *
 643  * (See brw_blorp_build_nir_shader).
 644  */
 645 static inline nir_ssa_def *
 646 blorp_nir_retile_w_to_y(nir_builder *b, nir_ssa_def *pos)
 647 {
 648    assert(pos->num_components == 2);
 649    nir_ssa_def *x_W = nir_channel(b, pos, 0);
 650    nir_ssa_def *y_W = nir_channel(b, pos, 1);
 651
 652    /* Applying the same logic as above, but in reverse, we obtain the
 653     * formulas:
 654     *
 655     * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
 656     * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
 657     */
 658    nir_ssa_def *x_Y = nir_imm_int(b, 0);
 659    x_Y = nir_mask_shift_or(b, x_Y, x_W, 0xfffffffa, 1);
 660    x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x2, 2);
 661    x_Y = nir_mask_shift_or(b, x_Y, y_W, 0x1, 1);
 662    x_Y = nir_mask_shift_or(b, x_Y, x_W, 0x1, 0);
 663
 664    nir_ssa_def *y_Y = nir_imm_int(b, 0);
 665    y_Y = nir_mask_shift_or(b, y_Y, y_W, 0xfffffffc, -1);
 666    y_Y = nir_mask_shift_or(b, y_Y, x_W, 0x4, -2);
 667
 668    return nir_vec2(b, x_Y, y_Y);
 669 }
 670
 671 /**
 672  * Emit code to compensate for the difference between MSAA and non-MSAA
 673  * surfaces.
 674  *
 675  * This code modifies the X and Y coordinates according to the formula:
 676  *
 677  *   (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
 678  *
 679  * (See brw_blorp_blit_program).
 680  */
 681 static inline nir_ssa_def *
 682 blorp_nir_encode_msaa(nir_builder *b, nir_ssa_def *pos,
 683                       unsigned num_samples, enum isl_msaa_layout layout)
 684 {
 685    assert(pos->num_components == 2 || pos->num_components == 3);
 686
 687    switch (layout) {
 688    case ISL_MSAA_LAYOUT_NONE:
 689       assert(pos->num_components == 2);
 690       return pos;
 691    case ISL_MSAA_LAYOUT_ARRAY:
 692       /* No translation needed */
 693       return pos;
 694    case ISL_MSAA_LAYOUT_INTERLEAVED: {
 695       nir_ssa_def *x_in = nir_channel(b, pos, 0);
 696       nir_ssa_def *y_in = nir_channel(b, pos, 1);
 697       nir_ssa_def *s_in = pos->num_components == 2 ? nir_imm_int(b, 0) :
 698                                                      nir_channel(b, pos, 2);
 699
 700       nir_ssa_def *x_out = nir_imm_int(b, 0);
 701       nir_ssa_def *y_out = nir_imm_int(b, 0);
 702       switch (num_samples) {
 703       case 2:
 704       case 4:
 705          /* encode_msaa(2, IMS, X, Y, S) = (X', Y', 0)
 706           *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
 707           *         Y' = Y
 708           *
 709           * encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
 710           *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
 711           *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
 712           */
 713          x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 1);
 714          x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
 715          x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
 716          if (num_samples == 2) {
 717             y_out = y_in;
 718          } else {
 719             y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
 720             y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
 721             y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
 722          }
 723          break;
 724
 725       case 8:
 726          /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
 727           *   where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
 728           *              | (X & 0b1)
 729           *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
 730           */
 731          x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
 732          x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
 733          x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
 734          x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
 735          y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 1);
 736          y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
 737          y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
 738          break;
 739
 740       case 16:
 741          /* encode_msaa(16, IMS, X, Y, S) = (X', Y', 0)
 742           *   where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
 743           *              | (X & 0b1)
 744           *         Y' = (Y & ~0b1) << 2 | (S & 0b1000) >> 1 (S & 0b10)
 745           *              | (Y & 0b1)
 746           */
 747          x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffe, 2);
 748          x_out = nir_mask_shift_or(b, x_out, s_in, 0x4, 0);
 749          x_out = nir_mask_shift_or(b, x_out, s_in, 0x1, 1);
 750          x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
 751          y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffe, 2);
 752          y_out = nir_mask_shift_or(b, y_out, s_in, 0x8, -1);
 753          y_out = nir_mask_shift_or(b, y_out, s_in, 0x2, 0);
 754          y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
 755          break;
 756
 757       default:
 758          unreachable("Invalid number of samples for IMS layout");
 759       }
 760
 761       return nir_vec2(b, x_out, y_out);
 762    }
 763
 764    default:
 765       unreachable("Invalid MSAA layout");
 766    }
 767 }
 768
 769 /**
 770  * Emit code to compensate for the difference between MSAA and non-MSAA
 771  * surfaces.
 772  *
 773  * This code modifies the X and Y coordinates according to the formula:
 774  *
 775  *   (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
 776  *
 777  * (See brw_blorp_blit_program).
 778  */
 779 static inline nir_ssa_def *
 780 blorp_nir_decode_msaa(nir_builder *b, nir_ssa_def *pos,
 781                       unsigned num_samples, enum isl_msaa_layout layout)
 782 {
 783    assert(pos->num_components == 2 || pos->num_components == 3);
 784
 785    switch (layout) {
 786    case ISL_MSAA_LAYOUT_NONE:
 787       /* No translation necessary, and S should already be zero. */
 788       assert(pos->num_components == 2);
 789       return pos;
 790    case ISL_MSAA_LAYOUT_ARRAY:
 791       /* No translation necessary. */
 792       return pos;
 793    case ISL_MSAA_LAYOUT_INTERLEAVED: {
 794       assert(pos->num_components == 2);
 795
 796       nir_ssa_def *x_in = nir_channel(b, pos, 0);
 797       nir_ssa_def *y_in = nir_channel(b, pos, 1);
 798
 799       nir_ssa_def *x_out = nir_imm_int(b, 0);
 800       nir_ssa_def *y_out = nir_imm_int(b, 0);
 801       nir_ssa_def *s_out = nir_imm_int(b, 0);
 802       switch (num_samples) {
 803       case 2:
 804       case 4:
 805          /* decode_msaa(2, IMS, X, Y, 0) = (X', Y', S)
 806           *   where X' = (X & ~0b11) >> 1 | (X & 0b1)
 807           *         S = (X & 0b10) >> 1
 808           *
 809           * decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
 810           *   where X' = (X & ~0b11) >> 1 | (X & 0b1)
 811           *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 812           *         S = (Y & 0b10) | (X & 0b10) >> 1
 813           */
 814          x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffffc, -1);
 815          x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
 816          if (num_samples == 2) {
 817             y_out = y_in;
 818             s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
 819          } else {
 820             y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
 821             y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
 822             s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
 823             s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
 824          }
 825          break;
 826
 827       case 8:
 828          /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
 829           *   where X' = (X & ~0b111) >> 2 | (X & 0b1)
 830           *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 831           *         S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
 832           */
 833          x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
 834          x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
 835          y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffffc, -1);
 836          y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
 837          s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
 838          s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
 839          s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
 840          break;
 841
 842       case 16:
 843          /* decode_msaa(16, IMS, X, Y, 0) = (X', Y', S)
 844           *   where X' = (X & ~0b111) >> 2 | (X & 0b1)
 845           *         Y' = (Y & ~0b111) >> 2 | (Y & 0b1)
 846           *         S = (Y & 0b100) << 1 | (X & 0b100) |
 847           *             (Y & 0b10) | (X & 0b10) >> 1
 848           */
 849          x_out = nir_mask_shift_or(b, x_out, x_in, 0xfffffff8, -2);
 850          x_out = nir_mask_shift_or(b, x_out, x_in, 0x1, 0);
 851          y_out = nir_mask_shift_or(b, y_out, y_in, 0xfffffff8, -2);
 852          y_out = nir_mask_shift_or(b, y_out, y_in, 0x1, 0);
 853          s_out = nir_mask_shift_or(b, s_out, y_in, 0x4, 1);
 854          s_out = nir_mask_shift_or(b, s_out, x_in, 0x4, 0);
 855          s_out = nir_mask_shift_or(b, s_out, y_in, 0x2, 0);
 856          s_out = nir_mask_shift_or(b, s_out, x_in, 0x2, -1);
 857          break;
 858
 859       default:
 860          unreachable("Invalid number of samples for IMS layout");
 861       }
 862
 863       return nir_vec3(b, x_out, y_out, s_out);
 864    }
 865
 866    default:
 867       unreachable("Invalid MSAA layout");
 868    }
 869 }
 870
 871 /**
 872  * Count the number of trailing 1 bits in the given value.  For example:
 873  *
 874  * count_trailing_one_bits(0) == 0
 875  * count_trailing_one_bits(7) == 3
 876  * count_trailing_one_bits(11) == 2
 877  */
 878 static inline int count_trailing_one_bits(unsigned value)
 879 {
 880 #ifdef HAVE___BUILTIN_CTZ
 881    return __builtin_ctz(~value);
 882 #else
 883    return _mesa_bitcount(value & ~(value + 1));
 884 #endif
 885 }
 886
 887 static nir_ssa_def *
 888 blorp_nir_manual_blend_average(nir_builder *b, struct brw_blorp_blit_vars *v,
 889                                nir_ssa_def *pos, unsigned tex_samples,
 890                                enum isl_aux_usage tex_aux_usage,
 891                                enum brw_reg_type dst_type)
 892 {
 893    /* If non-null, this is the outer-most if statement */
 894    nir_if *outer_if = NULL;
 895
 896    nir_variable *color =
 897       nir_local_variable_create(b->impl, glsl_vec4_type(), "color");
 898
 899    nir_ssa_def *mcs = NULL;
 900    if (tex_aux_usage == ISL_AUX_USAGE_MCS)
 901       mcs = blorp_nir_txf_ms_mcs(b, v, pos);
 902
 903    /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
 904     *
 905     *   result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
 906     *
 907     * This ensures that when all samples have the same value, no numerical
 908     * precision is lost, since each addition operation always adds two equal
 909     * values, and summing two equal floating point values does not lose
 910     * precision.
 911     *
 912     * We perform this computation by treating the texture_data array as a
 913     * stack and performing the following operations:
 914     *
 915     * - push sample 0 onto stack
 916     * - push sample 1 onto stack
 917     * - add top two stack entries
 918     * - push sample 2 onto stack
 919     * - push sample 3 onto stack
 920     * - add top two stack entries
 921     * - add top two stack entries
 922     * - divide top stack entry by 4
 923     *
 924     * Note that after pushing sample i onto the stack, the number of add
 925     * operations we do is equal to the number of trailing 1 bits in i.  This
 926     * works provided the total number of samples is a power of two, which it
 927     * always is for i965.
 928     *
 929     * For integer formats, we replace the add operations with average
 930     * operations and skip the final division.
 931     */
 932    nir_ssa_def *texture_data[5];
 933    unsigned stack_depth = 0;
 934    for (unsigned i = 0; i < tex_samples; ++i) {
 935       assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */
 936
 937       /* Push sample i onto the stack */
 938       assert(stack_depth < ARRAY_SIZE(texture_data));
 939
 940       nir_ssa_def *ms_pos = nir_vec3(b, nir_channel(b, pos, 0),
 941                                         nir_channel(b, pos, 1),
 942                                         nir_imm_int(b, i));
 943       texture_data[stack_depth++] = blorp_nir_txf_ms(b, v, ms_pos, mcs, dst_type);
 944
 945       if (i == 0 && tex_aux_usage == ISL_AUX_USAGE_MCS) {
 946          /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
 947           * suggests an optimization:
 948           *
 949           *     "A simple optimization with probable large return in
 950           *     performance is to compare the MCS value to zero (indicating
 951           *     all samples are on sample slice 0), and sample only from
 952           *     sample slice 0 using ld2dss if MCS is zero."
 953           *
 954           * Note that in the case where the MCS value is zero, sampling from
 955           * sample slice 0 using ld2dss and sampling from sample 0 using
 956           * ld2dms are equivalent (since all samples are on sample slice 0).
 957           * Since we have already sampled from sample 0, all we need to do is
 958           * skip the remaining fetches and averaging if MCS is zero.
 959           */
 960          nir_ssa_def *mcs_zero =
 961             nir_ieq(b, nir_channel(b, mcs, 0), nir_imm_int(b, 0));
 962          if (tex_samples == 16) {
 963             mcs_zero = nir_iand(b, mcs_zero,
 964                nir_ieq(b, nir_channel(b, mcs, 1), nir_imm_int(b, 0)));
 965          }
 966
 967          nir_if *if_stmt = nir_if_create(b->shader);
 968          if_stmt->condition = nir_src_for_ssa(mcs_zero);
 969          nir_cf_node_insert(b->cursor, &if_stmt->cf_node);
 970
 971          b->cursor = nir_after_cf_list(&if_stmt->then_list);
 972          nir_store_var(b, color, texture_data[0], 0xf);
 973
 974          b->cursor = nir_after_cf_list(&if_stmt->else_list);
 975          outer_if = if_stmt;
 976       }
 977
 978       for (int j = 0; j < count_trailing_one_bits(i); j++) {
 979          assert(stack_depth >= 2);
 980          --stack_depth;
 981
 982          assert(dst_type == BRW_REGISTER_TYPE_F);
 983          texture_data[stack_depth - 1] =
 984             nir_fadd(b, texture_data[stack_depth - 1],
 985                         texture_data[stack_depth]);
 986       }
 987    }
 988
 989    /* We should have just 1 sample on the stack now. */
 990    assert(stack_depth == 1);
 991
 992    texture_data[0] = nir_fmul(b, texture_data[0],
 993                               nir_imm_float(b, 1.0 / tex_samples));
 994
 995    nir_store_var(b, color, texture_data[0], 0xf);
 996
 997    if (outer_if)
 998       b->cursor = nir_after_cf_node(&outer_if->cf_node);
 999
1000    return nir_load_var(b, color);
1001 }
1002
1003 static inline nir_ssa_def *
1004 nir_imm_vec2(nir_builder *build, float x, float y)
1005 {
1006    nir_const_value v;
1007
1008    memset(&v, 0, sizeof(v));
1009    v.f32[0] = x;
1010    v.f32[1] = y;
1011
1012    return nir_build_imm(build, 4, 32, v);
1013 }
1014
1015 static nir_ssa_def *
1016 blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
1017                                 unsigned tex_samples,
1018                                 const brw_blorp_blit_prog_key *key,
1019                                 struct brw_blorp_blit_vars *v)
1020 {
1021    nir_ssa_def *pos_xy = nir_channels(b, pos, 0x3);
1022    nir_ssa_def *rect_grid = nir_load_var(b, v->v_rect_grid);
1023    nir_ssa_def *scale = nir_imm_vec2(b, key->x_scale, key->y_scale);
1024
1025    /* Translate coordinates to lay out the samples in a rectangular  grid
1026     * roughly corresponding to sample locations.
1027     */
1028    pos_xy = nir_fmul(b, pos_xy, scale);
1029    /* Adjust coordinates so that integers represent pixel centers rather
1030     * than pixel edges.
1031     */
1032    pos_xy = nir_fadd(b, pos_xy, nir_imm_float(b, -0.5));
1033    /* Clamp the X, Y texture coordinates to properly handle the sampling of
1034     * texels on texture edges.
1035     */
1036    pos_xy = nir_fmin(b, nir_fmax(b, pos_xy, nir_imm_float(b, 0.0)),
1037                         nir_vec2(b, nir_channel(b, rect_grid, 0),
1038                                     nir_channel(b, rect_grid, 1)));
1039
1040    /* Store the fractional parts to be used as bilinear interpolation
1041     * coefficients.
1042     */
1043    nir_ssa_def *frac_xy = nir_ffract(b, pos_xy);
1044    /* Round the float coordinates down to nearest integer */
1045    pos_xy = nir_fdiv(b, nir_ftrunc(b, pos_xy), scale);
1046
1047    nir_ssa_def *tex_data[4];
1048    for (unsigned i = 0; i < 4; ++i) {
1049       float sample_off_x = (float)(i & 0x1) / key->x_scale;
1050       float sample_off_y = (float)((i >> 1) & 0x1) / key->y_scale;
1051       nir_ssa_def *sample_off = nir_imm_vec2(b, sample_off_x, sample_off_y);
1052
1053       nir_ssa_def *sample_coords = nir_fadd(b, pos_xy, sample_off);
1054       nir_ssa_def *sample_coords_int = nir_f2i(b, sample_coords);
1055
1056       /* The MCS value we fetch has to match up with the pixel that we're
1057        * sampling from. Since we sample from different pixels in each
1058        * iteration of this "for" loop, the call to mcs_fetch() should be
1059        * here inside the loop after computing the pixel coordinates.
1060        */
1061       nir_ssa_def *mcs = NULL;
1062       if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
1063          mcs = blorp_nir_txf_ms_mcs(b, v, sample_coords_int);
1064
1065       /* Compute sample index and map the sample index to a sample number.
1066        * Sample index layout shows the numbering of slots in a rectangular
1067        * grid of samples with in a pixel. Sample number layout shows the
1068        * rectangular grid of samples roughly corresponding to the real sample
1069        * locations with in a pixel.
1070        * In case of 4x MSAA, layout of sample indices matches the layout of
1071        * sample numbers:
1072        *           ---------
1073        *           | 0 | 1 |
1074        *           ---------
1075        *           | 2 | 3 |
1076        *           ---------
1077        *
1078        * In case of 8x MSAA the two layouts don't match.
1079        * sample index layout :  ---------    sample number layout :  ---------
1080        *                        | 0 | 1 |                            | 3 | 7 |
1081        *                        ---------                            ---------
1082        *                        | 2 | 3 |                            | 5 | 0 |
1083        *                        ---------                            ---------
1084        *                        | 4 | 5 |                            | 1 | 2 |
1085        *                        ---------                            ---------
1086        *                        | 6 | 7 |                            | 4 | 6 |
1087        *                        ---------                            ---------
1088        *
1089        * Fortunately, this can be done fairly easily as:
1090        * S' = (0x17306425 >> (S * 4)) & 0xf
1091        *
1092        * In the case of 16x MSAA the two layouts don't match.
1093        * Sample index layout:                Sample number layout:
1094        * ---------------------               ---------------------
1095        * |  0 |  1 |  2 |  3 |               | 15 | 10 |  9 |  7 |
1096        * ---------------------               ---------------------
1097        * |  4 |  5 |  6 |  7 |               |  4 |  1 |  3 | 13 |
1098        * ---------------------               ---------------------
1099        * |  8 |  9 | 10 | 11 |               | 12 |  2 |  0 |  6 |
1100        * ---------------------               ---------------------
1101        * | 12 | 13 | 14 | 15 |               | 11 |  8 |  5 | 14 |
1102        * ---------------------               ---------------------
1103        *
1104        * This is equivalent to
1105        * S' = (0xe58b602cd31479af >> (S * 4)) & 0xf
1106        */
1107       nir_ssa_def *frac = nir_ffract(b, sample_coords);
1108       nir_ssa_def *sample =
1109          nir_fdot2(b, frac, nir_imm_vec2(b, key->x_scale,
1110                                             key->x_scale * key->y_scale));
1111       sample = nir_f2i(b, sample);
1112
1113       if (tex_samples == 8) {
1114          sample = nir_iand(b, nir_ishr(b, nir_imm_int(b, 0x64210573),
1115                                        nir_ishl(b, sample, nir_imm_int(b, 2))),
1116                            nir_imm_int(b, 0xf));
1117       } else if (tex_samples == 16) {
1118          nir_ssa_def *sample_low =
1119             nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xd31479af),
1120                                  nir_ishl(b, sample, nir_imm_int(b, 2))),
1121                      nir_imm_int(b, 0xf));
1122          nir_ssa_def *sample_high =
1123             nir_iand(b, nir_ishr(b, nir_imm_int(b, 0xe58b602c),
1124                                  nir_ishl(b, nir_iadd(b, sample,
1125                                                       nir_imm_int(b, -8)),
1126                                           nir_imm_int(b, 2))),
1127                      nir_imm_int(b, 0xf));
1128
1129          sample = nir_bcsel(b, nir_ilt(b, sample, nir_imm_int(b, 8)),
1130                             sample_low, sample_high);
1131       }
1132       nir_ssa_def *pos_ms = nir_vec3(b, nir_channel(b, sample_coords_int, 0),
1133                                         nir_channel(b, sample_coords_int, 1),
1134                                         sample);
1135       tex_data[i] = blorp_nir_txf_ms(b, v, pos_ms, mcs, key->texture_data_type);
1136    }
1137
1138    nir_ssa_def *frac_x = nir_channel(b, frac_xy, 0);
1139    nir_ssa_def *frac_y = nir_channel(b, frac_xy, 1);
1140    return nir_flrp(b, nir_flrp(b, tex_data[0], tex_data[1], frac_x),
1141                       nir_flrp(b, tex_data[2], tex_data[3], frac_x),
1142                       frac_y);
1143 }
1144
1145 /**
1146  * Generator for WM programs used in BLORP blits.
1147  *
1148  * The bulk of the work done by the WM program is to wrap and unwrap the
1149  * coordinate transformations used by the hardware to store surfaces in
1150  * memory.  The hardware transforms a pixel location (X, Y, S) (where S is the
1151  * sample index for a multisampled surface) to a memory offset by the
1152  * following formulas:
1153  *
1154  *   offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
1155  *   (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
1156  *
1157  * For a single-sampled surface, or for a multisampled surface using
1158  * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
1159  * function:
1160  *
1161  *   encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1162  *   decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
1163  *   encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1164  *   decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
1165  *
1166  * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1167  * embeds the sample number into bit 1 of the X and Y coordinates:
1168  *
1169  *   encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1170  *     where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1171  *           Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
1172  *   decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1173  *     where X' = (X & ~0b11) >> 1 | (X & 0b1)
1174  *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1175  *           S = (Y & 0b10) | (X & 0b10) >> 1
1176  *
1177  * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
1178  * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
1179  * the Y coordinate:
1180  *
1181  *   encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1182  *     where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
1183  *           Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1184  *   decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1185  *     where X' = (X & ~0b111) >> 2 | (X & 0b1)
1186  *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1187  *           S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1188  *
1189  * For X tiling, tile() combines together the low-order bits of the X and Y
1190  * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
1191  * bytes wide and 8 rows high:
1192  *
1193  *   tile(x_tiled, X, Y, S) = A
1194  *     where A = tile_num << 12 | offset
1195  *           tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
1196  *           offset = (Y' & 0b111) << 9
1197  *                    | (X & 0b111111111)
1198  *           X' = X * cpp
1199  *           Y' = Y + S * qpitch
1200  *   detile(x_tiled, A) = (X, Y, S)
1201  *     where X = X' / cpp
1202  *           Y = Y' % qpitch
1203  *           S = Y' / qpitch
1204  *           Y' = (tile_num / tile_pitch) << 3
1205  *                | (A & 0b111000000000) >> 9
1206  *           X' = (tile_num % tile_pitch) << 9
1207  *                | (A & 0b111111111)
1208  *
1209  * (In all tiling formulas, cpp is the number of bytes occupied by a single
1210  * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
1211  * to fill the width of the surface, and qpitch is the spacing (in rows)
1212  * between array slices).
1213  *
1214  * For Y tiling, tile() combines together the low-order bits of the X and Y
1215  * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
1216  * bytes wide and 32 rows high:
1217  *
1218  *   tile(y_tiled, X, Y, S) = A
1219  *     where A = tile_num << 12 | offset
1220  *           tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
1221  *           offset = (X' & 0b1110000) << 5
1222  *                    | (Y' & 0b11111) << 4
1223  *                    | (X' & 0b1111)
1224  *           X' = X * cpp
1225  *           Y' = Y + S * qpitch
1226  *   detile(y_tiled, A) = (X, Y, S)
1227  *     where X = X' / cpp
1228  *           Y = Y' % qpitch
1229  *           S = Y' / qpitch
1230  *           Y' = (tile_num / tile_pitch) << 5
1231  *                | (A & 0b111110000) >> 4
1232  *           X' = (tile_num % tile_pitch) << 7
1233  *                | (A & 0b111000000000) >> 5
1234  *                | (A & 0b1111)
1235  *
1236  * For W tiling, tile() combines together the low-order bits of the X and Y
1237  * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
1238  * bytes wide and 64 rows high (note that W tiling is only used for stencil
1239  * buffers, which always have cpp = 1 and S=0):
1240  *
1241  *   tile(w_tiled, X, Y, S) = A
1242  *     where A = tile_num << 12 | offset
1243  *           tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
1244  *           offset = (X' & 0b111000) << 6
1245  *                    | (Y' & 0b111100) << 3
1246  *                    | (X' & 0b100) << 2
1247  *                    | (Y' & 0b10) << 2
1248  *                    | (X' & 0b10) << 1
1249  *                    | (Y' & 0b1) << 1
1250  *                    | (X' & 0b1)
1251  *           X' = X * cpp = X
1252  *           Y' = Y + S * qpitch
1253  *   detile(w_tiled, A) = (X, Y, S)
1254  *     where X = X' / cpp = X'
1255  *           Y = Y' % qpitch = Y'
1256  *           S = Y / qpitch = 0
1257  *           Y' = (tile_num / tile_pitch) << 6
1258  *                | (A & 0b111100000) >> 3
1259  *                | (A & 0b1000) >> 2
1260  *                | (A & 0b10) >> 1
1261  *           X' = (tile_num % tile_pitch) << 6
1262  *                | (A & 0b111000000000) >> 6
1263  *                | (A & 0b10000) >> 2
1264  *                | (A & 0b100) >> 1
1265  *                | (A & 0b1)
1266  *
1267  * Finally, for a non-tiled surface, tile() simply combines together the X and
1268  * Y coordinates in the natural way:
1269  *
1270  *   tile(untiled, X, Y, S) = A
1271  *     where A = Y * pitch + X'
1272  *           X' = X * cpp
1273  *           Y' = Y + S * qpitch
1274  *   detile(untiled, A) = (X, Y, S)
1275  *     where X = X' / cpp
1276  *           Y = Y' % qpitch
1277  *           S = Y' / qpitch
1278  *           X' = A % pitch
1279  *           Y' = A / pitch
1280  *
1281  * (In these formulas, pitch is the number of bytes occupied by a single row
1282  * of samples).
1283  */
1284 static nir_shader *
1285 brw_blorp_build_nir_shader(struct brw_context *brw,
1286                            const brw_blorp_blit_prog_key *key)
1287 {
1288    nir_ssa_def *src_pos, *dst_pos, *color;
1289
1290    /* Sanity checks */
1291    if (key->dst_tiled_w && key->rt_samples > 1) {
1292       /* If the destination image is W tiled and multisampled, then the thread
1293        * must be dispatched once per sample, not once per pixel.  This is
1294        * necessary because after conversion between W and Y tiling, there's no
1295        * guarantee that all samples corresponding to a single pixel will still
1296        * be together.
1297        */
1298       assert(key->persample_msaa_dispatch);
1299    }
1300
1301    if (key->blend) {
1302       /* We are blending, which means we won't have an opportunity to
1303        * translate the tiling and sample count for the texture surface.  So
1304        * the surface state for the texture must be configured with the correct
1305        * tiling and sample count.
1306        */
1307       assert(!key->src_tiled_w);
1308       assert(key->tex_samples == key->src_samples);
1309       assert(key->tex_layout == key->src_layout);
1310       assert(key->tex_samples > 0);
1311    }
1312
1313    if (key->persample_msaa_dispatch) {
1314       /* It only makes sense to do persample dispatch if the render target is
1315        * configured as multisampled.
1316        */
1317       assert(key->rt_samples > 0);
1318    }
1319
1320    /* Make sure layout is consistent with sample count */
1321    assert((key->tex_layout == ISL_MSAA_LAYOUT_NONE) ==
1322           (key->tex_samples <= 1));
1323    assert((key->rt_layout == ISL_MSAA_LAYOUT_NONE) ==
1324           (key->rt_samples <= 1));
1325    assert((key->src_layout == ISL_MSAA_LAYOUT_NONE) ==
1326           (key->src_samples <= 1));
1327    assert((key->dst_layout == ISL_MSAA_LAYOUT_NONE) ==
1328           (key->dst_samples <= 1));
1329
1330    nir_builder b;
1331    nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
1332
1333    struct brw_blorp_blit_vars v;
1334    brw_blorp_blit_vars_init(&b, &v, key);
1335
1336    dst_pos = blorp_blit_get_frag_coords(&b, key, &v);
1337
1338    /* Render target and texture hardware don't support W tiling until Gen8. */
1339    const bool rt_tiled_w = false;
1340    const bool tex_tiled_w = brw->gen >= 8 && key->src_tiled_w;
1341
1342    /* The address that data will be written to is determined by the
1343     * coordinates supplied to the WM thread and the tiling and sample count of
1344     * the render target, according to the formula:
1345     *
1346     * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
1347     *
1348     * If the actual tiling and sample count of the destination surface are not
1349     * the same as the configuration of the render target, then these
1350     * coordinates are wrong and we have to adjust them to compensate for the
1351     * difference.
1352     */
1353    if (rt_tiled_w != key->dst_tiled_w ||
1354        key->rt_samples != key->dst_samples ||
1355        key->rt_layout != key->dst_layout) {
1356       dst_pos = blorp_nir_encode_msaa(&b, dst_pos, key->rt_samples,
1357                                       key->rt_layout);
1358       /* Now (X, Y, S) = detile(rt_tiling, offset) */
1359       if (rt_tiled_w != key->dst_tiled_w)
1360          dst_pos = blorp_nir_retile_y_to_w(&b, dst_pos);
1361       /* Now (X, Y, S) = detile(rt_tiling, offset) */
1362       dst_pos = blorp_nir_decode_msaa(&b, dst_pos, key->dst_samples,
1363                                       key->dst_layout);
1364    }
1365
1366    /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
1367     *
1368     * That is: X, Y and S now contain the true coordinates and sample index of
1369     * the data that the WM thread should output.
1370     *
1371     * If we need to kill pixels that are outside the destination rectangle,
1372     * now is the time to do it.
1373     */
1374    if (key->use_kill) {
1375       assert(!(key->blend && key->blit_scaled));
1376       blorp_nir_discard_if_outside_rect(&b, dst_pos, &v);
1377    }
1378
1379    src_pos = blorp_blit_apply_transform(&b, nir_i2f(&b, dst_pos), &v);
1380    if (dst_pos->num_components == 3) {
1381       /* The sample coordinate is an integer that we want left alone but
1382        * blorp_blit_apply_transform() blindly applies the transform to all
1383        * three coordinates.  Grab the original sample index.
1384        */
1385       src_pos = nir_vec3(&b, nir_channel(&b, src_pos, 0),
1386                              nir_channel(&b, src_pos, 1),
1387                              nir_channel(&b, dst_pos, 2));
1388    }
1389
1390    /* If the source image is not multisampled, then we want to fetch sample
1391     * number 0, because that's the only sample there is.
1392     */
1393    if (key->src_samples == 0)
1394       src_pos = nir_channels(&b, src_pos, 0x3);
1395
1396    /* X, Y, and S are now the coordinates of the pixel in the source image
1397     * that we want to texture from.  Exception: if we are blending, then S is
1398     * irrelevant, because we are going to fetch all samples.
1399     */
1400    if (key->blend && !key->blit_scaled) {
1401       /* Resolves (effecively) use texelFetch, so we need integers and we
1402        * don't care about the sample index if we got one.
1403        */
1404       src_pos = nir_f2i(&b, nir_channels(&b, src_pos, 0x3));
1405
1406       if (brw->gen == 6) {
1407          /* Because gen6 only supports 4x interleved MSAA, we can do all the
1408           * blending we need with a single linear-interpolated texture lookup
1409           * at the center of the sample. The texture coordinates to be odd
1410           * integers so that they correspond to the center of a 2x2 block
1411           * representing the four samples that maxe up a pixel.  So we need
1412           * to multiply our X and Y coordinates each by 2 and then add 1.
1413           */
1414          src_pos = nir_ishl(&b, src_pos, nir_imm_int(&b, 1));
1415          src_pos = nir_iadd(&b, src_pos, nir_imm_int(&b, 1));
1416          src_pos = nir_i2f(&b, src_pos);
1417          color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
1418       } else {
1419          /* Gen7+ hardware doesn't automaticaly blend. */
1420          color = blorp_nir_manual_blend_average(&b, &v, src_pos, key->src_samples,
1421                                                 key->tex_aux_usage,
1422                                                 key->texture_data_type);
1423       }
1424    } else if (key->blend && key->blit_scaled) {
1425       assert(!key->use_kill);
1426       color = blorp_nir_manual_blend_bilinear(&b, src_pos, key->src_samples, key, &v);
1427    } else {
1428       if (key->bilinear_filter) {
1429          color = blorp_nir_tex(&b, &v, src_pos, key->texture_data_type);
1430       } else {
1431          /* We're going to use texelFetch, so we need integers */
1432          if (src_pos->num_components == 2) {
1433             src_pos = nir_f2i(&b, src_pos);
1434          } else {
1435             assert(src_pos->num_components == 3);
1436             src_pos = nir_vec3(&b, nir_channel(&b, nir_f2i(&b, src_pos), 0),
1437                                    nir_channel(&b, nir_f2i(&b, src_pos), 1),
1438                                    nir_channel(&b, src_pos, 2));
1439          }
1440
1441          /* We aren't blending, which means we just want to fetch a single
1442           * sample from the source surface.  The address that we want to fetch
1443           * from is related to the X, Y and S values according to the formula:
1444           *
1445           * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
1446           *
1447           * If the actual tiling and sample count of the source surface are
1448           * not the same as the configuration of the texture, then we need to
1449           * adjust the coordinates to compensate for the difference.
1450           */
1451          if (tex_tiled_w != key->src_tiled_w ||
1452              key->tex_samples != key->src_samples ||
1453              key->tex_layout != key->src_layout) {
1454             src_pos = blorp_nir_encode_msaa(&b, src_pos, key->src_samples,
1455                                             key->src_layout);
1456             /* Now (X, Y, S) = detile(src_tiling, offset) */
1457             if (tex_tiled_w != key->src_tiled_w)
1458                src_pos = blorp_nir_retile_w_to_y(&b, src_pos);
1459             /* Now (X, Y, S) = detile(tex_tiling, offset) */
1460             src_pos = blorp_nir_decode_msaa(&b, src_pos, key->tex_samples,
1461                                             key->tex_layout);
1462          }
1463
1464          /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
1465           *
1466           * In other words: X, Y, and S now contain values which, when passed to
1467           * the texturing unit, will cause data to be read from the correct
1468           * memory location.  So we can fetch the texel now.
1469           */
1470          if (key->src_samples == 0) {
1471             color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type);
1472          } else {
1473             nir_ssa_def *mcs = NULL;
1474             if (key->tex_aux_usage == ISL_AUX_USAGE_MCS)
1475                mcs = blorp_nir_txf_ms_mcs(&b, &v, src_pos);
1476
1477             color = blorp_nir_txf_ms(&b, &v, src_pos, mcs, key->texture_data_type);
1478          }
1479       }
1480    }
1481
1482    nir_store_var(&b, v.color_out, color, 0xf);
1483
1484    return b.shader;
1485 }
1486
1487 static void
1488 brw_blorp_get_blit_kernel(struct brw_context *brw,
1489                           struct brw_blorp_params *params,
1490                           const struct brw_blorp_blit_prog_key *prog_key)
1491 {
1492    if (brw_search_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
1493                         prog_key, sizeof(*prog_key),
1494                         &params->wm_prog_kernel, &params->wm_prog_data))
1495       return;
1496
1497    const unsigned *program;
1498    unsigned program_size;
1499    struct brw_blorp_prog_data prog_data;
1500
1501    /* Try and compile with NIR first.  If that fails, fall back to the old
1502     * method of building shaders manually.
1503     */
1504    nir_shader *nir = brw_blorp_build_nir_shader(brw, prog_key);
1505    struct brw_wm_prog_key wm_key;
1506    brw_blorp_init_wm_prog_key(&wm_key);
1507    wm_key.tex.compressed_multisample_layout_mask =
1508       prog_key->tex_aux_usage == ISL_AUX_USAGE_MCS;
1509    wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
1510    wm_key.multisample_fbo = prog_key->rt_samples > 1;
1511
1512    program = brw_blorp_compile_nir_shader(brw, nir, &wm_key, false,
1513                                           &prog_data, &program_size);
1514
1515    brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
1516                     prog_key, sizeof(*prog_key),
1517                     program, program_size,
1518                     &prog_data, sizeof(prog_data),
1519                     &params->wm_prog_kernel, &params->wm_prog_data);
1520 }
1521
1522 static void
1523 brw_blorp_setup_coord_transform(struct brw_blorp_coord_transform *xform,
1524                                 GLfloat src0, GLfloat src1,
1525                                 GLfloat dst0, GLfloat dst1,
1526                                 bool mirror)
1527 {
1528    float scale = (src1 - src0) / (dst1 - dst0);
1529    if (!mirror) {
1530       /* When not mirroring a coordinate (say, X), we need:
1531        *   src_x - src_x0 = (dst_x - dst_x0 + 0.5) * scale
1532        * Therefore:
1533        *   src_x = src_x0 + (dst_x - dst_x0 + 0.5) * scale
1534        *
1535        * blorp program uses "round toward zero" to convert the
1536        * transformed floating point coordinates to integer coordinates,
1537        * whereas the behaviour we actually want is "round to nearest",
1538        * so 0.5 provides the necessary correction.
1539        */
1540       xform->multiplier = scale;
1541       xform->offset = src0 + (-dst0 + 0.5f) * scale;
1542    } else {
1543       /* When mirroring X we need:
1544        *   src_x - src_x0 = dst_x1 - dst_x - 0.5
1545        * Therefore:
1546        *   src_x = src_x0 + (dst_x1 -dst_x - 0.5) * scale
1547        */
1548       xform->multiplier = -scale;
1549       xform->offset = src0 + (dst1 - 0.5f) * scale;
1550    }
1551 }
1552
1553 static enum isl_msaa_layout
1554 get_isl_msaa_layout(unsigned samples, enum intel_msaa_layout layout)
1555 {
1556    if (samples > 1) {
1557       switch (layout) {
1558       case INTEL_MSAA_LAYOUT_NONE:
1559          return ISL_MSAA_LAYOUT_NONE;
1560       case INTEL_MSAA_LAYOUT_IMS:
1561          return ISL_MSAA_LAYOUT_INTERLEAVED;
1562       case INTEL_MSAA_LAYOUT_UMS:
1563       case INTEL_MSAA_LAYOUT_CMS:
1564          return ISL_MSAA_LAYOUT_ARRAY;
1565       default:
1566          unreachable("Invalid MSAA layout");
1567       }
1568    } else {
1569       return ISL_MSAA_LAYOUT_NONE;
1570    }
1571 }
1572
1573 /**
1574  * Convert an swizzle enumeration (i.e. SWIZZLE_X) to one of the Gen7.5+
1575  * "Shader Channel Select" enumerations (i.e. HSW_SCS_RED).  The mappings are
1576  *
1577  * SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W, SWIZZLE_ZERO, SWIZZLE_ONE
1578  *         0          1          2          3             4            5
1579  *         4          5          6          7             0            1
1580  *   SCS_RED, SCS_GREEN,  SCS_BLUE, SCS_ALPHA,     SCS_ZERO,     SCS_ONE
1581  *
1582  * which is simply adding 4 then modding by 8 (or anding with 7).
1583  *
1584  * We then may need to apply workarounds for textureGather hardware bugs.
1585  */
1586 static enum isl_channel_select
1587 swizzle_to_scs(GLenum swizzle)
1588 {
1589    return (enum isl_channel_select)((swizzle + 4) & 7);
1590 }
1591
1592 static void
1593 surf_convert_to_single_slice(struct brw_context *brw,
1594                              struct brw_blorp_surface_info *info)
1595 {
1596    /* This only makes sense for a single level and array slice */
1597    assert(info->view.levels == 1 && info->view.array_len == 1);
1598
1599    /* Just bail if we have nothing to do. */
1600    if (info->surf.dim == ISL_SURF_DIM_2D &&
1601        info->view.base_level == 0 && info->view.base_array_layer == 0 &&
1602        info->surf.levels == 0 && info->surf.logical_level0_px.array_len == 0)
1603       return;
1604
1605    uint32_t x_offset_sa, y_offset_sa;
1606    blorp_get_image_offset_sa(&brw->isl_dev, &info->surf, info->view.base_level,
1607                              info->view.base_array_layer,
1608                              &x_offset_sa, &y_offset_sa);
1609
1610    isl_tiling_get_intratile_offset_sa(&brw->isl_dev, info->surf.tiling,
1611                                       info->view.format, info->surf.row_pitch,
1612                                       x_offset_sa, y_offset_sa,
1613                                       &info->bo_offset,
1614                                       &info->tile_x_sa, &info->tile_y_sa);
1615
1616    /* TODO: Once this file gets converted to C, we shouls just use designated
1617     * initializers.
1618     */
1619    struct isl_surf_init_info init_info = isl_surf_init_info();
1620
1621    init_info.dim = ISL_SURF_DIM_2D;
1622    init_info.format = ISL_FORMAT_R8_UINT;
1623    init_info.width =
1624       minify(info->surf.logical_level0_px.width, info->view.base_level);
1625    init_info.height =
1626       minify(info->surf.logical_level0_px.height, info->view.base_level);
1627    init_info.depth = 1;
1628    init_info.levels = 1;
1629    init_info.array_len = 1;
1630    init_info.samples = info->surf.samples;
1631    init_info.min_pitch = info->surf.row_pitch;
1632    init_info.usage = info->surf.usage;
1633    init_info.tiling_flags = 1 << info->surf.tiling;
1634
1635    isl_surf_init_s(&brw->isl_dev, &info->surf, &init_info);
1636    assert(info->surf.row_pitch == init_info.min_pitch);
1637
1638    /* The view is also different now. */
1639    info->view.base_level = 0;
1640    info->view.levels = 1;
1641    info->view.base_array_layer = 0;
1642    info->view.array_len = 1;
1643 }
1644
1645 static void
1646 surf_fake_interleaved_msaa(struct brw_context *brw,
1647                            struct brw_blorp_surface_info *info)
1648 {
1649    assert(info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED);
1650
1651    /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1652    surf_convert_to_single_slice(brw, info);
1653
1654    info->surf.logical_level0_px = info->surf.phys_level0_sa;
1655    info->surf.samples = 1;
1656    info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
1657 }
1658
1659 static void
1660 surf_retile_w_to_y(struct brw_context *brw,
1661                    struct brw_blorp_surface_info *info)
1662 {
1663    assert(info->surf.tiling == ISL_TILING_W);
1664
1665    /* First, we need to convert it to a simple 1-level 1-layer 2-D surface */
1666    surf_convert_to_single_slice(brw, info);
1667
1668    /* On gen7+, we don't have interleaved multisampling for color render
1669     * targets so we have to fake it.
1670     *
1671     * TODO: Are we sure we don't also need to fake it on gen6?
1672     */
1673    if (brw->gen > 6 && info->surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
1674       info->surf.logical_level0_px = info->surf.phys_level0_sa;
1675       info->surf.samples = 1;
1676       info->surf.msaa_layout = ISL_MSAA_LAYOUT_NONE;
1677    }
1678
1679    if (brw->gen == 6) {
1680       /* Gen6 stencil buffers have a very large alignment coming in from the
1681        * miptree.  It's out-of-bounds for what the surface state can handle.
1682        * Since we have a single layer and level, it doesn't really matter as
1683        * long as we don't pass a bogus value into isl_surf_fill_state().
1684        */
1685       info->surf.image_alignment_el = isl_extent3d(4, 2, 1);
1686    }
1687
1688    /* Now that we've converted everything to a simple 2-D surface with only
1689     * one miplevel, we can go about retiling it.
1690     */
1691    const unsigned x_align = 8, y_align = info->surf.samples != 0 ? 8 : 4;
1692    info->surf.tiling = ISL_TILING_Y0;
1693    info->surf.logical_level0_px.width =
1694       ALIGN(info->surf.logical_level0_px.width, x_align) * 2;
1695    info->surf.logical_level0_px.height =
1696       ALIGN(info->surf.logical_level0_px.height, y_align) / 2;
1697    info->tile_x_sa *= 2;
1698    info->tile_y_sa /= 2;
1699 }
1700
1701 /**
1702  * Note: if the src (or dst) is a 2D multisample array texture on Gen7+ using
1703  * INTEL_MSAA_LAYOUT_UMS or INTEL_MSAA_LAYOUT_CMS, src_layer (dst_layer) is
1704  * the physical layer holding sample 0.  So, for example, if
1705  * src_mt->num_samples == 4, then logical layer n corresponds to src_layer ==
1706  * 4*n.
1707  */
1708 void
1709 brw_blorp_blit_miptrees(struct brw_context *brw,
1710                         struct intel_mipmap_tree *src_mt,
1711                         unsigned src_level, unsigned src_layer,
1712                         mesa_format src_format, int src_swizzle,
1713                         struct intel_mipmap_tree *dst_mt,
1714                         unsigned dst_level, unsigned dst_layer,
1715                         mesa_format dst_format,
1716                         float src_x0, float src_y0,
1717                         float src_x1, float src_y1,
1718                         float dst_x0, float dst_y0,
1719                         float dst_x1, float dst_y1,
1720                         GLenum filter, bool mirror_x, bool mirror_y,
1721                         bool decode_srgb, bool encode_srgb)
1722 {
1723    /* Get ready to blit.  This includes depth resolving the src and dst
1724     * buffers if necessary.  Note: it's not necessary to do a color resolve on
1725     * the destination buffer because we use the standard render path to render
1726     * to destination color buffers, and the standard render path is
1727     * fast-color-aware.
1728     */
1729    intel_miptree_resolve_color(brw, src_mt, INTEL_MIPTREE_IGNORE_CCS_E);
1730    intel_miptree_slice_resolve_depth(brw, src_mt, src_level, src_layer);
1731    intel_miptree_slice_resolve_depth(brw, dst_mt, dst_level, dst_layer);
1732
1733    intel_miptree_prepare_mcs(brw, dst_mt);
1734
1735    DBG("%s from %dx %s mt %p %d %d (%f,%f) (%f,%f)"
1736        "to %dx %s mt %p %d %d (%f,%f) (%f,%f) (flip %d,%d)\n",
1737        __func__,
1738        src_mt->num_samples, _mesa_get_format_name(src_mt->format), src_mt,
1739        src_level, src_layer, src_x0, src_y0, src_x1, src_y1,
1740        dst_mt->num_samples, _mesa_get_format_name(dst_mt->format), dst_mt,
1741        dst_level, dst_layer, dst_x0, dst_y0, dst_x1, dst_y1,
1742        mirror_x, mirror_y);
1743
1744    if (!decode_srgb && _mesa_get_format_color_encoding(src_format) == GL_SRGB)
1745       src_format = _mesa_get_srgb_format_linear(src_format);
1746
1747    if (!encode_srgb && _mesa_get_format_color_encoding(dst_format) == GL_SRGB)
1748       dst_format = _mesa_get_srgb_format_linear(dst_format);
1749
1750    struct brw_blorp_params params;
1751    brw_blorp_params_init(&params);
1752
1753    brw_blorp_surface_info_init(brw, &params.src, src_mt, src_level,
1754                                src_layer, src_format, false);
1755    brw_blorp_surface_info_init(brw, &params.dst, dst_mt, dst_level,
1756                                dst_layer, dst_format, true);
1757
1758    /* When doing a multisample resolve of a GL_LUMINANCE32F or GL_INTENSITY32F
1759     * texture, the above code configures the source format for L32_FLOAT or
1760     * I32_FLOAT, and the destination format for R32_FLOAT.  On Sandy Bridge,
1761     * the SAMPLE message appears to handle multisampled L32_FLOAT and
1762     * I32_FLOAT textures incorrectly, resulting in blocky artifacts.  So work
1763     * around the problem by using a source format of R32_FLOAT.  This
1764     * shouldn't affect rendering correctness, since the destination format is
1765     * R32_FLOAT, so only the contents of the red channel matters.
1766     */
1767    if (brw->gen == 6 &&
1768        params.src.surf.samples > 1 && params.dst.surf.samples <= 1 &&
1769        src_mt->format == dst_mt->format &&
1770        params.dst.view.format == ISL_FORMAT_R32_FLOAT) {
1771       params.src.view.format = params.dst.view.format;
1772    }
1773
1774    struct brw_blorp_blit_prog_key wm_prog_key;
1775    memset(&wm_prog_key, 0, sizeof(wm_prog_key));
1776
1777    /* texture_data_type indicates the register type that should be used to
1778     * manipulate texture data.
1779     */
1780    switch (_mesa_get_format_datatype(src_mt->format)) {
1781    case GL_UNSIGNED_NORMALIZED:
1782    case GL_SIGNED_NORMALIZED:
1783    case GL_FLOAT:
1784       wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1785       break;
1786    case GL_UNSIGNED_INT:
1787       if (src_mt->format == MESA_FORMAT_S_UINT8) {
1788          /* We process stencil as though it's an unsigned normalized color */
1789          wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1790       } else {
1791          wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_UD;
1792       }
1793       break;
1794    case GL_INT:
1795       wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_D;
1796       break;
1797    default:
1798       unreachable("Unrecognized blorp format");
1799    }
1800
1801    /* Scaled blitting or not. */
1802    wm_prog_key.blit_scaled =
1803       ((dst_x1 - dst_x0) == (src_x1 - src_x0) &&
1804        (dst_y1 - dst_y0) == (src_y1 - src_y0)) ? false : true;
1805
1806    /* Scaling factors used for bilinear filtering in multisample scaled
1807     * blits.
1808     */
1809    if (src_mt->num_samples == 16)
1810       wm_prog_key.x_scale = 4.0f;
1811    else
1812       wm_prog_key.x_scale = 2.0f;
1813    wm_prog_key.y_scale = src_mt->num_samples / wm_prog_key.x_scale;
1814
1815    if (filter == GL_LINEAR &&
1816        params.src.surf.samples <= 1 && params.dst.surf.samples <= 1)
1817       wm_prog_key.bilinear_filter = true;
1818
1819    GLenum base_format = _mesa_get_format_base_format(src_mt->format);
1820    if (base_format != GL_DEPTH_COMPONENT && /* TODO: what about depth/stencil? */
1821        base_format != GL_STENCIL_INDEX &&
1822        !_mesa_is_format_integer(src_mt->format) &&
1823        src_mt->num_samples > 1 && dst_mt->num_samples <= 1) {
1824       /* We are downsampling a non-integer color buffer, so blend.
1825        *
1826        * Regarding integer color buffers, the OpenGL ES 3.2 spec says:
1827        *
1828        *    "If the source formats are integer types or stencil values, a
1829        *    single sample's value is selected for each pixel."
1830        *
1831        * This implies we should not blend in that case.
1832        */
1833       wm_prog_key.blend = true;
1834    }
1835
1836    /* src_samples and dst_samples are the true sample counts */
1837    wm_prog_key.src_samples = src_mt->num_samples;
1838    wm_prog_key.dst_samples = dst_mt->num_samples;
1839
1840    wm_prog_key.tex_aux_usage = params.src.aux_usage;
1841
1842    /* src_layout and dst_layout indicate the true MSAA layout used by src and
1843     * dst.
1844     */
1845    wm_prog_key.src_layout = get_isl_msaa_layout(src_mt->num_samples,
1846                                                 src_mt->msaa_layout);
1847    wm_prog_key.dst_layout = get_isl_msaa_layout(dst_mt->num_samples,
1848                                                 dst_mt->msaa_layout);
1849
1850    /* Round floating point values to nearest integer to avoid "off by one texel"
1851     * kind of errors when blitting.
1852     */
1853    params.x0 = params.wm_inputs.discard_rect.x0 = roundf(dst_x0);
1854    params.y0 = params.wm_inputs.discard_rect.y0 = roundf(dst_y0);
1855    params.x1 = params.wm_inputs.discard_rect.x1 = roundf(dst_x1);
1856    params.y1 = params.wm_inputs.discard_rect.y1 = roundf(dst_y1);
1857
1858    params.wm_inputs.rect_grid.x1 =
1859       minify(src_mt->logical_width0, src_level) * wm_prog_key.x_scale - 1.0f;
1860    params.wm_inputs.rect_grid.y1 =
1861       minify(src_mt->logical_height0, src_level) * wm_prog_key.y_scale - 1.0f;
1862
1863    brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[0],
1864                                    src_x0, src_x1, dst_x0, dst_x1, mirror_x);
1865    brw_blorp_setup_coord_transform(&params.wm_inputs.coord_transform[1],
1866                                    src_y0, src_y1, dst_y0, dst_y1, mirror_y);
1867
1868    /* For some texture types, we need to pass the layer through the sampler. */
1869    params.wm_inputs.src_z = params.src.z_offset;
1870
1871    if (brw->gen > 6 &&
1872        params.dst.surf.msaa_layout == ISL_MSAA_LAYOUT_INTERLEAVED) {
1873       assert(params.dst.surf.samples > 1);
1874
1875       /* We must expand the rectangle we send through the rendering pipeline,
1876        * to account for the fact that we are mapping the destination region as
1877        * single-sampled when it is in fact multisampled.  We must also align
1878        * it to a multiple of the multisampling pattern, because the
1879        * differences between multisampled and single-sampled surface formats
1880        * will mean that pixels are scrambled within the multisampling pattern.
1881        * TODO: what if this makes the coordinates too large?
1882        *
1883        * Note: this only works if the destination surface uses the IMS layout.
1884        * If it's UMS, then we have no choice but to set up the rendering
1885        * pipeline as multisampled.
1886        */
1887       switch (params.dst.surf.samples) {
1888       case 2:
1889          params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1890          params.y0 = ROUND_DOWN_TO(params.y0, 4);
1891          params.x1 = ALIGN(params.x1 * 2, 4);
1892          params.y1 = ALIGN(params.y1, 4);
1893          break;
1894       case 4:
1895          params.x0 = ROUND_DOWN_TO(params.x0 * 2, 4);
1896          params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1897          params.x1 = ALIGN(params.x1 * 2, 4);
1898          params.y1 = ALIGN(params.y1 * 2, 4);
1899          break;
1900       case 8:
1901          params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1902          params.y0 = ROUND_DOWN_TO(params.y0 * 2, 4);
1903          params.x1 = ALIGN(params.x1 * 4, 8);
1904          params.y1 = ALIGN(params.y1 * 2, 4);
1905          break;
1906       case 16:
1907          params.x0 = ROUND_DOWN_TO(params.x0 * 4, 8);
1908          params.y0 = ROUND_DOWN_TO(params.y0 * 4, 8);
1909          params.x1 = ALIGN(params.x1 * 4, 8);
1910          params.y1 = ALIGN(params.y1 * 4, 8);
1911          break;
1912       default:
1913          unreachable("Unrecognized sample count in brw_blorp_blit_params ctor");
1914       }
1915
1916       surf_fake_interleaved_msaa(brw, &params.dst);
1917
1918       wm_prog_key.use_kill = true;
1919    }
1920
1921    if (params.dst.surf.tiling == ISL_TILING_W) {
1922       /* We must modify the rectangle we send through the rendering pipeline
1923        * (and the size and x/y offset of the destination surface), to account
1924        * for the fact that we are mapping it as Y-tiled when it is in fact
1925        * W-tiled.
1926        *
1927        * Both Y tiling and W tiling can be understood as organizations of
1928        * 32-byte sub-tiles; within each 32-byte sub-tile, the layout of pixels
1929        * is different, but the layout of the 32-byte sub-tiles within the 4k
1930        * tile is the same (8 sub-tiles across by 16 sub-tiles down, in
1931        * column-major order).  In Y tiling, the sub-tiles are 16 bytes wide
1932        * and 2 rows high; in W tiling, they are 8 bytes wide and 4 rows high.
1933        *
1934        * Therefore, to account for the layout differences within the 32-byte
1935        * sub-tiles, we must expand the rectangle so the X coordinates of its
1936        * edges are multiples of 8 (the W sub-tile width), and its Y
1937        * coordinates of its edges are multiples of 4 (the W sub-tile height).
1938        * Then we need to scale the X and Y coordinates of the rectangle to
1939        * account for the differences in aspect ratio between the Y and W
1940        * sub-tiles.  We need to modify the layer width and height similarly.
1941        *
1942        * A correction needs to be applied when MSAA is in use: since
1943        * INTEL_MSAA_LAYOUT_IMS uses an interleaving pattern whose height is 4,
1944        * we need to align the Y coordinates to multiples of 8, so that when
1945        * they are divided by two they are still multiples of 4.
1946        *
1947        * Note: Since the x/y offset of the surface will be applied using the
1948        * SURFACE_STATE command packet, it will be invisible to the swizzling
1949        * code in the shader; therefore it needs to be in a multiple of the
1950        * 32-byte sub-tile size.  Fortunately it is, since the sub-tile is 8
1951        * pixels wide and 4 pixels high (when viewed as a W-tiled stencil
1952        * buffer), and the miplevel alignment used for stencil buffers is 8
1953        * pixels horizontally and either 4 or 8 pixels vertically (see
1954        * intel_horizontal_texture_alignment_unit() and
1955        * intel_vertical_texture_alignment_unit()).
1956        *
1957        * Note: Also, since the SURFACE_STATE command packet can only apply
1958        * offsets that are multiples of 4 pixels horizontally and 2 pixels
1959        * vertically, it is important that the offsets will be multiples of
1960        * these sizes after they are converted into Y-tiled coordinates.
1961        * Fortunately they will be, since we know from above that the offsets
1962        * are a multiple of the 32-byte sub-tile size, and in Y-tiled
1963        * coordinates the sub-tile is 16 pixels wide and 2 pixels high.
1964        *
1965        * TODO: what if this makes the coordinates (or the texture size) too
1966        * large?
1967        */
1968       const unsigned x_align = 8, y_align = params.dst.surf.samples != 0 ? 8 : 4;
1969       params.x0 = ROUND_DOWN_TO(params.x0, x_align) * 2;
1970       params.y0 = ROUND_DOWN_TO(params.y0, y_align) / 2;
1971       params.x1 = ALIGN(params.x1, x_align) * 2;
1972       params.y1 = ALIGN(params.y1, y_align) / 2;
1973
1974       /* Retile the surface to Y-tiled */
1975       surf_retile_w_to_y(brw, &params.dst);
1976
1977       wm_prog_key.dst_tiled_w = true;
1978       wm_prog_key.use_kill = true;
1979
1980       if (params.dst.surf.samples > 1) {
1981          /* If the destination surface is a W-tiled multisampled stencil
1982           * buffer that we're mapping as Y tiled, then we need to arrange for
1983           * the WM program to run once per sample rather than once per pixel,
1984           * because the memory layout of related samples doesn't match between
1985           * W and Y tiling.
1986           */
1987          wm_prog_key.persample_msaa_dispatch = true;
1988       }
1989    }
1990
1991    if (brw->gen < 8 && params.src.surf.tiling == ISL_TILING_W) {
1992       /* On Haswell and earlier, we have to fake W-tiled sources as Y-tiled.
1993        * Broadwell adds support for sampling from stencil.
1994        *
1995        * See the comments above concerning x/y offset alignment for the
1996        * destination surface.
1997        *
1998        * TODO: what if this makes the texture size too large?
1999        */
2000       surf_retile_w_to_y(brw, &params.src);
2001
2002       wm_prog_key.src_tiled_w = true;
2003    }
2004
2005    /* tex_samples and rt_samples are the sample counts that are set up in
2006     * SURFACE_STATE.
2007     */
2008    wm_prog_key.tex_samples = params.src.surf.samples;
2009    wm_prog_key.rt_samples  = params.dst.surf.samples;
2010
2011    /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
2012     * use to access the source and destination surfaces.
2013     */
2014    wm_prog_key.tex_layout = params.src.surf.msaa_layout;
2015    wm_prog_key.rt_layout = params.dst.surf.msaa_layout;
2016
2017    if (params.src.surf.samples > 0 && params.dst.surf.samples > 1) {
2018       /* We are blitting from a multisample buffer to a multisample buffer, so
2019        * we must preserve samples within a pixel.  This means we have to
2020        * arrange for the WM program to run once per sample rather than once
2021        * per pixel.
2022        */
2023       wm_prog_key.persample_msaa_dispatch = true;
2024    }
2025
2026    brw_blorp_get_blit_kernel(brw, &params, &wm_prog_key);
2027
2028    for (unsigned i = 0; i < 4; i++) {
2029       params.src.view.channel_select[i] =
2030          swizzle_to_scs(GET_SWZ(src_swizzle, i));
2031    }
2032
2033    brw_blorp_exec(brw, &params);
2034
2035    intel_miptree_slice_set_needs_hiz_resolve(dst_mt, dst_level, dst_layer);
2036
2037    if (intel_miptree_is_lossless_compressed(brw, dst_mt))
2038       dst_mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_UNRESOLVED;
2039 }