src/mesa/drivers/dri/i965/brw_blorp_blit.cpp

   1 /*
   2  * Copyright © 2012 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "main/teximage.h"
  25 #include "main/fbobject.h"
  26
  27 #include "glsl/ralloc.h"
  28
  29 #include "intel_fbo.h"
  30
  31 #include "brw_blorp.h"
  32 #include "brw_context.h"
  33 #include "brw_eu.h"
  34 #include "brw_state.h"
  35
  36
  37 /**
  38  * Helper function for handling mirror image blits.
  39  *
  40  * If coord0 > coord1, swap them and invert the "mirror" boolean.
  41  */
  42 static inline void
  43 fixup_mirroring(bool &mirror, GLint &coord0, GLint &coord1)
  44 {
  45    if (coord0 > coord1) {
  46       mirror = !mirror;
  47       GLint tmp = coord0;
  48       coord0 = coord1;
  49       coord1 = tmp;
  50    }
  51 }
  52
  53
  54 /**
  55  * Adjust {src,dst}_x{0,1} to account for clipping and scissoring of
  56  * destination coordinates.
  57  *
  58  * Return true if there is still blitting to do, false if all pixels got
  59  * rejected by the clip and/or scissor.
  60  *
  61  * For clarity, the nomenclature of this function assumes we are clipping and
  62  * scissoring the X coordinate; the exact same logic applies for Y
  63  * coordinates.
  64  *
  65  * Note: this function may also be used to account for clipping of source
  66  * coordinates, by swapping the roles of src and dst.
  67  */
  68 static inline bool
  69 clip_or_scissor(bool mirror, GLint &src_x0, GLint &src_x1, GLint &dst_x0,
  70                 GLint &dst_x1, GLint fb_xmin, GLint fb_xmax)
  71 {
  72    /* If we are going to scissor everything away, stop. */
  73    if (!(fb_xmin < fb_xmax &&
  74          dst_x0 < fb_xmax &&
  75          fb_xmin < dst_x1 &&
  76          dst_x0 < dst_x1)) {
  77       return false;
  78    }
  79
  80    /* Clip the destination rectangle, and keep track of how many pixels we
  81     * clipped off of the left and right sides of it.
  82     */
  83    GLint pixels_clipped_left = 0;
  84    GLint pixels_clipped_right = 0;
  85    if (dst_x0 < fb_xmin) {
  86       pixels_clipped_left = fb_xmin - dst_x0;
  87       dst_x0 = fb_xmin;
  88    }
  89    if (fb_xmax < dst_x1) {
  90       pixels_clipped_right = dst_x1 - fb_xmax;
  91       dst_x1 = fb_xmax;
  92    }
  93
  94    /* If we are mirrored, then before applying pixels_clipped_{left,right} to
  95     * the source coordinates, we need to flip them to account for the
  96     * mirroring.
  97     */
  98    if (mirror) {
  99       GLint tmp = pixels_clipped_left;
 100       pixels_clipped_left = pixels_clipped_right;
 101       pixels_clipped_right = tmp;
 102    }
 103
 104    /* Adjust the source rectangle to remove the pixels corresponding to those
 105     * that were clipped/scissored out of the destination rectangle.
 106     */
 107    src_x0 += pixels_clipped_left;
 108    src_x1 -= pixels_clipped_right;
 109
 110    return true;
 111 }
 112
 113
 114 static struct intel_mipmap_tree *
 115 find_miptree(GLbitfield buffer_bit, struct gl_renderbuffer *rb)
 116 {
 117    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
 118    struct intel_mipmap_tree *mt = irb->mt;
 119    if (buffer_bit == GL_STENCIL_BUFFER_BIT && mt->stencil_mt)
 120       mt = mt->stencil_mt;
 121    return mt;
 122 }
 123
 124 void
 125 brw_blorp_blit_miptrees(struct intel_context *intel,
 126                         struct intel_mipmap_tree *src_mt,
 127                         struct intel_mipmap_tree *dst_mt,
 128                         int src_x0, int src_y0,
 129                         int dst_x0, int dst_y0,
 130                         int dst_x1, int dst_y1,
 131                         bool mirror_x, bool mirror_y)
 132 {
 133    brw_blorp_blit_params params(brw_context(&intel->ctx),
 134                                 src_mt, dst_mt,
 135                                 src_x0, src_y0,
 136                                 dst_x0, dst_y0,
 137                                 dst_x1, dst_y1,
 138                                 mirror_x, mirror_y);
 139    brw_blorp_exec(intel, &params);
 140 }
 141
 142 static void
 143 do_blorp_blit(struct intel_context *intel, GLbitfield buffer_bit,
 144               struct gl_renderbuffer *src_rb, struct gl_renderbuffer *dst_rb,
 145               GLint srcX0, GLint srcY0,
 146               GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
 147               bool mirror_x, bool mirror_y)
 148 {
 149    struct gl_context *ctx = &intel->ctx;
 150
 151    /* Find source/dst miptrees */
 152    struct intel_mipmap_tree *src_mt = find_miptree(buffer_bit, src_rb);
 153    struct intel_mipmap_tree *dst_mt = find_miptree(buffer_bit, dst_rb);
 154
 155    /* Get ready to blit.  This includes depth resolving the src and dst
 156     * buffers if necessary.
 157     */
 158    intel_renderbuffer_resolve_depth(intel, intel_renderbuffer(src_rb));
 159    intel_renderbuffer_resolve_depth(intel, intel_renderbuffer(dst_rb));
 160
 161    /* Do the blit */
 162    brw_blorp_blit_miptrees(intel, src_mt, dst_mt,
 163                            srcX0, srcY0, dstX0, dstY0, dstX1, dstY1,
 164                            mirror_x, mirror_y);
 165
 166    /* Mark the dst buffer as needing a HiZ resolve if necessary. */
 167    intel_renderbuffer_set_needs_hiz_resolve(intel_renderbuffer(dst_rb));
 168 }
 169
 170
 171 static bool
 172 formats_match(GLbitfield buffer_bit, struct gl_renderbuffer *src_rb,
 173               struct gl_renderbuffer *dst_rb)
 174 {
 175    /* Note: don't just check gl_renderbuffer::Format, because in some cases
 176     * multiple gl_formats resolve to the same native type in the miptree (for
 177     * example MESA_FORMAT_X8_Z24 and MESA_FORMAT_S8_Z24), and we can blit
 178     * between those formats.
 179     */
 180    return find_miptree(buffer_bit, src_rb)->format ==
 181       find_miptree(buffer_bit, dst_rb)->format;
 182 }
 183
 184
 185 static bool
 186 try_blorp_blit(struct intel_context *intel,
 187                GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
 188                GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
 189                GLenum filter, GLbitfield buffer_bit)
 190 {
 191    struct gl_context *ctx = &intel->ctx;
 192
 193    /* Sync up the state of window system buffers.  We need to do this before
 194     * we go looking for the buffers.
 195     */
 196    intel_prepare_render(intel);
 197
 198    const struct gl_framebuffer *read_fb = ctx->ReadBuffer;
 199    const struct gl_framebuffer *draw_fb = ctx->DrawBuffer;
 200
 201    /* Detect if the blit needs to be mirrored */
 202    bool mirror_x = false, mirror_y = false;
 203    fixup_mirroring(mirror_x, srcX0, srcX1);
 204    fixup_mirroring(mirror_x, dstX0, dstX1);
 205    fixup_mirroring(mirror_y, srcY0, srcY1);
 206    fixup_mirroring(mirror_y, dstY0, dstY1);
 207
 208    /* Make sure width and height match */
 209    if (srcX1 - srcX0 != dstX1 - dstX0) return false;
 210    if (srcY1 - srcY0 != dstY1 - dstY0) return false;
 211
 212    /* If the destination rectangle needs to be clipped or scissored, do so.
 213     */
 214    if (!(clip_or_scissor(mirror_x, srcX0, srcX1, dstX0, dstX1,
 215                          draw_fb->_Xmin, draw_fb->_Xmax) &&
 216          clip_or_scissor(mirror_y, srcY0, srcY1, dstY0, dstY1,
 217                          draw_fb->_Ymin, draw_fb->_Ymax))) {
 218       /* Everything got clipped/scissored away, so the blit was successful. */
 219       return true;
 220    }
 221
 222    /* If the source rectangle needs to be clipped or scissored, do so. */
 223    if (!(clip_or_scissor(mirror_x, dstX0, dstX1, srcX0, srcX1,
 224                          0, read_fb->Width) &&
 225          clip_or_scissor(mirror_y, dstY0, dstY1, srcY0, srcY1,
 226                          0, read_fb->Height))) {
 227       /* Everything got clipped/scissored away, so the blit was successful. */
 228       return true;
 229    }
 230
 231    /* Account for the fact that in the system framebuffer, the origin is at
 232     * the lower left.
 233     */
 234    if (_mesa_is_winsys_fbo(read_fb)) {
 235       GLint tmp = read_fb->Height - srcY0;
 236       srcY0 = read_fb->Height - srcY1;
 237       srcY1 = tmp;
 238       mirror_y = !mirror_y;
 239    }
 240    if (_mesa_is_winsys_fbo(draw_fb)) {
 241       GLint tmp = draw_fb->Height - dstY0;
 242       dstY0 = draw_fb->Height - dstY1;
 243       dstY1 = tmp;
 244       mirror_y = !mirror_y;
 245    }
 246
 247    /* Find buffers */
 248    struct gl_renderbuffer *src_rb;
 249    struct gl_renderbuffer *dst_rb;
 250    switch (buffer_bit) {
 251    case GL_COLOR_BUFFER_BIT:
 252       src_rb = read_fb->_ColorReadBuffer;
 253       for (unsigned i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; ++i) {
 254          dst_rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
 255          if (dst_rb && !formats_match(buffer_bit, src_rb, dst_rb))
 256             return false;
 257       }
 258       for (unsigned i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; ++i) {
 259          dst_rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
 260          do_blorp_blit(intel, buffer_bit, src_rb, dst_rb, srcX0, srcY0,
 261                        dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
 262       }
 263       break;
 264    case GL_DEPTH_BUFFER_BIT:
 265       src_rb = read_fb->Attachment[BUFFER_DEPTH].Renderbuffer;
 266       dst_rb = draw_fb->Attachment[BUFFER_DEPTH].Renderbuffer;
 267       if (!formats_match(buffer_bit, src_rb, dst_rb))
 268          return false;
 269       do_blorp_blit(intel, buffer_bit, src_rb, dst_rb, srcX0, srcY0,
 270                     dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
 271       break;
 272    case GL_STENCIL_BUFFER_BIT:
 273       src_rb = read_fb->Attachment[BUFFER_STENCIL].Renderbuffer;
 274       dst_rb = draw_fb->Attachment[BUFFER_STENCIL].Renderbuffer;
 275       if (!formats_match(buffer_bit, src_rb, dst_rb))
 276          return false;
 277       do_blorp_blit(intel, buffer_bit, src_rb, dst_rb, srcX0, srcY0,
 278                     dstX0, dstY0, dstX1, dstY1, mirror_x, mirror_y);
 279       break;
 280    default:
 281       assert(false);
 282    }
 283
 284    return true;
 285 }
 286
 287 GLbitfield
 288 brw_blorp_framebuffer(struct intel_context *intel,
 289                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
 290                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
 291                       GLbitfield mask, GLenum filter)
 292 {
 293    /* BLORP is not supported before Gen6. */
 294    if (intel->gen < 6)
 295       return mask;
 296
 297    static GLbitfield buffer_bits[] = {
 298       GL_COLOR_BUFFER_BIT,
 299       GL_DEPTH_BUFFER_BIT,
 300       GL_STENCIL_BUFFER_BIT,
 301    };
 302
 303    for (unsigned int i = 0; i < ARRAY_SIZE(buffer_bits); ++i) {
 304       if ((mask & buffer_bits[i]) &&
 305        try_blorp_blit(intel,
 306                       srcX0, srcY0, srcX1, srcY1,
 307                       dstX0, dstY0, dstX1, dstY1,
 308                       filter, buffer_bits[i])) {
 309          mask &= ~buffer_bits[i];
 310       }
 311    }
 312
 313    return mask;
 314 }
 315
 316
 /**
  * Kinds of argument that may appear in a sampler message.  A sequence of
  * these values specifies the order of the arguments in the message.
  */
 enum sampler_message_arg {
    SAMPLER_MESSAGE_ARG_U_FLOAT,
    SAMPLER_MESSAGE_ARG_V_FLOAT,
    SAMPLER_MESSAGE_ARG_U_INT,
    SAMPLER_MESSAGE_ARG_V_INT,
    SAMPLER_MESSAGE_ARG_SI_INT,
    SAMPLER_MESSAGE_ARG_MCS_INT,
    SAMPLER_MESSAGE_ARG_ZERO_INT,
 };
 330
 331 /**
 332  * Generator for WM programs used in BLORP blits.
 333  *
 334  * The bulk of the work done by the WM program is to wrap and unwrap the
 335  * coordinate transformations used by the hardware to store surfaces in
 336  * memory.  The hardware transforms a pixel location (X, Y, S) (where S is the
 337  * sample index for a multisampled surface) to a memory offset by the
 338  * following formulas:
 339  *
 340  *   offset = tile(tiling_format, encode_msaa(num_samples, layout, X, Y, S))
 341  *   (X, Y, S) = decode_msaa(num_samples, layout, detile(tiling_format, offset))
 342  *
 343  * For a single-sampled surface, or for a multisampled surface using
 344  * INTEL_MSAA_LAYOUT_UMS, encode_msaa() and decode_msaa are the identity
 345  * function:
 346  *
 347  *   encode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
 348  *   decode_msaa(1, NONE, X, Y, 0) = (X, Y, 0)
 349  *   encode_msaa(n, UMS, X, Y, S) = (X, Y, S)
 350  *   decode_msaa(n, UMS, X, Y, S) = (X, Y, S)
 351  *
 352  * For a 4x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
 353  * embeds the sample number into bit 1 of the X and Y coordinates:
 354  *
 355  *   encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
 356  *     where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
 357  *           Y' = (Y & ~0b1 ) << 1 | (S & 0b10) | (Y & 0b1)
 358  *   decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
 359  *     where X' = (X & ~0b11) >> 1 | (X & 0b1)
 360  *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 361  *           S = (Y & 0b10) | (X & 0b10) >> 1
 362  *
 363  * For an 8x multisampled surface using INTEL_MSAA_LAYOUT_IMS, encode_msaa()
 364  * embeds the sample number into bits 1 and 2 of the X coordinate and bit 1 of
 365  * the Y coordinate:
 366  *
 367  *   encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
 368  *     where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1 | (X & 0b1)
 369  *           Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
 370  *   decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
 371  *     where X' = (X & ~0b111) >> 2 | (X & 0b1)
 372  *           Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
 373  *           S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
 374  *
 375  * For X tiling, tile() combines together the low-order bits of the X and Y
 376  * coordinates in the pattern 0byyyxxxxxxxxx, creating 4k tiles that are 512
 377  * bytes wide and 8 rows high:
 378  *
 379  *   tile(x_tiled, X, Y, S) = A
 380  *     where A = tile_num << 12 | offset
 381  *           tile_num = (Y' >> 3) * tile_pitch + (X' >> 9)
 382  *           offset = (Y' & 0b111) << 9
 383  *                    | (X' & 0b111111111)
 384  *           X' = X * cpp
 385  *           Y' = Y + S * qpitch
 386  *   detile(x_tiled, A) = (X, Y, S)
 387  *     where X = X' / cpp
 388  *           Y = Y' % qpitch
 389  *           S = Y' / qpitch
 390  *           Y' = (tile_num / tile_pitch) << 3
 391  *                | (A & 0b111000000000) >> 9
 392  *           X' = (tile_num % tile_pitch) << 9
 393  *                | (A & 0b111111111)
 394  *
 395  * (In all tiling formulas, cpp is the number of bytes occupied by a single
 396  * sample ("chars per pixel"), tile_pitch is the number of 4k tiles required
 397  * to fill the width of the surface, and qpitch is the spacing (in rows)
 398  * between array slices).
 399  *
 400  * For Y tiling, tile() combines together the low-order bits of the X and Y
 401  * coordinates in the pattern 0bxxxyyyyyxxxx, creating 4k tiles that are 128
 402  * bytes wide and 32 rows high:
 403  *
 404  *   tile(y_tiled, X, Y, S) = A
 405  *     where A = tile_num << 12 | offset
 406  *           tile_num = (Y' >> 5) * tile_pitch + (X' >> 7)
 407  *           offset = (X' & 0b1110000) << 5
 408  *                    | (Y' & 0b11111) << 4
 409  *                    | (X' & 0b1111)
 410  *           X' = X * cpp
 411  *           Y' = Y + S * qpitch
 412  *   detile(y_tiled, A) = (X, Y, S)
 413  *     where X = X' / cpp
 414  *           Y = Y' % qpitch
 415  *           S = Y' / qpitch
 416  *           Y' = (tile_num / tile_pitch) << 5
 417  *                | (A & 0b111110000) >> 4
 418  *           X' = (tile_num % tile_pitch) << 7
 419  *                | (A & 0b111000000000) >> 5
 420  *                | (A & 0b1111)
 421  *
 422  * For W tiling, tile() combines together the low-order bits of the X and Y
 423  * coordinates in the pattern 0bxxxyyyyxyxyx, creating 4k tiles that are 64
 424  * bytes wide and 64 rows high (note that W tiling is only used for stencil
 425  * buffers, which always have cpp = 1 and S=0):
 426  *
 427  *   tile(w_tiled, X, Y, S) = A
 428  *     where A = tile_num << 12 | offset
 429  *           tile_num = (Y' >> 6) * tile_pitch + (X' >> 6)
 430  *           offset = (X' & 0b111000) << 6
 431  *                    | (Y' & 0b111100) << 3
 432  *                    | (X' & 0b100) << 2
 433  *                    | (Y' & 0b10) << 2
 434  *                    | (X' & 0b10) << 1
 435  *                    | (Y' & 0b1) << 1
 436  *                    | (X' & 0b1)
 437  *           X' = X * cpp = X
 438  *           Y' = Y + S * qpitch
 439  *   detile(w_tiled, A) = (X, Y, S)
 440  *     where X = X' / cpp = X'
 441  *           Y = Y' % qpitch = Y'
 442  *           S = Y' / qpitch = 0
 443  *           Y' = (tile_num / tile_pitch) << 6
 444  *                | (A & 0b111100000) >> 3
 445  *                | (A & 0b1000) >> 2
 446  *                | (A & 0b10) >> 1
 447  *           X' = (tile_num % tile_pitch) << 6
 448  *                | (A & 0b111000000000) >> 6
 449  *                | (A & 0b10000) >> 2
 450  *                | (A & 0b100) >> 1
 451  *                | (A & 0b1)
 452  *
 453  * Finally, for a non-tiled surface, tile() simply combines together the X and
 454  * Y coordinates in the natural way:
 455  *
 456  *   tile(untiled, X, Y, S) = A
 457  *     where A = Y' * pitch + X'
 458  *           X' = X * cpp
 459  *           Y' = Y + S * qpitch
 460  *   detile(untiled, A) = (X, Y, S)
 461  *     where X = X' / cpp
 462  *           Y = Y' % qpitch
 463  *           S = Y' / qpitch
 464  *           X' = A % pitch
 465  *           Y' = A / pitch
 466  *
 467  * (In these formulas, pitch is the number of bytes occupied by a single row
 468  * of samples).
 469  */
 470 class brw_blorp_blit_program
 471 {
 472 public:
 473    brw_blorp_blit_program(struct brw_context *brw,
 474                           const brw_blorp_blit_prog_key *key);
 475    ~brw_blorp_blit_program();
 476
 477    const GLuint *compile(struct brw_context *brw, GLuint *program_size);
 478
 479    brw_blorp_prog_data prog_data;
 480
 481 private:
 482    void alloc_regs();
 483    void alloc_push_const_regs(int base_reg);
 484    void compute_frag_coords();
 485    void translate_tiling(bool old_tiled_w, bool new_tiled_w);
 486    void encode_msaa(unsigned num_samples, intel_msaa_layout layout);
 487    void decode_msaa(unsigned num_samples, intel_msaa_layout layout);
 488    void kill_if_outside_dst_rect();
 489    void translate_dst_to_src();
 490    void single_to_blend();
 491    void manual_blend(unsigned num_samples);
 492    void sample(struct brw_reg dst);
 493    void texel_fetch(struct brw_reg dst);
 494    void mcs_fetch();
 495    void expand_to_32_bits(struct brw_reg src, struct brw_reg dst);
 496    void texture_lookup(struct brw_reg dst, GLuint msg_type,
 497                        const sampler_message_arg *args, int num_args);
 498    void render_target_write();
 499
 500    /**
 501     * Base-2 logarithm of the maximum number of samples that can be blended.
 502     */
 503    static const unsigned LOG2_MAX_BLEND_SAMPLES = 3;
 504
 505    void *mem_ctx;
 506    struct brw_context *brw;
 507    const brw_blorp_blit_prog_key *key;
 508    struct brw_compile func;
 509
 510    /* Thread dispatch header */
 511    struct brw_reg R0;
 512
 513    /* Pixel X/Y coordinates (always in R1). */
 514    struct brw_reg R1;
 515
 516    /* Push constants */
 517    struct brw_reg dst_x0;
 518    struct brw_reg dst_x1;
 519    struct brw_reg dst_y0;
 520    struct brw_reg dst_y1;
 521    struct {
 522       struct brw_reg multiplier;
 523       struct brw_reg offset;
 524    } x_transform, y_transform;
 525
 526    /* Data read from texture (4 vec16's per array element) */
 527    struct brw_reg texture_data[LOG2_MAX_BLEND_SAMPLES + 1];
 528
 529    /* Auxiliary storage for the contents of the MCS surface.
 530     *
 531     * Since the sampler always returns 8 registers worth of data, this is 8
 532     * registers wide, even though we only use the first 2 registers of it.
 533     */
 534    struct brw_reg mcs_data;
 535
 536    /* X coordinates.  We have two of them so that we can perform coordinate
 537     * transformations easily.
 538     */
 539    struct brw_reg x_coords[2];
 540
 541    /* Y coordinates.  We have two of them so that we can perform coordinate
 542     * transformations easily.
 543     */
 544    struct brw_reg y_coords[2];
 545
 546    /* Which element of x_coords and y_coords is currently in use.
 547     */
 548    int xy_coord_index;
 549
 550    /* True if, at the point in the program currently being compiled, the
 551     * sample index is known to be zero.
 552     */
 553    bool s_is_zero;
 554
 555    /* Register storing the sample index when s_is_zero is false. */
 556    struct brw_reg sample_index;
 557
 558    /* Temporaries */
 559    struct brw_reg t1;
 560    struct brw_reg t2;
 561
 562    /* MRF used for sampling and render target writes */
 563    GLuint base_mrf;
 564 };
 565
 566 brw_blorp_blit_program::brw_blorp_blit_program(
 567       struct brw_context *brw,
 568       const brw_blorp_blit_prog_key *key)
 569    : mem_ctx(ralloc_context(NULL)),
 570      brw(brw),
 571      key(key)
 572 {
 573    brw_init_compile(brw, &func, mem_ctx);
 574 }
 575
 576 brw_blorp_blit_program::~brw_blorp_blit_program()
 577 {
 578    ralloc_free(mem_ctx);
 579 }
 580
 581 const GLuint *
 582 brw_blorp_blit_program::compile(struct brw_context *brw,
 583                                 GLuint *program_size)
 584 {
 585    /* Sanity checks */
 586    if (key->dst_tiled_w && key->rt_samples > 0) {
 587       /* If the destination image is W tiled and multisampled, then the thread
 588        * must be dispatched once per sample, not once per pixel.  This is
 589        * necessary because after conversion between W and Y tiling, there's no
 590        * guarantee that all samples corresponding to a single pixel will still
 591        * be together.
 592        */
 593       assert(key->persample_msaa_dispatch);
 594    }
 595
 596    if (key->blend) {
 597       /* We are blending, which means we won't have an opportunity to
 598        * translate the tiling and sample count for the texture surface.  So
 599        * the surface state for the texture must be configured with the correct
 600        * tiling and sample count.
 601        */
 602       assert(!key->src_tiled_w);
 603       assert(key->tex_samples == key->src_samples);
 604       assert(key->tex_layout == key->src_layout);
 605       assert(key->tex_samples > 0);
 606    }
 607
 608    if (key->persample_msaa_dispatch) {
 609       /* It only makes sense to do persample dispatch if the render target is
 610        * configured as multisampled.
 611        */
 612       assert(key->rt_samples > 0);
 613    }
 614
 615    /* Make sure layout is consistent with sample count */
 616    assert((key->tex_layout == INTEL_MSAA_LAYOUT_NONE) ==
 617           (key->tex_samples == 0));
 618    assert((key->rt_layout == INTEL_MSAA_LAYOUT_NONE) ==
 619           (key->rt_samples == 0));
 620    assert((key->src_layout == INTEL_MSAA_LAYOUT_NONE) ==
 621           (key->src_samples == 0));
 622    assert((key->dst_layout == INTEL_MSAA_LAYOUT_NONE) ==
 623           (key->dst_samples == 0));
 624
 625    /* Set up prog_data */
 626    memset(&prog_data, 0, sizeof(prog_data));
 627    prog_data.persample_msaa_dispatch = key->persample_msaa_dispatch;
 628
 629    brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
 630
 631    alloc_regs();
 632    compute_frag_coords();
 633
 634    /* Render target and texture hardware don't support W tiling. */
 635    const bool rt_tiled_w = false;
 636    const bool tex_tiled_w = false;
 637
 638    /* The address that data will be written to is determined by the
 639     * coordinates supplied to the WM thread and the tiling and sample count of
 640     * the render target, according to the formula:
 641     *
 642     * (X, Y, S) = decode_msaa(rt_samples, detile(rt_tiling, offset))
 643     *
 644     * If the actual tiling and sample count of the destination surface are not
 645     * the same as the configuration of the render target, then these
 646     * coordinates are wrong and we have to adjust them to compensate for the
 647     * difference.
 648     */
 649    if (rt_tiled_w != key->dst_tiled_w ||
 650        key->rt_samples != key->dst_samples ||
 651        key->rt_layout != key->dst_layout) {
 652       encode_msaa(key->rt_samples, key->rt_layout);
 653       /* Now (X, Y, S) = detile(rt_tiling, offset) */
 654       translate_tiling(rt_tiled_w, key->dst_tiled_w);
 655       /* Now (X, Y, S) = detile(dst_tiling, offset) */
 656       decode_msaa(key->dst_samples, key->dst_layout);
 657    }
 658
 659    /* Now (X, Y, S) = decode_msaa(dst_samples, detile(dst_tiling, offset)).
 660     *
 661     * That is: X, Y and S now contain the true coordinates and sample index of
 662     * the data that the WM thread should output.
 663     *
 664     * If we need to kill pixels that are outside the destination rectangle,
 665     * now is the time to do it.
 666     */
 667
 668    if (key->use_kill)
 669       kill_if_outside_dst_rect();
 670
 671    /* Next, apply a translation to obtain coordinates in the source image. */
 672    translate_dst_to_src();
 673
 674    /* If the source image is not multisampled, then we want to fetch sample
 675     * number 0, because that's the only sample there is.
 676     */
 677    if (key->src_samples == 0)
 678       s_is_zero = true;
 679
 680    /* X, Y, and S are now the coordinates of the pixel in the source image
 681     * that we want to texture from.  Exception: if we are blending, then S is
 682     * irrelevant, because we are going to fetch all samples.
 683     */
 684    if (key->blend) {
 685       if (brw->intel.gen == 6) {
 686          /* Gen6 hardware an automatically blend using the SAMPLE message */
 687          single_to_blend();
 688          sample(texture_data[0]);
 689       } else {
 690          /* Gen7+ hardware doesn't automaticaly blend. */
 691          manual_blend(key->src_samples);
 692       }
 693    } else {
 694       /* We aren't blending, which means we just want to fetch a single sample
 695        * from the source surface.  The address that we want to fetch from is
 696        * related to the X, Y and S values according to the formula:
 697        *
 698        * (X, Y, S) = decode_msaa(src_samples, detile(src_tiling, offset)).
 699        *
 700        * If the actual tiling and sample count of the source surface are not
 701        * the same as the configuration of the texture, then we need to adjust
 702        * the coordinates to compensate for the difference.
 703        */
 704       if (tex_tiled_w != key->src_tiled_w ||
 705           key->tex_samples != key->src_samples ||
 706           key->tex_layout != key->src_layout) {
 707          encode_msaa(key->src_samples, key->src_layout);
 708          /* Now (X, Y, S) = detile(src_tiling, offset) */
 709          translate_tiling(key->src_tiled_w, tex_tiled_w);
 710          /* Now (X, Y, S) = detile(tex_tiling, offset) */
 711          decode_msaa(key->tex_samples, key->tex_layout);
 712       }
 713
 714       /* Now (X, Y, S) = decode_msaa(tex_samples, detile(tex_tiling, offset)).
 715        *
 716        * In other words: X, Y, and S now contain values which, when passed to
 717        * the texturing unit, will cause data to be read from the correct
 718        * memory location.  So we can fetch the texel now.
 719        */
 720       if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
 721          mcs_fetch();
 722       texel_fetch(texture_data[0]);
 723    }
 724
 725    /* Finally, write the fetched (or blended) value to the render target and
 726     * terminate the thread.
 727     */
 728    render_target_write();
 729    return brw_get_program(&func, program_size);
 730 }
 731
 732 void
 733 brw_blorp_blit_program::alloc_push_const_regs(int base_reg)
 734 {
 735 #define CONST_LOC(name) offsetof(brw_blorp_wm_push_constants, name)
 736 #define ALLOC_REG(name) \
 737    this->name = \
 738       brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, base_reg, CONST_LOC(name) / 2)
 739
 740    ALLOC_REG(dst_x0);
 741    ALLOC_REG(dst_x1);
 742    ALLOC_REG(dst_y0);
 743    ALLOC_REG(dst_y1);
 744    ALLOC_REG(x_transform.multiplier);
 745    ALLOC_REG(x_transform.offset);
 746    ALLOC_REG(y_transform.multiplier);
 747    ALLOC_REG(y_transform.offset);
 748 #undef CONST_LOC
 749 #undef ALLOC_REG
 750 }
 751
 752 void
 753 brw_blorp_blit_program::alloc_regs()
 754 {
 755    int reg = 0;
 756    this->R0 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW);
 757    this->R1 = retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW);
 758    prog_data.first_curbe_grf = reg;
 759    alloc_push_const_regs(reg);
 760    reg += BRW_BLORP_NUM_PUSH_CONST_REGS;
 761    for (unsigned i = 0; i < ARRAY_SIZE(texture_data); ++i) {
 762       this->texture_data[i] =
 763          retype(vec16(brw_vec8_grf(reg, 0)), key->texture_data_type);
 764       reg += 8;
 765    }
 766    this->mcs_data =
 767       retype(brw_vec8_grf(reg, 0), BRW_REGISTER_TYPE_UD); reg += 8;
 768    for (int i = 0; i < 2; ++i) {
 769       this->x_coords[i]
 770          = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
 771       this->y_coords[i]
 772          = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
 773    }
 774    this->xy_coord_index = 0;
 775    this->sample_index
 776       = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
 777    this->t1 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
 778    this->t2 = vec16(retype(brw_vec8_grf(reg++, 0), BRW_REGISTER_TYPE_UW));
 779
 780    /* Make sure we didn't run out of registers */
 781    assert(reg <= GEN7_MRF_HACK_START);
 782
 783    int mrf = 2;
 784    this->base_mrf = mrf;
 785 }
 786
 787 /* In the code that follows, X and Y can be used to quickly refer to the
 788  * active elements of x_coords and y_coords (those selected by
 789  * xy_coord_index), and Xp and Yp ("X prime" and "Y prime") to the inactive elements.
 790  *
 791  * S can be used to quickly refer to sample_index.
 792  */
 793 #define X x_coords[xy_coord_index]
 794 #define Y y_coords[xy_coord_index]
 795 #define Xp x_coords[!xy_coord_index]
 796 #define Yp y_coords[!xy_coord_index]
 797 #define S sample_index
 798
 799 /* Quickly swap the roles of (X, Y) and (Xp, Yp) by toggling xy_coord_index.
 800  * Saves MOVs to transfer (Xp, Yp) to (X, Y) after a coordinate transformation.
 801  * Note: the expansion of this macro already ends with a semicolon. */
 802 #define SWAP_XY_AND_XPYP() xy_coord_index = !xy_coord_index;
 803
/**
 * Emit code to compute the X and Y coordinates of the pixels being rendered
 * by this WM invocation.
 *
 * Assuming the render target is set up for Y tiling, these (X, Y) values are
 * related to the address offset where outputs will be written by the formula:
 *
 *   (X, Y, S) = decode_msaa(detile(offset)).
 *
 * (See brw_blorp_blit_program).
 *
 * When key->persample_msaa_dispatch is set, code is also emitted to compute
 * the sample index S (using t1, and for 8x also t2, as scratch) and
 * s_is_zero is cleared.  Otherwise no code is emitted for S and s_is_zero is
 * set, which tells the later stages to treat S as 0.
 */
void
brw_blorp_blit_program::compute_frag_coords()
{
   /* R1.2[15:0] = X coordinate of upper left pixel of subspan 0 (pixel 0)
    * R1.3[15:0] = X coordinate of upper left pixel of subspan 1 (pixel 4)
    * R1.4[15:0] = X coordinate of upper left pixel of subspan 2 (pixel 8)
    * R1.5[15:0] = X coordinate of upper left pixel of subspan 3 (pixel 12)
    *
    * Pixels within a subspan are laid out in this arrangement:
    * 0 1
    * 2 3
    *
    * So, to compute the coordinates of each pixel, we need to read every 2nd
    * 16-bit value (vstride=2) from R1, starting at the 4th 16-bit value
    * (suboffset=4), and duplicate each value 4 times (hstride=0, width=4).
    * In other words, the data we want to access is R1.4<2;4,0>UW.
    *
    * Then, we need to add the repeating sequence (0, 1, 0, 1, ...) to the
    * result, since pixels n+1 and n+3 are in the right half of the subspan.
    */
   brw_ADD(&func, X, stride(suboffset(R1, 4), 2, 4, 0), brw_imm_v(0x10101010));

   /* Similarly, Y coordinates for subspans come from R1.2[31:16] through
    * R1.5[31:16], so to get pixel Y coordinates we need to start at the 5th
    * 16-bit value instead of the 4th (R1.5<2;4,0>UW instead of
    * R1.4<2;4,0>UW).
    *
    * And we need to add the repeating sequence (0, 0, 1, 1, ...), since
    * pixels n+2 and n+3 are in the bottom half of the subspan.
    */
   brw_ADD(&func, Y, stride(suboffset(R1, 5), 2, 4, 0), brw_imm_v(0x11001100));

   if (key->persample_msaa_dispatch) {
      switch (key->rt_samples) {
      case 4:
         /* The WM will be run in MSDISPMODE_PERSAMPLE with num_samples == 4.
          * Therefore, subspan 0 will represent sample 0, subspan 1 will
          * represent sample 1, and so on.
          *
          * So we need to populate S with the sequence (0, 0, 0, 0, 1, 1, 1,
          * 1, 2, 2, 2, 2, 3, 3, 3, 3).  The easiest way to do this is to
          * populate a temporary variable with the sequence (0, 1, 2, 3), and
          * then copy from it using vstride=1, width=4, hstride=0.
          */
         brw_MOV(&func, t1, brw_imm_v(0x3210));
         brw_MOV(&func, S, stride(t1, 1, 4, 0));
         break;
      case 8: {
         /* The WM will be run in MSDISPMODE_PERSAMPLE with num_samples == 8.
          * Therefore, subspan 0 will represent sample N (where N is 0 or 4),
          * subspan 1 will represent sample N+1, and so on.  We can find the
          * value of N by looking at R0.0 bits 7:6 ("Starting Sample Pair
          * Index") and multiplying by two (since samples are always delivered
          * in pairs).  That is, we compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 &
          * 0xc0) >> 5.
          *
          * Then we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1, 2,
          * 2, 2, 2, 3, 3, 3, 3), which we compute by populating a temporary
          * variable with the sequence (0, 1, 2, 3), and then reading from it
          * using vstride=1, width=4, hstride=0.
          */
         /* Scalar (vec1) UD views: N is computed once in the first dword of
          * t1 from the first dword of R0.
          */
         struct brw_reg t1_ud1 = vec1(retype(t1, BRW_REGISTER_TYPE_UD));
         struct brw_reg r0_ud1 = vec1(retype(R0, BRW_REGISTER_TYPE_UD));
         brw_AND(&func, t1_ud1, r0_ud1, brw_imm_ud(0xc0));
         brw_SHR(&func, t1_ud1, t1_ud1, brw_imm_ud(5));
         brw_MOV(&func, t2, brw_imm_v(0x3210));
         /* S = N + (0, 0, 0, 0, 1, 1, 1, 1, ...); N is read back as a UW. */
         brw_ADD(&func, S, retype(t1_ud1, BRW_REGISTER_TYPE_UW),
                 stride(t2, 1, 4, 0));
         break;
      }
      default:
         assert(!"Unrecognized sample count in "
                "brw_blorp_blit_program::compute_frag_coords()");
         break;
      }
      s_is_zero = false;
   } else {
      /* Either the destination surface is single-sampled, or the WM will be
       * run in MSDISPMODE_PERPIXEL (which causes a single fragment dispatch
       * per pixel).  In either case, it's not meaningful to compute a sample
       * value.  Just set it to 0.
       */
      s_is_zero = true;
   }
}
 900
/**
 * Emit code to compensate for the difference between Y and W tiling.
 *
 * This code modifies the X and Y coordinates according to the formula:
 *
 *   (X', Y', S') = detile(new_tiling, tile(old_tiling, X, Y, S))
 *
 * (See brw_blorp_blit_program).
 *
 * It can only translate between W and Y tiling, so old_tiled_w and
 * new_tiled_w are booleans where true represents W tiling and false
 * represents Y tiling.  If both are equal, no code is emitted.
 *
 * The new coordinates are built in (Xp, Yp) using t1 and t2 as scratch, and
 * then the roles of (X, Y) and (Xp, Yp) are swapped.
 */
void
brw_blorp_blit_program::translate_tiling(bool old_tiled_w, bool new_tiled_w)
{
   if (old_tiled_w == new_tiled_w)
      return;

   /* In the code that follows, we can safely assume that S = 0, because W
    * tiling formats always use IMS layout.
    */
   assert(s_is_zero);

   if (new_tiled_w) {
      /* Given X and Y coordinates that describe an address using Y tiling,
       * translate to the X and Y coordinates that describe the same address
       * using W tiling.
       *
       * If we break down the low order bits of X and Y, using a
       * single letter to represent each low-order bit:
       *
       *   X = A << 7 | 0bBCDEFGH
       *   Y = J << 5 | 0bKLMNP                                       (1)
       *
       * Then we can apply the Y tiling formula to see the memory offset being
       * addressed:
       *
       *   offset = (J * tile_pitch + A) << 12 | 0bBCDKLMNPEFGH       (2)
       *
       * If we apply the W detiling formula to this memory location, we find
       * that the corresponding X' and Y' coordinates are:
       *
       *   X' = A << 6 | 0bBCDPFH                                     (3)
       *   Y' = J << 6 | 0bKLMNEG
       *
       * Combining (1) and (3), we see that to transform (X, Y) to (X', Y'),
       * we need to make the following computation:
       *
       *   X' = (X & ~0b1011) >> 1 | (Y & 0b1) << 2 | X & 0b1         (4)
       *   Y' = (Y & ~0b1) << 1 | (X & 0b1000) >> 2 | (X & 0b10) >> 1
       */
      brw_AND(&func, t1, X, brw_imm_uw(0xfff4)); /* X & ~0b1011 */
      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1011) >> 1 */
      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
      brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b1) << 2 */
      brw_OR(&func, t1, t1, t2); /* (X & ~0b1011) >> 1 | (Y & 0b1) << 2 */
      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
      brw_OR(&func, Xp, t1, t2); /* X' complete */
      brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
      brw_AND(&func, t2, X, brw_imm_uw(8)); /* X & 0b1000 */
      brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b1000) >> 2 */
      brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (X & 0b1000) >> 2 */
      brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
      brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
      brw_OR(&func, Yp, t1, t2); /* Y' complete */
      SWAP_XY_AND_XPYP();
   } else {
      /* Applying the same logic as above, but in reverse, we obtain the
       * formulas:
       *
       * X' = (X & ~0b101) << 1 | (Y & 0b10) << 2 | (Y & 0b1) << 1 | X & 0b1
       * Y' = (Y & ~0b11) >> 1 | (X & 0b100) >> 2
       */
      brw_AND(&func, t1, X, brw_imm_uw(0xfffa)); /* X & ~0b101 */
      brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b101) << 1 */
      brw_AND(&func, t2, Y, brw_imm_uw(2)); /* Y & 0b10 */
      brw_SHL(&func, t2, t2, brw_imm_uw(2)); /* (Y & 0b10) << 2 */
      brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2 */
      brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
      brw_SHL(&func, t2, t2, brw_imm_uw(1)); /* (Y & 0b1) << 1 */
      brw_OR(&func, t1, t1, t2); /* (X & ~0b101) << 1 | (Y & 0b10) << 2
                                    | (Y & 0b1) << 1 */
      brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
      brw_OR(&func, Xp, t1, t2); /* X' complete */
      brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
      brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
      brw_AND(&func, t2, X, brw_imm_uw(4)); /* X & 0b100 */
      brw_SHR(&func, t2, t2, brw_imm_uw(2)); /* (X & 0b100) >> 2 */
      brw_OR(&func, Yp, t1, t2); /* Y' complete */
      SWAP_XY_AND_XPYP();
   }
}
 994
 995 /**
 996  * Emit code to compensate for the difference between MSAA and non-MSAA
 997  * surfaces.
 998  *
 999  * This code modifies the X and Y coordinates according to the formula:
1000  *
1001  *   (X', Y', S') = encode_msaa(num_samples, IMS, X, Y, S)
1002  *
1003  * (See brw_blorp_blit_program).
1004  */
1005 void
1006 brw_blorp_blit_program::encode_msaa(unsigned num_samples,
1007                                     intel_msaa_layout layout)
1008 {
1009    switch (layout) {
1010    case INTEL_MSAA_LAYOUT_NONE:
1011       /* No translation necessary, and S should already be zero. */
1012       assert(s_is_zero);
1013       break;
1014    case INTEL_MSAA_LAYOUT_CMS:
1015       /* We can't compensate for compressed layout since at this point in the
1016        * program we haven't read from the MCS buffer.
1017        */
1018       assert(!"Bad layout in encode_msaa");
1019       break;
1020    case INTEL_MSAA_LAYOUT_UMS:
1021       /* No translation necessary. */
1022       break;
1023    case INTEL_MSAA_LAYOUT_IMS:
1024       switch (num_samples) {
1025       case 4:
1026          /* encode_msaa(4, IMS, X, Y, S) = (X', Y', 0)
1027           *   where X' = (X & ~0b1) << 1 | (S & 0b1) << 1 | (X & 0b1)
1028           *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1029           */
1030          brw_AND(&func, t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */
1031          if (!s_is_zero) {
1032             brw_AND(&func, t2, S, brw_imm_uw(1)); /* S & 0b1 */
1033             brw_OR(&func, t1, t1, t2); /* (X & ~0b1) | (S & 0b1) */
1034          }
1035          brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b1) << 1
1036                                                    | (S & 0b1) << 1 */
1037          brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
1038          brw_OR(&func, Xp, t1, t2);
1039          brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
1040          brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
1041          if (!s_is_zero) {
1042             brw_AND(&func, t2, S, brw_imm_uw(2)); /* S & 0b10 */
1043             brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (S & 0b10) */
1044          }
1045          brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
1046          brw_OR(&func, Yp, t1, t2);
1047          break;
1048       case 8:
1049          /* encode_msaa(8, IMS, X, Y, S) = (X', Y', 0)
1050           *   where X' = (X & ~0b1) << 2 | (S & 0b100) | (S & 0b1) << 1
1051           *              | (X & 0b1)
1052           *         Y' = (Y & ~0b1) << 1 | (S & 0b10) | (Y & 0b1)
1053           */
1054          brw_AND(&func, t1, X, brw_imm_uw(0xfffe)); /* X & ~0b1 */
1055          brw_SHL(&func, t1, t1, brw_imm_uw(2)); /* (X & ~0b1) << 2 */
1056          if (!s_is_zero) {
1057             brw_AND(&func, t2, S, brw_imm_uw(4)); /* S & 0b100 */
1058             brw_OR(&func, t1, t1, t2); /* (X & ~0b1) << 2 | (S & 0b100) */
1059             brw_AND(&func, t2, S, brw_imm_uw(1)); /* S & 0b1 */
1060             brw_SHL(&func, t2, t2, brw_imm_uw(1)); /* (S & 0b1) << 1 */
1061             brw_OR(&func, t1, t1, t2); /* (X & ~0b1) << 2 | (S & 0b100)
1062                                           | (S & 0b1) << 1 */
1063          }
1064          brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
1065          brw_OR(&func, Xp, t1, t2);
1066          brw_AND(&func, t1, Y, brw_imm_uw(0xfffe)); /* Y & ~0b1 */
1067          brw_SHL(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b1) << 1 */
1068          if (!s_is_zero) {
1069             brw_AND(&func, t2, S, brw_imm_uw(2)); /* S & 0b10 */
1070             brw_OR(&func, t1, t1, t2); /* (Y & ~0b1) << 1 | (S & 0b10) */
1071          }
1072          brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
1073          brw_OR(&func, Yp, t1, t2);
1074          break;
1075       }
1076       SWAP_XY_AND_XPYP();
1077       s_is_zero = true;
1078       break;
1079    }
1080 }
1081
1082 /**
1083  * Emit code to compensate for the difference between MSAA and non-MSAA
1084  * surfaces.
1085  *
1086  * This code modifies the X and Y coordinates according to the formula:
1087  *
1088  *   (X', Y', S) = decode_msaa(num_samples, IMS, X, Y, S)
1089  *
1090  * (See brw_blorp_blit_program).
1091  */
1092 void
1093 brw_blorp_blit_program::decode_msaa(unsigned num_samples,
1094                                     intel_msaa_layout layout)
1095 {
1096    switch (layout) {
1097    case INTEL_MSAA_LAYOUT_NONE:
1098       /* No translation necessary, and S should already be zero. */
1099       assert(s_is_zero);
1100       break;
1101    case INTEL_MSAA_LAYOUT_CMS:
1102       /* We can't compensate for compressed layout since at this point in the
1103        * program we don't have access to the MCS buffer.
1104        */
1105       assert(!"Bad layout in encode_msaa");
1106       break;
1107    case INTEL_MSAA_LAYOUT_UMS:
1108       /* No translation necessary. */
1109       break;
1110    case INTEL_MSAA_LAYOUT_IMS:
1111       assert(s_is_zero);
1112       switch (num_samples) {
1113       case 4:
1114          /* decode_msaa(4, IMS, X, Y, 0) = (X', Y', S)
1115           *   where X' = (X & ~0b11) >> 1 | (X & 0b1)
1116           *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1117           *         S = (Y & 0b10) | (X & 0b10) >> 1
1118           */
1119          brw_AND(&func, t1, X, brw_imm_uw(0xfffc)); /* X & ~0b11 */
1120          brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (X & ~0b11) >> 1 */
1121          brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
1122          brw_OR(&func, Xp, t1, t2);
1123          brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
1124          brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
1125          brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
1126          brw_OR(&func, Yp, t1, t2);
1127          brw_AND(&func, t1, Y, brw_imm_uw(2)); /* Y & 0b10 */
1128          brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
1129          brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
1130          brw_OR(&func, S, t1, t2);
1131          break;
1132       case 8:
1133          /* decode_msaa(8, IMS, X, Y, 0) = (X', Y', S)
1134           *   where X' = (X & ~0b111) >> 2 | (X & 0b1)
1135           *         Y' = (Y & ~0b11) >> 1 | (Y & 0b1)
1136           *         S = (X & 0b100) | (Y & 0b10) | (X & 0b10) >> 1
1137           */
1138          brw_AND(&func, t1, X, brw_imm_uw(0xfff8)); /* X & ~0b111 */
1139          brw_SHR(&func, t1, t1, brw_imm_uw(2)); /* (X & ~0b111) >> 2 */
1140          brw_AND(&func, t2, X, brw_imm_uw(1)); /* X & 0b1 */
1141          brw_OR(&func, Xp, t1, t2);
1142          brw_AND(&func, t1, Y, brw_imm_uw(0xfffc)); /* Y & ~0b11 */
1143          brw_SHR(&func, t1, t1, brw_imm_uw(1)); /* (Y & ~0b11) >> 1 */
1144          brw_AND(&func, t2, Y, brw_imm_uw(1)); /* Y & 0b1 */
1145          brw_OR(&func, Yp, t1, t2);
1146          brw_AND(&func, t1, X, brw_imm_uw(4)); /* X & 0b100 */
1147          brw_AND(&func, t2, Y, brw_imm_uw(2)); /* Y & 0b10 */
1148          brw_OR(&func, t1, t1, t2); /* (X & 0b100) | (Y & 0b10) */
1149          brw_AND(&func, t2, X, brw_imm_uw(2)); /* X & 0b10 */
1150          brw_SHR(&func, t2, t2, brw_imm_uw(1)); /* (X & 0b10) >> 1 */
1151          brw_OR(&func, S, t1, t2);
1152          break;
1153       }
1154       s_is_zero = false;
1155       SWAP_XY_AND_XPYP();
1156       break;
1157    }
1158 }
1159
/**
 * Emit code that kills pixels whose X and Y coordinates are outside the
 * boundary of the rectangle defined by the push constants (dst_x0, dst_y0,
 * dst_x1, dst_y1).
 *
 * The rectangle is half-open: a pixel is kept when dst_x0 <= X < dst_x1 and
 * dst_y0 <= Y < dst_y1.  The result of the tests is folded into R1 by ANDing
 * the flag register into it.
 */
void
brw_blorp_blit_program::kill_if_outside_dst_rect()
{
   struct brw_reg f0 = brw_flag_reg();
   /* A single UW at GRF 1, subregister 7.
    * NOTE(review): presumably the pixel mask that the render target write
    * later picks up from the R0/R1 header -- confirm against the payload
    * layout in the PRM.
    */
   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   /* The comparisons write to the null register; only their effect on the
    * flag register matters.
    */
   struct brw_reg null16 = vec16(retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));

   /* NOTE(review): this sequence appears to rely on brw_CMP leaving
    * predication enabled for the instructions that follow it, so that the
    * four tests accumulate in the flag register -- confirm against brw_CMP.
    */
   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, X, dst_x0);
   brw_CMP(&func, null16, BRW_CONDITIONAL_GE, Y, dst_y0);
   brw_CMP(&func, null16, BRW_CONDITIONAL_L, X, dst_x1);
   brw_CMP(&func, null16, BRW_CONDITIONAL_L, Y, dst_y1);

   /* Turn predication back off, then AND the flag register into g1 with the
    * execution mask disabled; the instruction state is saved and restored
    * around the AND so the mask-control change does not leak out.
    */
   brw_set_predicate_control(&func, BRW_PREDICATE_NONE);
   brw_push_insn_state(&func);
   brw_set_mask_control(&func, BRW_MASK_DISABLE);
   brw_AND(&func, g1, f0, g1);
   brw_pop_insn_state(&func);
}
1183
1184 /**
1185  * Emit code to translate from destination (X, Y) coordinates to source (X, Y)
1186  * coordinates.
1187  */
1188 void
1189 brw_blorp_blit_program::translate_dst_to_src()
1190 {
1191    brw_MUL(&func, Xp, X, x_transform.multiplier);
1192    brw_MUL(&func, Yp, Y, y_transform.multiplier);
1193    brw_ADD(&func, Xp, Xp, x_transform.offset);
1194    brw_ADD(&func, Yp, Yp, y_transform.offset);
1195    SWAP_XY_AND_XPYP();
1196 }
1197
1198 /**
1199  * Emit code to transform the X and Y coordinates as needed for blending
1200  * together the different samples in an MSAA texture.
1201  */
1202 void
1203 brw_blorp_blit_program::single_to_blend()
1204 {
1205    /* When looking up samples in an MSAA texture using the SAMPLE message,
1206     * Gen6 requires the texture coordinates to be odd integers (so that they
1207     * correspond to the center of a 2x2 block representing the four samples
1208     * that maxe up a pixel).  So we need to multiply our X and Y coordinates
1209     * each by 2 and then add 1.
1210     */
1211    brw_SHL(&func, t1, X, brw_imm_w(1));
1212    brw_SHL(&func, t2, Y, brw_imm_w(1));
1213    brw_ADD(&func, Xp, t1, brw_imm_w(1));
1214    brw_ADD(&func, Yp, t2, brw_imm_w(1));
1215    SWAP_XY_AND_XPYP();
1216 }
1217
1218
/**
 * Count the number of trailing 1 bits in the given value.  For example:
 *
 * count_trailing_one_bits(0) == 0
 * count_trailing_one_bits(7) == 3
 * count_trailing_one_bits(11) == 2
 *
 * If every bit of value is set, the result is the width of an unsigned in
 * bits (assuming 8-bit bytes), on both code paths.
 */
static inline int count_trailing_one_bits(unsigned value)
{
#if defined(__GNUC__) && ((__GNUC__ * 100 + __GNUC_MINOR__) >= 304) /* gcc 3.4 or later */
   /* __builtin_ctz(0) has an undefined result, so the all-ones input (whose
    * complement is zero) has to be answered without calling it.
    */
   const unsigned inverted = ~value;
   if (inverted == 0)
      return (int) (sizeof(value) * 8);
   return __builtin_ctz(inverted);
#else
   /* value & ~(value + 1) keeps exactly the run of trailing 1 bits. */
   return _mesa_bitcount(value & ~(value + 1));
#endif
}
1234
1235
/**
 * Emit code that fetches each of the num_samples samples at the current
 * (X, Y) with texel_fetch() and combines them into texture_data[0].
 *
 * Samples are combined with ADD when key->texture_data_type is float (and
 * the sum is then multiplied by 1/num_samples), and with AVG otherwise.
 *
 * For INTEL_MSAA_LAYOUT_CMS the MCS value is fetched first, and everything
 * after the fetch of sample 0 is wrapped in an IF on "MCS != 0".
 */
void
brw_blorp_blit_program::manual_blend(unsigned num_samples)
{
   if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
      mcs_fetch();

   /* We add together samples using a binary tree structure, e.g. for 4x MSAA:
    *
    *   result = ((sample[0] + sample[1]) + (sample[2] + sample[3])) / 4
    *
    * This ensures that when all samples have the same value, no numerical
    * precision is lost, since each addition operation always adds two equal
    * values, and summing two equal floating point values does not lose
    * precision.
    *
    * We perform this computation by treating the texture_data array as a
    * stack and performing the following operations:
    *
    * - push sample 0 onto stack
    * - push sample 1 onto stack
    * - add top two stack entries
    * - push sample 2 onto stack
    * - push sample 3 onto stack
    * - add top two stack entries
    * - add top two stack entries
    * - divide top stack entry by 4
    *
    * Note that after pushing sample i onto the stack, the number of add
    * operations we do is equal to the number of trailing 1 bits in i.  This
    * works provided the total number of samples is a power of two, which it
    * always is for i965.
    *
    * For integer formats, we replace the add operations with average
    * operations and skip the final division.
    */
   typedef struct brw_instruction *(*brw_op2_ptr)(struct brw_compile *,
                                                  struct brw_reg,
                                                  struct brw_reg,
                                                  struct brw_reg);
   brw_op2_ptr combine_op =
      key->texture_data_type == BRW_REGISTER_TYPE_F ? brw_ADD : brw_AVG;
   unsigned stack_depth = 0;
   for (unsigned i = 0; i < num_samples; ++i) {
      assert(stack_depth == _mesa_bitcount(i)); /* Loop invariant */

      /* Push sample i onto the stack */
      assert(stack_depth < ARRAY_SIZE(texture_data));
      if (i == 0) {
         /* Sample 0 needs no S register; s_is_zero stands in for it. */
         s_is_zero = true;
      } else {
         s_is_zero = false;
         brw_MOV(&func, S, brw_imm_uw(i));
      }
      texel_fetch(texture_data[stack_depth++]);

      if (i == 0 && key->tex_layout == INTEL_MSAA_LAYOUT_CMS) {
         /* The Ivy Bridge PRM, Vol4 Part1 p27 (Multisample Control Surface)
          * suggests an optimization:
          *
          *     "A simple optimization with probable large return in
          *     performance is to compare the MCS value to zero (indicating
          *     all samples are on sample slice 0), and sample only from
          *     sample slice 0 using ld2dss if MCS is zero."
          *
          * Note that in the case where the MCS value is zero, sampling from
          * sample slice 0 using ld2dss and sampling from sample 0 using
          * ld2dms are equivalent (since all samples are on sample slice 0).
          * Since we have already sampled from sample 0, all we need to do is
          * skip the remaining fetches and averaging if MCS is zero.
          */
         brw_CMP(&func, vec16(brw_null_reg()), BRW_CONDITIONAL_NZ,
                 mcs_data, brw_imm_ud(0));
         /* Closed by the brw_ENDIF at the end of this function. */
         brw_IF(&func, BRW_EXECUTE_16);
      }

      /* Do count_trailing_one_bits(i) times */
      for (int j = count_trailing_one_bits(i); j-- > 0; ) {
         assert(stack_depth >= 2);
         --stack_depth;

         /* Combine the top two stack entries into the lower one, one
          * two-register step at a time.
          */
         /* TODO: should use a smaller loop bound for non_RGBA formats */
         for (int k = 0; k < 4; ++k) {
            combine_op(&func, offset(texture_data[stack_depth - 1], 2*k),
                       offset(vec8(texture_data[stack_depth - 1]), 2*k),
                       offset(vec8(texture_data[stack_depth]), 2*k));
         }
      }
   }

   /* We should have just 1 sample on the stack now. */
   assert(stack_depth == 1);

   if (key->texture_data_type == BRW_REGISTER_TYPE_F) {
      /* Scale the result down by a factor of num_samples */
      /* TODO: should use a smaller loop bound for non-RGBA formats */
      for (int j = 0; j < 4; ++j) {
         brw_MUL(&func, offset(texture_data[0], 2*j),
                 offset(vec8(texture_data[0]), 2*j),
                 brw_imm_f(1.0/num_samples));
      }
   }

   /* Close the IF opened after the first fetch (CMS only). */
   if (key->tex_layout == INTEL_MSAA_LAYOUT_CMS)
      brw_ENDIF(&func);
}
1341
1342 /**
1343  * Emit code to look up a value in the texture using the SAMPLE message (which
1344  * does blending of MSAA surfaces).
1345  */
1346 void
1347 brw_blorp_blit_program::sample(struct brw_reg dst)
1348 {
1349    static const sampler_message_arg args[2] = {
1350       SAMPLER_MESSAGE_ARG_U_FLOAT,
1351       SAMPLER_MESSAGE_ARG_V_FLOAT
1352    };
1353
1354    texture_lookup(dst, GEN5_SAMPLER_MESSAGE_SAMPLE, args, ARRAY_SIZE(args));
1355 }
1356
1357 /**
1358  * Emit code to look up a value in the texture using the SAMPLE_LD message
1359  * (which does a simple texel fetch).
1360  */
1361 void
1362 brw_blorp_blit_program::texel_fetch(struct brw_reg dst)
1363 {
1364    static const sampler_message_arg gen6_args[5] = {
1365       SAMPLER_MESSAGE_ARG_U_INT,
1366       SAMPLER_MESSAGE_ARG_V_INT,
1367       SAMPLER_MESSAGE_ARG_ZERO_INT, /* R */
1368       SAMPLER_MESSAGE_ARG_ZERO_INT, /* LOD */
1369       SAMPLER_MESSAGE_ARG_SI_INT
1370    };
1371    static const sampler_message_arg gen7_ld_args[3] = {
1372       SAMPLER_MESSAGE_ARG_U_INT,
1373       SAMPLER_MESSAGE_ARG_ZERO_INT, /* LOD */
1374       SAMPLER_MESSAGE_ARG_V_INT
1375    };
1376    static const sampler_message_arg gen7_ld2dss_args[3] = {
1377       SAMPLER_MESSAGE_ARG_SI_INT,
1378       SAMPLER_MESSAGE_ARG_U_INT,
1379       SAMPLER_MESSAGE_ARG_V_INT
1380    };
1381    static const sampler_message_arg gen7_ld2dms_args[4] = {
1382       SAMPLER_MESSAGE_ARG_SI_INT,
1383       SAMPLER_MESSAGE_ARG_MCS_INT,
1384       SAMPLER_MESSAGE_ARG_U_INT,
1385       SAMPLER_MESSAGE_ARG_V_INT
1386    };
1387
1388    switch (brw->intel.gen) {
1389    case 6:
1390       texture_lookup(dst, GEN5_SAMPLER_MESSAGE_SAMPLE_LD, gen6_args,
1391                      s_is_zero ? 2 : 5);
1392       break;
1393    case 7:
1394       switch (key->tex_layout) {
1395       case INTEL_MSAA_LAYOUT_IMS:
1396          /* From the Ivy Bridge PRM, Vol4 Part1 p72 (Multisampled Surface Storage
1397           * Format):
1398           *
1399           *     If this field is MSFMT_DEPTH_STENCIL
1400           *     [a.k.a. INTEL_MSAA_LAYOUT_IMS], the only sampling engine
1401           *     messages allowed are "ld2dms", "resinfo", and "sampleinfo".
1402           *
1403           * So fall through to emit the same message as we use for
1404           * INTEL_MSAA_LAYOUT_CMS.
1405           */
1406       case INTEL_MSAA_LAYOUT_CMS:
1407          texture_lookup(dst, GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS,
1408                         gen7_ld2dms_args, ARRAY_SIZE(gen7_ld2dms_args));
1409          break;
1410       case INTEL_MSAA_LAYOUT_UMS:
1411          texture_lookup(dst, GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS,
1412                         gen7_ld2dss_args, ARRAY_SIZE(gen7_ld2dss_args));
1413          break;
1414       case INTEL_MSAA_LAYOUT_NONE:
1415          assert(s_is_zero);
1416          texture_lookup(dst, GEN5_SAMPLER_MESSAGE_SAMPLE_LD, gen7_ld_args,
1417                         ARRAY_SIZE(gen7_ld_args));
1418          break;
1419       }
1420       break;
1421    default:
1422       assert(!"Should not get here.");
1423       break;
1424    };
1425 }
1426
1427 void
1428 brw_blorp_blit_program::mcs_fetch()
1429 {
1430    static const sampler_message_arg gen7_ld_mcs_args[2] = {
1431       SAMPLER_MESSAGE_ARG_U_INT,
1432       SAMPLER_MESSAGE_ARG_V_INT
1433    };
1434    texture_lookup(vec16(mcs_data), GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS,
1435                   gen7_ld_mcs_args, ARRAY_SIZE(gen7_ld_mcs_args));
1436 }
1437
1438 void
1439 brw_blorp_blit_program::expand_to_32_bits(struct brw_reg src,
1440                                           struct brw_reg dst)
1441 {
1442    brw_MOV(&func, vec8(dst), vec8(src));
1443    brw_set_compression_control(&func, BRW_COMPRESSION_2NDHALF);
1444    brw_MOV(&func, offset(vec8(dst), 1), suboffset(vec8(src), 8));
1445    brw_set_compression_control(&func, BRW_COMPRESSION_NONE);
1446 }
1447
/**
 * Emit code that builds a sampler message payload and sends it.
 *
 * The payload starts at message register base_mrf.  Each entry of args
 * selects what goes into the next two message registers; after all num_args
 * entries have been handled, a SIMD16 sampler message of type msg_type is
 * sent and its response is written to dst.
 */
void
brw_blorp_blit_program::texture_lookup(struct brw_reg dst,
                                       GLuint msg_type,
                                       const sampler_message_arg *args,
                                       int num_args)
{
   /* Cursor over the payload; advanced by two registers per argument. */
   struct brw_reg mrf =
      retype(vec16(brw_message_reg(base_mrf)), BRW_REGISTER_TYPE_UD);
   for (int arg = 0; arg < num_args; ++arg) {
      switch (args[arg]) {
      case SAMPLER_MESSAGE_ARG_U_FLOAT:
         expand_to_32_bits(X, retype(mrf, BRW_REGISTER_TYPE_F));
         break;
      case SAMPLER_MESSAGE_ARG_V_FLOAT:
         expand_to_32_bits(Y, retype(mrf, BRW_REGISTER_TYPE_F));
         break;
      case SAMPLER_MESSAGE_ARG_U_INT:
         expand_to_32_bits(X, mrf);
         break;
      case SAMPLER_MESSAGE_ARG_V_INT:
         expand_to_32_bits(Y, mrf);
         break;
      case SAMPLER_MESSAGE_ARG_SI_INT:
         /* Note: on Gen7, this code may be reached with s_is_zero==true
          * because in Gen7's ld2dss message, the sample index is the first
          * argument.  When this happens, we need to move a 0 into the
          * appropriate message register.
          */
         if (s_is_zero)
            brw_MOV(&func, mrf, brw_imm_ud(0));
         else
            expand_to_32_bits(S, mrf);
         break;
      case SAMPLER_MESSAGE_ARG_MCS_INT:
         switch (key->tex_layout) {
         case INTEL_MSAA_LAYOUT_CMS:
            brw_MOV(&func, mrf, mcs_data);
            break;
         case INTEL_MSAA_LAYOUT_IMS:
            /* When sampling from an IMS surface, MCS data is not relevant,
             * and the hardware ignores it.  So don't bother populating it.
             * The slot is still skipped over by the "mrf.nr += 2" below.
             */
            break;
         default:
            /* We shouldn't be trying to send MCS data with any other
             * layouts.
             */
            assert (!"Unsupported layout for MCS data");
            break;
         }
         break;
      case SAMPLER_MESSAGE_ARG_ZERO_INT:
         brw_MOV(&func, mrf, brw_imm_ud(0));
         break;
      }
      /* Every argument occupies two message registers. */
      mrf.nr += 2;
   }

   /* mrf.nr now points one past the payload, so mrf.nr - base_mrf is the
    * message length in registers.
    */
   brw_SAMPLE(&func,
              retype(dst, BRW_REGISTER_TYPE_UW) /* dest */,
              base_mrf /* msg_reg_nr */,
              brw_message_reg(base_mrf) /* src0 */,
              BRW_BLORP_TEXTURE_BINDING_TABLE_INDEX,
              0 /* sampler */,
              WRITEMASK_XYZW,
              msg_type,
              8 /* response_length.  TODO: should be smaller for non-RGBA formats? */,
              mrf.nr - base_mrf /* msg_length */,
              0 /* header_present */,
              BRW_SAMPLER_SIMD_MODE_SIMD16,
              BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
}
1520
/* The coordinate shorthand macros are only meant for the code-emitting
 * helpers above, so drop all of them here -- including Xp and Yp, which are
 * defined alongside X and Y.  U and V are not defined in the code visible
 * around this block; undefining an undefined name is harmless, so they are
 * kept.
 */
#undef X
#undef Y
#undef Xp
#undef Yp
#undef U
#undef V
#undef S
#undef SWAP_XY_AND_XPYP
1527
1528 void
1529 brw_blorp_blit_program::render_target_write()
1530 {
1531    struct brw_reg mrf_rt_write =
1532       retype(vec16(brw_message_reg(base_mrf)), key->texture_data_type);
1533    int mrf_offset = 0;
1534
1535    /* If we may have killed pixels, then we need to send R0 and R1 in a header
1536     * so that the render target knows which pixels we killed.
1537     */
1538    bool use_header = key->use_kill;
1539    if (use_header) {
1540       /* Copy R0/1 to MRF */
1541       brw_MOV(&func, retype(mrf_rt_write, BRW_REGISTER_TYPE_UD),
1542               retype(R0, BRW_REGISTER_TYPE_UD));
1543       mrf_offset += 2;
1544    }
1545
1546    /* Copy texture data to MRFs */
1547    for (int i = 0; i < 4; ++i) {
1548       /* E.g. mov(16) m2.0<1>:f r2.0<8;8,1>:f { Align1, H1 } */
1549       brw_MOV(&func, offset(mrf_rt_write, mrf_offset),
1550               offset(vec8(texture_data[0]), 2*i));
1551       mrf_offset += 2;
1552    }
1553
1554    /* Now write to the render target and terminate the thread */
1555    brw_fb_WRITE(&func,
1556                 16 /* dispatch_width */,
1557                 base_mrf /* msg_reg_nr */,
1558                 mrf_rt_write /* src0 */,
1559                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
1560                 BRW_BLORP_RENDERBUFFER_BINDING_TABLE_INDEX,
1561                 mrf_offset /* msg_length.  TODO: Should be smaller for non-RGBA formats. */,
1562                 0 /* response_length */,
1563                 true /* eot */,
1564                 use_header);
1565 }
1566
1567
1568 void
1569 brw_blorp_coord_transform_params::setup(GLuint src0, GLuint dst0, GLuint dst1,
1570                                         bool mirror)
1571 {
1572    if (!mirror) {
1573       /* When not mirroring a coordinate (say, X), we need:
1574        *   x' - src_x0 = x - dst_x0
1575        * Therefore:
1576        *   x' = 1*x + (src_x0 - dst_x0)
1577        */
1578       multiplier = 1;
1579       offset = src0 - dst0;
1580    } else {
1581       /* When mirroring X we need:
1582        *   x' - src_x0 = dst_x1 - x - 1
1583        * Therefore:
1584        *   x' = -1*x + (src_x0 + dst_x1 - 1)
1585        */
1586       multiplier = -1;
1587       offset = src0 + dst1 - 1;
1588    }
1589 }
1590
1591
1592 /**
1593  * Determine which MSAA layout the GPU pipeline should be configured for,
1594  * based on the chip generation, the number of samples, and the true layout of
1595  * the image in memory.
1596  */
1597 inline intel_msaa_layout
1598 compute_msaa_layout_for_pipeline(struct brw_context *brw, unsigned num_samples,
1599                                  intel_msaa_layout true_layout)
1600 {
1601    if (num_samples <= 1) {
1602       /* When configuring the GPU for non-MSAA, we can still accommodate IMS
1603        * format buffers, by transforming coordinates appropriately.
1604        */
1605       assert(true_layout == INTEL_MSAA_LAYOUT_NONE ||
1606              true_layout == INTEL_MSAA_LAYOUT_IMS);
1607       return INTEL_MSAA_LAYOUT_NONE;
1608    } else {
1609       assert(true_layout != INTEL_MSAA_LAYOUT_NONE);
1610    }
1611
1612    /* Prior to Gen7, all MSAA surfaces use IMS layout. */
1613    if (brw->intel.gen == 6) {
1614       assert(true_layout == INTEL_MSAA_LAYOUT_IMS);
1615    }
1616
1617    return true_layout;
1618 }
1619
1620
1621 brw_blorp_blit_params::brw_blorp_blit_params(struct brw_context *brw,
1622                                              struct intel_mipmap_tree *src_mt,
1623                                              struct intel_mipmap_tree *dst_mt,
1624                                              GLuint src_x0, GLuint src_y0,
1625                                              GLuint dst_x0, GLuint dst_y0,
1626                                              GLuint dst_x1, GLuint dst_y1,
1627                                              bool mirror_x, bool mirror_y)
1628 {
1629    src.set(brw, src_mt, 0, 0);
1630    dst.set(brw, dst_mt, 0, 0);
1631
1632    use_wm_prog = true;
1633    memset(&wm_prog_key, 0, sizeof(wm_prog_key));
1634
1635    /* texture_data_type indicates the register type that should be used to
1636     * manipulate texture data.
1637     */
1638    switch (_mesa_get_format_datatype(src_mt->format)) {
1639    case GL_UNSIGNED_NORMALIZED:
1640    case GL_SIGNED_NORMALIZED:
1641    case GL_FLOAT:
1642       wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1643       break;
1644    case GL_UNSIGNED_INT:
1645       if (src_mt->format == MESA_FORMAT_S8) {
1646          /* We process stencil as though it's an unsigned normalized color */
1647          wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_F;
1648       } else {
1649          wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_UD;
1650       }
1651       break;
1652    case GL_INT:
1653       wm_prog_key.texture_data_type = BRW_REGISTER_TYPE_D;
1654       break;
1655    default:
1656       assert(!"Unrecognized blorp format");
1657       break;
1658    }
1659
1660    if (brw->intel.gen > 6) {
1661       /* Gen7's rendering hardware only supports the IMS layout for depth and
1662        * stencil render targets.  Blorp always maps its destination surface as
1663        * a color render target (even if it's actually a depth or stencil
1664        * buffer).  So if the destination is IMS, we'll have to map it as a
1665        * single-sampled texture and interleave the samples ourselves.
1666        */
1667       if (dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS)
1668          dst.num_samples = 0;
1669    }
1670
1671    if (dst.map_stencil_as_y_tiled && dst.num_samples > 1) {
1672       /* If the destination surface is a W-tiled multisampled stencil buffer
1673        * that we're mapping as Y tiled, then we need to arrange for the WM
1674        * program to run once per sample rather than once per pixel, because
1675        * the memory layout of related samples doesn't match between W and Y
1676        * tiling.
1677        */
1678       wm_prog_key.persample_msaa_dispatch = true;
1679    }
1680
1681    if (src.num_samples > 0 && dst.num_samples > 1) {
1682       /* We are blitting from a multisample buffer to a multisample buffer, so
1683        * we must preserve samples within a pixel.  This means we have to
1684        * arrange for the WM program to run once per sample rather than once
1685        * per pixel.
1686        */
1687       wm_prog_key.persample_msaa_dispatch = true;
1688    }
1689
1690    /* The render path must be configured to use the same number of samples as
1691     * the destination buffer.
1692     */
1693    num_samples = dst.num_samples;
1694
1695    GLenum base_format = _mesa_get_format_base_format(src_mt->format);
1696    if (base_format != GL_DEPTH_COMPONENT && /* TODO: what about depth/stencil? */
1697        base_format != GL_STENCIL_INDEX &&
1698        src_mt->num_samples > 1 && dst_mt->num_samples <= 1) {
1699       /* We are downsampling a color buffer, so blend. */
1700       wm_prog_key.blend = true;
1701    }
1702
1703    /* src_samples and dst_samples are the true sample counts */
1704    wm_prog_key.src_samples = src_mt->num_samples;
1705    wm_prog_key.dst_samples = dst_mt->num_samples;
1706
1707    /* tex_samples and rt_samples are the sample counts that are set up in
1708     * SURFACE_STATE.
1709     */
1710    wm_prog_key.tex_samples = src.num_samples;
1711    wm_prog_key.rt_samples  = dst.num_samples;
1712
1713    /* tex_layout and rt_layout indicate the MSAA layout the GPU pipeline will
1714     * use to access the source and destination surfaces.
1715     */
1716    wm_prog_key.tex_layout =
1717       compute_msaa_layout_for_pipeline(brw, src.num_samples, src.msaa_layout);
1718    wm_prog_key.rt_layout =
1719       compute_msaa_layout_for_pipeline(brw, dst.num_samples, dst.msaa_layout);
1720
1721    /* src_layout and dst_layout indicate the true MSAA layout used by src and
1722     * dst.
1723     */
1724    wm_prog_key.src_layout = src_mt->msaa_layout;
1725    wm_prog_key.dst_layout = dst_mt->msaa_layout;
1726
1727    wm_prog_key.src_tiled_w = src.map_stencil_as_y_tiled;
1728    wm_prog_key.dst_tiled_w = dst.map_stencil_as_y_tiled;
1729    x0 = wm_push_consts.dst_x0 = dst_x0;
1730    y0 = wm_push_consts.dst_y0 = dst_y0;
1731    x1 = wm_push_consts.dst_x1 = dst_x1;
1732    y1 = wm_push_consts.dst_y1 = dst_y1;
1733    wm_push_consts.x_transform.setup(src_x0, dst_x0, dst_x1, mirror_x);
1734    wm_push_consts.y_transform.setup(src_y0, dst_y0, dst_y1, mirror_y);
1735
1736    if (dst.num_samples <= 1 && dst_mt->num_samples > 1) {
1737       /* We must expand the rectangle we send through the rendering pipeline,
1738        * to account for the fact that we are mapping the destination region as
1739        * single-sampled when it is in fact multisampled.  We must also align
1740        * it to a multiple of the multisampling pattern, because the
1741        * differences between multisampled and single-sampled surface formats
1742        * will mean that pixels are scrambled within the multisampling pattern.
1743        * TODO: what if this makes the coordinates too large?
1744        *
1745        * Note: this only works if the destination surface uses the IMS layout.
1746        * If it's UMS, then we have no choice but to set up the rendering
1747        * pipeline as multisampled.
1748        */
1749       assert(dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS);
1750       switch (dst_mt->num_samples) {
1751       case 4:
1752          x0 = ROUND_DOWN_TO(x0 * 2, 4);
1753          y0 = ROUND_DOWN_TO(y0 * 2, 4);
1754          x1 = ALIGN(x1 * 2, 4);
1755          y1 = ALIGN(y1 * 2, 4);
1756          break;
1757       case 8:
1758          x0 = ROUND_DOWN_TO(x0 * 4, 8);
1759          y0 = ROUND_DOWN_TO(y0 * 2, 4);
1760          x1 = ALIGN(x1 * 4, 8);
1761          y1 = ALIGN(y1 * 2, 4);
1762          break;
1763       default:
1764          assert(!"Unrecognized sample count in brw_blorp_blit_params ctor");
1765          break;
1766       }
1767       wm_prog_key.use_kill = true;
1768    }
1769
1770    if (dst.map_stencil_as_y_tiled) {
1771       /* We must modify the rectangle we send through the rendering pipeline,
1772        * to account for the fact that we are mapping it as Y-tiled when it is
1773        * in fact W-tiled.  Y tiles have dimensions 128x32 whereas W tiles have
1774        * dimensions 64x64.  We must also align it to a multiple of the tile
1775        * size, because the differences between W and Y tiling formats will
1776        * mean that pixels are scrambled within the tile.
1777        *
1778        * Note: if the destination surface configured to use IMS layout, then
1779        * the effective tile size we need to align it to is smaller, because
1780        * each pixel covers a 2x2 or a 4x2 block of samples.
1781        *
1782        * TODO: what if this makes the coordinates too large?
1783        */
1784       unsigned x_align = 64, y_align = 64;
1785       if (dst_mt->msaa_layout == INTEL_MSAA_LAYOUT_IMS) {
1786          x_align /= (dst_mt->num_samples == 4 ? 2 : 4);
1787          y_align /= 2;
1788       }
1789       x0 = ROUND_DOWN_TO(x0, x_align) * 2;
1790       y0 = ROUND_DOWN_TO(y0, y_align) / 2;
1791       x1 = ALIGN(x1, x_align) * 2;
1792       y1 = ALIGN(y1, y_align) / 2;
1793       wm_prog_key.use_kill = true;
1794    }
1795 }
1796
1797 uint32_t
1798 brw_blorp_blit_params::get_wm_prog(struct brw_context *brw,
1799                                    brw_blorp_prog_data **prog_data) const
1800 {
1801    uint32_t prog_offset;
1802    if (!brw_search_cache(&brw->cache, BRW_BLORP_BLIT_PROG,
1803                          &this->wm_prog_key, sizeof(this->wm_prog_key),
1804                          &prog_offset, prog_data)) {
1805       brw_blorp_blit_program prog(brw, &this->wm_prog_key);
1806       GLuint program_size;
1807       const GLuint *program = prog.compile(brw, &program_size);
1808       brw_upload_cache(&brw->cache, BRW_BLORP_BLIT_PROG,
1809                        &this->wm_prog_key, sizeof(this->wm_prog_key),
1810                        program, program_size,
1811                        &prog.prog_data, sizeof(prog.prog_data),
1812                        &prog_offset, prog_data);
1813    }
1814    return prog_offset;
1815 }