src/mesa/drivers/dri/i965/genX_state_upload.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25
  26 #include "common/gen_device_info.h"
  27 #include "common/gen_sample_positions.h"
  28 #include "genxml/gen_macros.h"
  29
  30 #include "main/bufferobj.h"
  31 #include "main/context.h"
  32 #include "main/enums.h"
  33 #include "main/macros.h"
  34 #include "main/state.h"
  35
  36 #include "brw_context.h"
  37 #include "brw_draw.h"
  38 #include "brw_multisample_state.h"
  39 #include "brw_state.h"
  40 #include "brw_wm.h"
  41 #include "brw_util.h"
  42
  43 #include "intel_batchbuffer.h"
  44 #include "intel_buffer_objects.h"
  45 #include "intel_fbo.h"
  46
  47 #include "main/enums.h"
  48 #include "main/fbobject.h"
  49 #include "main/framebuffer.h"
  50 #include "main/glformats.h"
  51 #include "main/samplerobj.h"
  52 #include "main/shaderapi.h"
  53 #include "main/stencil.h"
  54 #include "main/transformfeedback.h"
  55 #include "main/varray.h"
  56 #include "main/viewport.h"
  57 #include "util/half_float.h"
  58
  59 UNUSED static void *
  60 emit_dwords(struct brw_context *brw, unsigned n)
  61 {
  62    intel_batchbuffer_begin(brw, n, RENDER_RING);
  63    uint32_t *map = brw->batch.map_next;
  64    brw->batch.map_next += n;
  65    intel_batchbuffer_advance(brw);
  66    return map;
  67 }
  68
/**
 * Address value handed to the pack functions: a buffer object, an offset
 * into it, and the relocation flags to use when the address is written out.
 *
 * A NULL \c bo means "no relocation": __gen_combine_address() then returns
 * offset + delta as a plain number.
 */
struct brw_address {
   struct brw_bo *bo;        /* target buffer, or NULL for a raw value */
   unsigned reloc_flags;     /* RELOC_* flags forwarded to the reloc call */
   uint32_t offset;          /* offset within bo (or the raw value) */
};

/* Type hooks defined ahead of the genxml/genX_pack.h include further down.
 * NOTE(review): presumably the generated pack code uses these as the type of
 * address fields and of the user-data pointer — confirm against genX_pack.h.
 */
#define __gen_address_type struct brw_address
#define __gen_user_data struct brw_context
  77
  78 static uint64_t
  79 __gen_combine_address(struct brw_context *brw, void *location,
  80                       struct brw_address address, uint32_t delta)
  81 {
  82    struct intel_batchbuffer *batch = &brw->batch;
  83    uint32_t offset;
  84
  85    if (address.bo == NULL) {
  86       return address.offset + delta;
  87    } else {
  88       if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
  89          offset = (char *) location - (char *) brw->batch.state.map;
  90          return brw_state_reloc(batch, offset, address.bo,
  91                                 address.offset + delta,
  92                                 address.reloc_flags);
  93       }
  94
  95       assert(!brw_ptr_in_state_buffer(batch, location));
  96
  97       offset = (char *) location - (char *) brw->batch.batch.map;
  98       return brw_batch_reloc(batch, offset, address.bo,
  99                              address.offset + delta,
 100                              address.reloc_flags);
 101    }
 102 }
 103
 104 static struct brw_address
 105 rw_bo(struct brw_bo *bo, uint32_t offset)
 106 {
 107    return (struct brw_address) {
 108             .bo = bo,
 109             .offset = offset,
 110             .reloc_flags = RELOC_WRITE,
 111    };
 112 }
 113
 114 static struct brw_address
 115 ro_bo(struct brw_bo *bo, uint32_t offset)
 116 {
 117    return (struct brw_address) {
 118             .bo = bo,
 119             .offset = offset,
 120    };
 121 }
 122
 123 UNUSED static struct brw_address
 124 ggtt_bo(struct brw_bo *bo, uint32_t offset)
 125 {
 126    return (struct brw_address) {
 127             .bo = bo,
 128             .offset = offset,
 129             .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
 130    };
 131 }
 132
 133 #if GEN_GEN == 4
 134 static struct brw_address
 135 KSP(struct brw_context *brw, uint32_t offset)
 136 {
 137    return ro_bo(brw->cache.bo, offset);
 138 }
 139 #else
 140 static uint32_t
 141 KSP(struct brw_context *brw, uint32_t offset)
 142 {
 143    return offset;
 144 }
 145 #endif
 146
 147 #include "genxml/genX_pack.h"
 148
/* Token-pasting helpers: given a command name, form the names of its
 * companion <cmd>_length, <cmd>_length_bias, <cmd>_header and <cmd>_pack
 * symbols.  They are used by the emit macros that follow.
 */
#define _brw_cmd_length(cmd) cmd ## _length
#define _brw_cmd_length_bias(cmd) cmd ## _length_bias
#define _brw_cmd_header(cmd) cmd ## _header
#define _brw_cmd_pack(cmd) cmd ## _pack
 153
/* Emit a fixed-length command into the batch.
 *
 * Usage:  brw_batch_emit(brw, GENX(CMD), name) { name.Field = ...; }
 *
 * The macro is a one-shot for loop: the init clause declares a struct cmd
 * called `name` (initialized with the command header) and reserves
 * <cmd>_length dwords via emit_dwords(); the attached statement runs once
 * so the caller can fill in fields; the increment clause then packs `name`
 * into the reserved dwords and clears _dst, which ends the loop.
 */
#define brw_batch_emit(brw, cmd, name)                  \
   for (struct cmd name = { _brw_cmd_header(cmd) },     \
        *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);              \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
        _dst = NULL)
 160
 161 #define brw_batch_emitn(brw, cmd, n, ...) ({           \
 162       uint32_t *_dw = emit_dwords(brw, n);             \
 163       struct cmd template = {                          \
 164          _brw_cmd_header(cmd),                         \
 165          .DWordLength = n - _brw_cmd_length_bias(cmd), \
 166          __VA_ARGS__                                   \
 167       };                                               \
 168       _brw_cmd_pack(cmd)(brw, _dw, &template);         \
 169       _dw + 1; /* Array starts at dw[1] */             \
 170    })
 171
/* Emit a state structure through brw_state_batch().
 *
 * Same one-shot for-loop shape as brw_batch_emit(), except that `name`
 * starts out empty-initialized and the destination comes from
 * brw_state_batch(brw, <cmd>_length * 4, align, offset) rather than from the
 * command batch.  The struct is packed into that destination after the
 * attached statement has run.
 * NOTE(review): `offset` is presumably an out-parameter receiving the
 * location of the allocation — confirm against brw_state_batch().
 */
#define brw_state_emit(brw, cmd, align, offset, name)              \
   for (struct cmd name = {},                                      \
        *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
                                align, offset);                    \
        __builtin_expect(_dst != NULL, 1);                         \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
        _dst = NULL)
 179
 180 /**
 181  * Polygon stipple packet
 182  */
 183 static void
 184 genX(upload_polygon_stipple)(struct brw_context *brw)
 185 {
 186    struct gl_context *ctx = &brw->ctx;
 187
 188    /* _NEW_POLYGON */
 189    if (!ctx->Polygon.StippleFlag)
 190       return;
 191
 192    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
 193       /* Polygon stipple is provided in OpenGL order, i.e. bottom
 194        * row first.  If we're rendering to a window (i.e. the
 195        * default frame buffer object, 0), then we need to invert
 196        * it to match our pixel layout.  But if we're rendering
 197        * to a FBO (i.e. any named frame buffer object), we *don't*
 198        * need to invert - we already match the layout.
 199        */
 200       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 201          for (unsigned i = 0; i < 32; i++)
 202             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
 203       } else {
 204          for (unsigned i = 0; i < 32; i++)
 205             poly.PatternRow[i] = ctx->PolygonStipple[i];
 206       }
 207    }
 208 }
 209
/* State atom pairing genX(upload_polygon_stipple) with the dirty bits below.
 * NOTE(review): presumably the state-upload loop calls .emit whenever one of
 * these bits is flagged — confirm against the brw_tracked_state users.
 */
static const struct brw_tracked_state genX(polygon_stipple) = {
   .dirty = {
      .mesa = _NEW_POLYGON |
              _NEW_POLYGONSTIPPLE,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_polygon_stipple),
};
 218
 219 /**
 220  * Polygon stipple offset packet
 221  */
 222 static void
 223 genX(upload_polygon_stipple_offset)(struct brw_context *brw)
 224 {
 225    struct gl_context *ctx = &brw->ctx;
 226
 227    /* _NEW_POLYGON */
 228    if (!ctx->Polygon.StippleFlag)
 229       return;
 230
 231    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
 232       /* _NEW_BUFFERS
 233        *
 234        * If we're drawing to a system window we have to invert the Y axis
 235        * in order to match the OpenGL pixel coordinate system, and our
 236        * offset must be matched to the window position.  If we're drawing
 237        * to a user-created FBO then our native pixel coordinate system
 238        * works just fine, and there's no window system to worry about.
 239        */
 240       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 241          poly.PolygonStippleYOffset =
 242             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
 243       }
 244    }
 245 }
 246
/* State atom pairing genX(upload_polygon_stipple_offset) with the dirty bits
 * below.  _NEW_BUFFERS is listed because the emitted Y offset is derived
 * from the draw buffer's height.
 */
static const struct brw_tracked_state genX(polygon_stipple_offset) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_POLYGON,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_polygon_stipple_offset),
};
 255
 256 /**
 257  * Line stipple packet
 258  */
 259 static void
 260 genX(upload_line_stipple)(struct brw_context *brw)
 261 {
 262    struct gl_context *ctx = &brw->ctx;
 263
 264    if (!ctx->Line.StippleFlag)
 265       return;
 266
 267    brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
 268       line.LineStipplePattern = ctx->Line.StipplePattern;
 269
 270       line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
 271       line.LineStippleRepeatCount = ctx->Line.StippleFactor;
 272    }
 273 }
 274
/* State atom pairing genX(upload_line_stipple) with the dirty bits below. */
static const struct brw_tracked_state genX(line_stipple) = {
   .dirty = {
      .mesa = _NEW_LINE,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_line_stipple),
};
 282
 283 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 284 static void
 285 genX(upload_drawing_rect)(struct brw_context *brw)
 286 {
 287    struct gl_context *ctx = &brw->ctx;
 288    const struct gl_framebuffer *fb = ctx->DrawBuffer;
 289    const unsigned int fb_width = _mesa_geometric_width(fb);
 290    const unsigned int fb_height = _mesa_geometric_height(fb);
 291
 292    brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
 293       rect.ClippedDrawingRectangleXMax = fb_width - 1;
 294       rect.ClippedDrawingRectangleYMax = fb_height - 1;
 295    }
 296 }
 297
/* State atom pairing genX(upload_drawing_rect) with the dirty bits below. */
static const struct brw_tracked_state genX(drawing_rect) = {
   .dirty = {
      .mesa = _NEW_BUFFERS,
      .brw = BRW_NEW_BLORP |
             BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_drawing_rect),
};
 306
/**
 * Pack one VERTEX_BUFFER_STATE entry at \p dw and return the pointer just
 * past it, so consecutive calls fill a contiguous array.
 *
 * \param dw            destination for the packed entry
 * \param buffer_nr     value stored in VertexBufferIndex
 * \param bo            buffer object holding the data
 * \param start_offset  offset in \p bo of the first element
 * \param end_offset    offset in \p bo one past the end of the data; used
 *                      as a size (end - start) on Gen8+ and as an inclusive
 *                      end address (end - 1) on Gen5-7
 * \param stride        value stored in BufferPitch
 * \param step_rate     instance step rate; on Gen < 8 a non-zero value
 *                      also selects INSTANCEDATA access
 */
static uint32_t *
genX(emit_vertex_buffer_state)(struct brw_context *brw,
                               uint32_t *dw,
                               unsigned buffer_nr,
                               struct brw_bo *bo,
                               unsigned start_offset,
                               unsigned end_offset,
                               unsigned stride,
                               unsigned step_rate)
{
   struct GENX(VERTEX_BUFFER_STATE) buf_state = {
      .VertexBufferIndex = buffer_nr,
      .BufferPitch = stride,
      .BufferStartingAddress = ro_bo(bo, start_offset),
#if GEN_GEN >= 8
      /* Gen8+ describes the extent as a size... */
      .BufferSize = end_offset - start_offset,
#endif

#if GEN_GEN >= 7
      .AddressModifyEnable = true,
#endif

#if GEN_GEN < 8
      /* ...while older generations carry the instancing controls in the
       * buffer state and (Gen5+) an inclusive end address.
       */
      .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
      .InstanceDataStepRate = step_rate,
#if GEN_GEN >= 5
      .EndAddress = ro_bo(bo, end_offset - 1),
#endif
#endif

      /* Per-generation memory object control state for vertex fetches. */
#if GEN_GEN == 11
      .VertexBufferMOCS = ICL_MOCS_WB,
#elif GEN_GEN == 10
      .VertexBufferMOCS = CNL_MOCS_WB,
#elif GEN_GEN == 9
      .VertexBufferMOCS = SKL_MOCS_WB,
#elif GEN_GEN == 8
      .VertexBufferMOCS = BDW_MOCS_WB,
#elif GEN_GEN == 7
      .VertexBufferMOCS = GEN7_MOCS_L3,
#endif
   };

   GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
   return dw + GENX(VERTEX_BUFFER_STATE_length);
}
 353
 354 UNUSED static bool
 355 is_passthru_format(uint32_t format)
 356 {
 357    switch (format) {
 358    case ISL_FORMAT_R64_PASSTHRU:
 359    case ISL_FORMAT_R64G64_PASSTHRU:
 360    case ISL_FORMAT_R64G64B64_PASSTHRU:
 361    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 362       return true;
 363    default:
 364       return false;
 365    }
 366 }
 367
 368 UNUSED static int
 369 uploads_needed(uint32_t format,
 370                bool is_dual_slot)
 371 {
 372    if (!is_passthru_format(format))
 373       return 1;
 374
 375    if (is_dual_slot)
 376       return 2;
 377
 378    switch (format) {
 379    case ISL_FORMAT_R64_PASSTHRU:
 380    case ISL_FORMAT_R64G64_PASSTHRU:
 381       return 1;
 382    case ISL_FORMAT_R64G64B64_PASSTHRU:
 383    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 384       return 2;
 385    default:
 386       unreachable("not reached");
 387    }
 388 }
 389
 390 /*
 391  * Returns the format that we are finally going to use when upload a vertex
 392  * element. It will only change if we are using *64*PASSTHRU formats, as for
 393  * gen < 8 they need to be splitted on two *32*FLOAT formats.
 394  *
 395  * @upload points in which upload we are. Valid values are [0,1]
 396  */
 397 static uint32_t
 398 downsize_format_if_needed(uint32_t format,
 399                           int upload)
 400 {
 401    assert(upload == 0 || upload == 1);
 402
 403    if (!is_passthru_format(format))
 404       return format;
 405
 406    /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload ==
 407     * 1 means that we have been forced to do 2 uploads for a size <= 2. This
 408     * happens with gen < 8 and dvec3 or dvec4 vertex shader input
 409     * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
 410     * flagging that we want to fill with zeroes this second forced upload.
 411     */
 412    switch (format) {
 413    case ISL_FORMAT_R64_PASSTHRU:
 414       return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
 415                          : ISL_FORMAT_R32_FLOAT;
 416    case ISL_FORMAT_R64G64_PASSTHRU:
 417       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 418                          : ISL_FORMAT_R32_FLOAT;
 419    case ISL_FORMAT_R64G64B64_PASSTHRU:
 420       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 421                          : ISL_FORMAT_R32G32_FLOAT;
 422    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 423       return ISL_FORMAT_R32G32B32A32_FLOAT;
 424    default:
 425       unreachable("not reached");
 426    }
 427 }
 428
 429 /*
 430  * Returns the number of componentes associated with a format that is used on
 431  * a 64 to 32 format split. See downsize_format()
 432  */
 433 static int
 434 upload_format_size(uint32_t upload_format)
 435 {
 436    switch (upload_format) {
 437    case ISL_FORMAT_R32_FLOAT:
 438
 439       /* downsized_format has returned this one in order to flag that we are
 440        * performing a second upload which we want to have filled with
 441        * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4
 442        * vertex shader input variables.
 443        */
 444
 445       return 0;
 446    case ISL_FORMAT_R32G32_FLOAT:
 447       return 2;
 448    case ISL_FORMAT_R32G32B32A32_FLOAT:
 449       return 4;
 450    default:
 451       unreachable("not reached");
 452    }
 453 }
 454
/**
 * Emit the vertex-fetch state for the current draw.
 *
 * In order, this emits:
 *  - on Gen8+, 3DSTATE_VF_SGVS (plus a 3DSTATE_VF_INSTANCING for the element
 *    that receives VertexID/InstanceID);
 *  - 3DSTATE_VERTEX_BUFFERS for the application buffers followed by the
 *    optional draw-parameter and draw-ID buffers;
 *  - 3DSTATE_VERTEX_ELEMENTS, laid out as: enabled arrays (with the edge
 *    flag skipped on Gen6+), the optional system-generated-value element,
 *    the optional draw-ID element, and finally the edge flag element on
 *    Gen6+;
 *  - on Gen8+, one 3DSTATE_VF_INSTANCING per enabled array and one for the
 *    draw-ID element.
 *
 * If the vertex shader reads no inputs at all, a single padding element is
 * emitted instead and the function returns early.
 */
static void
genX(emit_vertices)(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   uint32_t *dw;

   brw_prepare_vertices(brw);
   brw_prepare_shader_draw_parameters(brw);

#if GEN_GEN < 6
   brw_emit_query_begin(brw);
#endif

   const struct brw_vs_prog_data *vs_prog_data =
      brw_vs_prog_data(brw->vs.base.prog_data);

#if GEN_GEN >= 8
   struct gl_context *ctx = &brw->ctx;
   /* An edge flag element is present whenever either face is not filled. */
   const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
                                ctx->Polygon.BackMode != GL_FILL);

   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
      /* Index of the element that will receive VertexID/InstanceID: the
       * one right after the enabled arrays.
       */
      unsigned vue = brw->vb.nr_enabled;

      /* The element for the edge flags must always be last, so we have to
       * insert the SGVS before it in that case.
       */
      if (uses_edge_flag) {
         assert(vue > 0);
         vue--;
      }

      WARN_ONCE(vue >= 33,
                "Trying to insert VID/IID past 33rd vertex element, "
                "need to reorder the vertex attrbutes.");

      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
         if (vs_prog_data->uses_vertexid) {
            vfs.VertexIDEnable = true;
            vfs.VertexIDComponentNumber = 2;
            vfs.VertexIDElementOffset = vue;
         }

         if (vs_prog_data->uses_instanceid) {
            vfs.InstanceIDEnable = true;
            vfs.InstanceIDComponentNumber = 3;
            vfs.InstanceIDElementOffset = vue;
         }
      }

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.InstancingEnable = true;
         vfi.VertexElementIndex = vue;
      }
   } else {
      /* Emit the packet with all fields at their defaults. */
      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
   }
#endif

   /* One extra element is shared by base vertex, base instance, vertex ID
    * and instance ID.
    */
   const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
                                    vs_prog_data->uses_baseinstance ||
                                    vs_prog_data->uses_instanceid ||
                                    vs_prog_data->uses_vertexid);

   unsigned nr_elements =
      brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;

#if GEN_GEN < 8
   /* If any of the formats of vb.enabled needs more than one upload, we need
    * to add it to nr_elements
    */
   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
      struct brw_vertex_element *input = brw->vb.enabled[i];
      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);

      if (uploads_needed(format, input->is_dual_slot) > 1)
         nr_elements++;
   }
#endif

   /* If the VS doesn't read any inputs (calculating vertex position from
    * a state variable for some reason, for example), emit a single pad
    * VERTEX_ELEMENT struct and bail.
    *
    * The stale VB state stays in place, but they don't do anything unless
    * a VE loads from them.
    */
   if (nr_elements == 0) {
      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
                           1 + GENX(VERTEX_ELEMENT_STATE_length));
      struct GENX(VERTEX_ELEMENT_STATE) elem = {
         .Valid = true,
         .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_1_FP,
      };
      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
      return;
   }

   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
   const bool uses_draw_params =
      vs_prog_data->uses_basevertex ||
      vs_prog_data->uses_baseinstance;
   const unsigned nr_buffers = brw->vb.nr_buffers +
      uses_draw_params + vs_prog_data->uses_drawid;

   if (nr_buffers) {
      assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));

      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
                           1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);

      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
         /* Prior to Haswell and Bay Trail we have to use 4-component formats
          * to fake 3-component ones.  In particular, we do this for
          * half-float and 8 and 16-bit integer formats.  This means that the
          * vertex element may poke over the end of the buffer by 2 bytes.
          */
         const unsigned padding =
            (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
         const unsigned end = buffer->offset + buffer->size + padding;
         dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
                                             buffer->offset,
                                             end,
                                             buffer->stride,
                                             buffer->step_rate);
      }

      /* The draw-parameter buffer takes the slot right after the
       * application buffers...
       */
      if (uses_draw_params) {
         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
                                             brw->draw.draw_params_bo,
                                             brw->draw.draw_params_offset,
                                             brw->draw.draw_params_bo->size,
                                             0 /* stride */,
                                             0 /* step rate */);
      }

      /* ...and the draw-ID buffer always uses index nr_buffers + 1, whether
       * or not the draw-parameter buffer was emitted.
       */
      if (vs_prog_data->uses_drawid) {
         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
                                             brw->draw.draw_id_bo,
                                             brw->draw.draw_id_offset,
                                             brw->draw.draw_id_bo->size,
                                             0 /* stride */,
                                             0 /* step rate */);
      }
   }

   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
    * presumably for VertexID/InstanceID.
    */
#if GEN_GEN >= 6
   assert(nr_elements <= 34);
   /* Set when the edge flag array is found below; it is emitted last. */
   const struct brw_vertex_element *gen6_edgeflag_input = NULL;
#else
   assert(nr_elements <= 18);
#endif

   dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
                        1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
   /* i is declared outside the loop because the Gen < 5 code further down
    * reuses its final value for DestinationElementOffset.
    */
   unsigned i;
   for (i = 0; i < brw->vb.nr_enabled; i++) {
      const struct brw_vertex_element *input = brw->vb.enabled[i];
      uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
      uint32_t comp0 = VFCOMP_STORE_SRC;
      uint32_t comp1 = VFCOMP_STORE_SRC;
      uint32_t comp2 = VFCOMP_STORE_SRC;
      uint32_t comp3 = VFCOMP_STORE_SRC;
      const unsigned num_uploads = GEN_GEN < 8 ?
         uploads_needed(format, input->is_dual_slot) : 1;

#if GEN_GEN >= 8
      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
       * element which has edge flag enabled."
       */
      assert(!(is_passthru_format(format) && uses_edge_flag));
#endif

      /* The gen4 driver expects edgeflag to come in as a float, and passes
       * that float on to the tests in the clipper.  Mesa's current vertex
       * attribute value for EdgeFlag is stored as a float, which works out.
       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
       * integer ubyte.  Just rewrite that to convert to a float.
       *
       * Gen6+ passes edgeflag as sideband along with the vertex, instead
       * of in the VUE.  We have to upload it sideband as the last vertex
       * element according to the B-Spec.
       */
#if GEN_GEN >= 6
      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
         gen6_edgeflag_input = input;
         continue;
      }
#endif

      /* comp0..comp3 are declared outside this loop, so an override made
       * for an earlier upload stays in effect for the following one.
       */
      for (unsigned c = 0; c < num_uploads; c++) {
         const uint32_t upload_format = GEN_GEN >= 8 ? format :
            downsize_format_if_needed(format, c);
         /* If we need more than one upload, the offset stride would be 128
          * bits (16 bytes), as for previous uploads we are using the full
          * entry. */
         const unsigned offset = input->offset + c * 16;

         const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
            upload_format_size(upload_format) : input->glarray->Size;

         /* Components the source does not provide are filled in: zero for
          * the missing leading ones, and a format-dependent value for the
          * last.  Every case deliberately falls through to the next.
          */
         switch (size) {
            case 0: comp0 = VFCOMP_STORE_0; /* fallthrough */
            case 1: comp1 = VFCOMP_STORE_0; /* fallthrough */
            case 2: comp2 = VFCOMP_STORE_0; /* fallthrough */
            case 3:
               if (GEN_GEN >= 8 && input->glarray->Doubles) {
                  comp3 = VFCOMP_STORE_0;
               } else if (input->glarray->Integer) {
                  comp3 = VFCOMP_STORE_1_INT;
               } else {
                  comp3 = VFCOMP_STORE_1_FP;
               }

               break;
         }

#if GEN_GEN >= 8
         /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
          *
          *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
          *     formats, 64-bit components are stored in the URB without any
          *     conversion. In this case, vertex elements must be written as 128
          *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
          *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
          *     component into the URB, Component 1 must be specified as
          *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
          *     order to output a 128-bit vertex element, or Components 1-3 must
          *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
          *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
          *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
          *     vertex element."
          */
         if (input->glarray->Doubles && !input->is_dual_slot) {
            /* Store vertex elements which correspond to double and dvec2 vertex
             * shader inputs as 128-bit vertex elements, instead of 256-bits.
             */
            comp2 = VFCOMP_NOSTORE;
            comp3 = VFCOMP_NOSTORE;
         }
#endif

         struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
            .VertexBufferIndex = input->buffer,
            .Valid = true,
            .SourceElementFormat = upload_format,
            .SourceElementOffset = offset,
            .Component0Control = comp0,
            .Component1Control = comp1,
            .Component2Control = comp2,
            .Component3Control = comp3,
#if GEN_GEN < 5
            .DestinationElementOffset = i * 4,
#endif
         };

         GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
         dw += GENX(VERTEX_ELEMENT_STATE_length);
      }
   }

   /* Element carrying base vertex / base instance / vertex ID / instance ID.
    * All components default to zero and are enabled selectively below.
    */
   if (needs_sgvs_element) {
      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .Component0Control = VFCOMP_STORE_0,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#if GEN_GEN < 5
         .DestinationElementOffset = i * 4,
#endif
      };

#if GEN_GEN >= 8
      /* On Gen8+ only the draw parameters are sourced from a buffer here;
       * the IDs are routed through the 3DSTATE_VF_SGVS packet emitted at
       * the top of this function.
       */
      if (vs_prog_data->uses_basevertex ||
          vs_prog_data->uses_baseinstance) {
         elem_state.VertexBufferIndex = brw->vb.nr_buffers;
         elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
         elem_state.Component0Control = VFCOMP_STORE_SRC;
         elem_state.Component1Control = VFCOMP_STORE_SRC;
      }
#else
      /* Before Gen8 the element itself selects each value: components 0/1
       * come from the draw-parameter buffer, components 2/3 are the
       * generated vertex and instance IDs.
       */
      elem_state.VertexBufferIndex = brw->vb.nr_buffers;
      elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
      if (vs_prog_data->uses_basevertex)
         elem_state.Component0Control = VFCOMP_STORE_SRC;

      if (vs_prog_data->uses_baseinstance)
         elem_state.Component1Control = VFCOMP_STORE_SRC;

      if (vs_prog_data->uses_vertexid)
         elem_state.Component2Control = VFCOMP_STORE_VID;

      if (vs_prog_data->uses_instanceid)
         elem_state.Component3Control = VFCOMP_STORE_IID;
#endif

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

   /* Element carrying the draw ID, read as a single 32-bit uint from the
    * buffer at index nr_buffers + 1.
    */
   if (vs_prog_data->uses_drawid) {
      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .VertexBufferIndex = brw->vb.nr_buffers + 1,
         .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32_UINT,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
#if GEN_GEN < 5
         .DestinationElementOffset = i * 4,
#endif
      };

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }

#if GEN_GEN >= 6
   /* The edge flag element that was skipped in the main loop goes last. */
   if (gen6_edgeflag_input) {
      const uint32_t format =
         brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);

      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
         .Valid = true,
         .VertexBufferIndex = gen6_edgeflag_input->buffer,
         .EdgeFlagEnable = true,
         .SourceElementFormat = format,
         .SourceElementOffset = gen6_edgeflag_input->offset,
         .Component0Control = VFCOMP_STORE_SRC,
         .Component1Control = VFCOMP_STORE_0,
         .Component2Control = VFCOMP_STORE_0,
         .Component3Control = VFCOMP_STORE_0,
      };

      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
      dw += GENX(VERTEX_ELEMENT_STATE_length);
   }
#endif

#if GEN_GEN >= 8
   /* Per-element instancing state.  j counts the elements in the order they
    * were written above, i.e. with the edge flag element left out.
    */
   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
      const struct brw_vertex_element *input = brw->vb.enabled[i];
      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
      unsigned element_index;

      /* The edge flag element is reordered to be the last one in the code
       * above so we need to compensate for that in the element indices used
       * below.
       */
      if (input == gen6_edgeflag_input)
         element_index = nr_elements - 1;
      else
         element_index = j++;

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = element_index;
         vfi.InstancingEnable = buffer->step_rate != 0;
         vfi.InstanceDataStepRate = buffer->step_rate;
      }
   }

   if (vs_prog_data->uses_drawid) {
      /* NOTE(review): this index assumes no edge flag element was skipped
       * in the element loop.  When gen6_edgeflag_input is set, the draw-ID
       * element was written one slot earlier and this value equals
       * nr_elements - 1 (the edge flag slot) — confirm whether draw ID and
       * edge flags can be in use together.
       */
      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;

      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
         vfi.VertexElementIndex = element;
      }
   }
#endif
}
 836
 837 static const struct brw_tracked_state genX(vertices) = {
 838    .dirty = {
 839       .mesa = _NEW_POLYGON,
 840       .brw = BRW_NEW_BATCH |
 841              BRW_NEW_BLORP |
 842              BRW_NEW_VERTICES |
 843              BRW_NEW_VS_PROG_DATA,
 844    },
 845    .emit = genX(emit_vertices),
 846 };
 847
 848 static void
 849 genX(emit_index_buffer)(struct brw_context *brw)
 850 {
 851    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
 852
 853    if (index_buffer == NULL)
 854       return;
 855
 856    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 857 #if GEN_GEN < 8 && !GEN_IS_HASWELL
 858       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 859 #endif
 860       ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
 861       ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
 862 #if GEN_GEN >= 8
 863       ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
 864       ib.BufferSize = brw->ib.size;
 865 #else
 866       ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
 867 #endif
 868    }
 869 }
 870
 871 static const struct brw_tracked_state genX(index_buffer) = {
 872    .dirty = {
 873       .mesa = 0,
 874       .brw = BRW_NEW_BATCH |
 875              BRW_NEW_BLORP |
 876              BRW_NEW_INDEX_BUFFER,
 877    },
 878    .emit = genX(emit_index_buffer),
 879 };
 880
 881 #if GEN_IS_HASWELL || GEN_GEN >= 8
 882 static void
 883 genX(upload_cut_index)(struct brw_context *brw)
 884 {
 885    const struct gl_context *ctx = &brw->ctx;
 886
 887    brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
 888       if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
 889          vf.IndexedDrawCutIndexEnable = true;
 890          vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
 891       }
 892    }
 893 }
 894
 895 const struct brw_tracked_state genX(cut_index) = {
 896    .dirty = {
 897       .mesa  = _NEW_TRANSFORM,
 898       .brw   = BRW_NEW_INDEX_BUFFER,
 899    },
 900    .emit = genX(upload_cut_index),
 901 };
 902 #endif
 903
 904 #if GEN_GEN >= 6
 905 /**
 906  * Determine the appropriate attribute override value to store into the
 907  * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
 908  * override value contains two pieces of information: the location of the
 909  * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
 910  * flag indicating whether to "swizzle" the attribute based on the direction
 911  * the triangle is facing.
 912  *
 913  * If an attribute is "swizzled", then the given VUE location is used for
 914  * front-facing triangles, and the VUE location that immediately follows is
 915  * used for back-facing triangles.  We use this to implement the mapping from
 916  * gl_FrontColor/gl_BackColor to gl_Color.
 917  *
 918  * urb_entry_read_offset is the offset into the VUE at which the SF unit is
 919  * being instructed to begin reading attribute data.  It can be set to a
 920  * nonzero value to prevent the SF unit from wasting time reading elements of
 921  * the VUE that are not needed by the fragment shader.  It is measured in
 922  * 256-bit increments.
 923  */
 924 static void
 925 genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
 926                         const struct brw_vue_map *vue_map,
 927                         int urb_entry_read_offset, int fs_attr,
 928                         bool two_side_color, uint32_t *max_source_attr)
 929 {
 930    /* Find the VUE slot for this attribute. */
 931    int slot = vue_map->varying_to_slot[fs_attr];
 932
 933    /* Viewport and Layer are stored in the VUE header.  We need to override
 934     * them to zero if earlier stages didn't write them, as GL requires that
 935     * they read back as zero when not explicitly set.
 936     */
 937    if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
 938       attr->ComponentOverrideX = true;
 939       attr->ComponentOverrideW = true;
 940       attr->ConstantSource = CONST_0000;
 941
 942       if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
 943          attr->ComponentOverrideY = true;
 944       if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
 945          attr->ComponentOverrideZ = true;
 946
 947       return;
 948    }
 949
 950    /* If there was only a back color written but not front, use back
 951     * as the color instead of undefined
 952     */
 953    if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
 954       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
 955    if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
 956       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
 957
 958    if (slot == -1) {
 959       /* This attribute does not exist in the VUE--that means that the vertex
 960        * shader did not write to it.  This means that either:
 961        *
 962        * (a) This attribute is a texture coordinate, and it is going to be
 963        * replaced with point coordinates (as a consequence of a call to
 964        * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
 965        * hardware will ignore whatever attribute override we supply.
 966        *
 967        * (b) This attribute is read by the fragment shader but not written by
 968        * the vertex shader, so its value is undefined.  Therefore the
 969        * attribute override we supply doesn't matter.
 970        *
 971        * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
 972        * previous shader stage.
 973        *
 974        * Note that we don't have to worry about the cases where the attribute
 975        * is gl_PointCoord or is undergoing point sprite coordinate
 976        * replacement, because in those cases, this function isn't called.
 977        *
 978        * In case (c), we need to program the attribute overrides so that the
 979        * primitive ID will be stored in this slot.  In every other case, the
 980        * attribute override we supply doesn't matter.  So just go ahead and
 981        * program primitive ID in every case.
 982        */
 983       attr->ComponentOverrideW = true;
 984       attr->ComponentOverrideX = true;
 985       attr->ComponentOverrideY = true;
 986       attr->ComponentOverrideZ = true;
 987       attr->ConstantSource = PRIM_ID;
 988       return;
 989    }
 990
 991    /* Compute the location of the attribute relative to urb_entry_read_offset.
 992     * Each increment of urb_entry_read_offset represents a 256-bit value, so
 993     * it counts for two 128-bit VUE slots.
 994     */
 995    int source_attr = slot - 2 * urb_entry_read_offset;
 996    assert(source_attr >= 0 && source_attr < 32);
 997
 998    /* If we are doing two-sided color, and the VUE slot following this one
 999     * represents a back-facing color, then we need to instruct the SF unit to
1000     * do back-facing swizzling.
1001     */
1002    bool swizzling = two_side_color &&
1003       ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
1004         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
1005        (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
1006         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
1007
1008    /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
1009    if (*max_source_attr < source_attr + swizzling)
1010       *max_source_attr = source_attr + swizzling;
1011
1012    attr->SourceAttribute = source_attr;
1013    if (swizzling)
1014       attr->SwizzleSelect = INPUTATTR_FACING;
1015 }
1016
1017
1018 static void
1019 genX(calculate_attr_overrides)(const struct brw_context *brw,
1020                                struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1021                                uint32_t *point_sprite_enables,
1022                                uint32_t *urb_entry_read_length,
1023                                uint32_t *urb_entry_read_offset)
1024 {
1025    const struct gl_context *ctx = &brw->ctx;
1026
1027    /* _NEW_POINT */
1028    const struct gl_point_attrib *point = &ctx->Point;
1029
1030    /* BRW_NEW_FRAGMENT_PROGRAM */
1031    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1032
1033    /* BRW_NEW_FS_PROG_DATA */
1034    const struct brw_wm_prog_data *wm_prog_data =
1035       brw_wm_prog_data(brw->wm.base.prog_data);
1036    uint32_t max_source_attr = 0;
1037
1038    *point_sprite_enables = 0;
1039
1040    int first_slot =
1041       brw_compute_first_urb_slot_required(fp->info.inputs_read,
1042                                           &brw->vue_map_geom_out);
1043
1044    /* Each URB offset packs two varying slots */
1045    assert(first_slot % 2 == 0);
1046    *urb_entry_read_offset = first_slot / 2;
1047
1048    /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1049     * description of dw10 Point Sprite Texture Coordinate Enable:
1050     *
1051     * "This field must be programmed to zero when non-point primitives
1052     * are rendered."
1053     *
1054     * The SandyBridge PRM doesn't explicitly say that point sprite enables
1055     * must be programmed to zero when rendering non-point primitives, but
1056     * the IvyBridge PRM does, and if we don't, we get garbage.
1057     *
1058     * This is not required on Haswell, as the hardware ignores this state
1059     * when drawing non-points -- although we do still need to be careful to
1060     * correctly set the attr overrides.
1061     *
1062     * _NEW_POLYGON
1063     * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1064     */
1065    bool drawing_points = brw_is_drawing_points(brw);
1066
1067    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1068       int input_index = wm_prog_data->urb_setup[attr];
1069
1070       if (input_index < 0)
1071          continue;
1072
1073       /* _NEW_POINT */
1074       bool point_sprite = false;
1075       if (drawing_points) {
1076          if (point->PointSprite &&
1077              (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1078              (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1079             point_sprite = true;
1080          }
1081
1082          if (attr == VARYING_SLOT_PNTC)
1083             point_sprite = true;
1084
1085          if (point_sprite)
1086             *point_sprite_enables |= (1 << input_index);
1087       }
1088
1089       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1090       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1091
1092       if (!point_sprite) {
1093          genX(get_attr_override)(&attribute,
1094                                  &brw->vue_map_geom_out,
1095                                  *urb_entry_read_offset, attr,
1096                                  _mesa_vertex_program_two_side_enabled(ctx),
1097                                  &max_source_attr);
1098       }
1099
1100       /* The hardware can only do the overrides on 16 overrides at a
1101        * time, and the other up to 16 have to be lined up so that the
1102        * input index = the output index.  We'll need to do some
1103        * tweaking to make sure that's the case.
1104        */
1105       if (input_index < 16)
1106          attr_overrides[input_index] = attribute;
1107       else
1108          assert(attribute.SourceAttribute == input_index);
1109    }
1110
1111    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1112     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1113     *
1114     * "This field should be set to the minimum length required to read the
1115     *  maximum source attribute.  The maximum source attribute is indicated
1116     *  by the maximum value of the enabled Attribute # Source Attribute if
1117     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1118     *  enable is not set.
1119     *  read_length = ceiling((max_source_attr + 1) / 2)
1120     *
1121     *  [errata] Corruption/Hang possible if length programmed larger than
1122     *  recommended"
1123     *
1124     * Similar text exists for Ivy Bridge.
1125     */
1126    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1127 }
1128 #endif
1129
1130 /* ---------------------------------------------------------------------- */
1131
1132 #if GEN_GEN >= 8
1133 typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
1134 #elif GEN_GEN >= 6
1135 typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
1136 #else
1137 typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
1138 #endif
1139
1140 static inline void
1141 set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
1142 {
1143    struct gl_context *ctx = &brw->ctx;
1144
1145    /* _NEW_BUFFERS */
1146    struct intel_renderbuffer *depth_irb =
1147       intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1148
1149    /* _NEW_DEPTH */
1150    struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1151
1152    /* _NEW_STENCIL */
1153    struct gl_stencil_attrib *stencil = &ctx->Stencil;
1154    const int b = stencil->_BackFace;
1155
1156    if (depth->Test && depth_irb) {
1157       ds->DepthTestEnable = true;
1158       ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1159       ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
1160    }
1161
1162    if (brw->stencil_enabled) {
1163       ds->StencilTestEnable = true;
1164       ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
1165       ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
1166
1167       ds->StencilTestFunction =
1168          intel_translate_compare_func(stencil->Function[0]);
1169       ds->StencilFailOp =
1170          intel_translate_stencil_op(stencil->FailFunc[0]);
1171       ds->StencilPassDepthPassOp =
1172          intel_translate_stencil_op(stencil->ZPassFunc[0]);
1173       ds->StencilPassDepthFailOp =
1174          intel_translate_stencil_op(stencil->ZFailFunc[0]);
1175
1176       ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
1177
1178       if (brw->stencil_two_sided) {
1179          ds->DoubleSidedStencilEnable = true;
1180          ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1181          ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1182
1183          ds->BackfaceStencilTestFunction =
1184             intel_translate_compare_func(stencil->Function[b]);
1185          ds->BackfaceStencilFailOp =
1186             intel_translate_stencil_op(stencil->FailFunc[b]);
1187          ds->BackfaceStencilPassDepthPassOp =
1188             intel_translate_stencil_op(stencil->ZPassFunc[b]);
1189          ds->BackfaceStencilPassDepthFailOp =
1190             intel_translate_stencil_op(stencil->ZFailFunc[b]);
1191       }
1192
1193 #if GEN_GEN <= 5 || GEN_GEN >= 9
1194       ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1195       ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1196 #endif
1197    }
1198 }
1199
1200 #if GEN_GEN >= 6
1201 static void
1202 genX(upload_depth_stencil_state)(struct brw_context *brw)
1203 {
1204 #if GEN_GEN >= 8
1205    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
1206       set_depth_stencil_bits(brw, &wmds);
1207    }
1208 #else
1209    uint32_t ds_offset;
1210    brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
1211       set_depth_stencil_bits(brw, &ds);
1212    }
1213
1214    /* Now upload a pointer to the indirect state */
1215 #if GEN_GEN == 6
1216    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
1217       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1218       ptr.DEPTH_STENCIL_STATEChange = true;
1219    }
1220 #else
1221    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
1222       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1223    }
1224 #endif
1225 #endif
1226 }
1227
1228 static const struct brw_tracked_state genX(depth_stencil_state) = {
1229    .dirty = {
1230       .mesa = _NEW_BUFFERS |
1231               _NEW_DEPTH |
1232               _NEW_STENCIL,
1233       .brw  = BRW_NEW_BLORP |
1234               (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
1235                             : BRW_NEW_BATCH |
1236                               BRW_NEW_STATE_BASE_ADDRESS),
1237    },
1238    .emit = genX(upload_depth_stencil_state),
1239 };
1240 #endif
1241
1242 /* ---------------------------------------------------------------------- */
1243
1244 #if GEN_GEN <= 5
1245
1246 static void
1247 genX(upload_clip_state)(struct brw_context *brw)
1248 {
1249    struct gl_context *ctx = &brw->ctx;
1250
1251    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1252    brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
1253       clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
1254       clip.GRFRegisterCount =
1255          DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
1256       clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1257       clip.SingleProgramFlow = true;
1258       clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
1259       clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
1260
1261       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1262       clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
1263       clip.DispatchGRFStartRegisterForURBData = 1;
1264       clip.VertexURBEntryReadOffset = 0;
1265
1266       /* BRW_NEW_URB_FENCE */
1267       clip.NumberofURBEntries = brw->urb.nr_clip_entries;
1268       clip.URBEntryAllocationSize = brw->urb.vsize - 1;
1269
1270       if (brw->urb.nr_clip_entries >= 10) {
1271          /* Half of the URB entries go to each thread, and it has to be an
1272           * even number.
1273           */
1274          assert(brw->urb.nr_clip_entries % 2 == 0);
1275
1276          /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
1277           * only 2 threads can output VUEs at a time.
1278           */
1279          clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
1280       } else {
1281          assert(brw->urb.nr_clip_entries >= 5);
1282          clip.MaximumNumberofThreads = 1 - 1;
1283       }
1284
1285       clip.VertexPositionSpace = VPOS_NDCSPACE;
1286       clip.UserClipFlagsMustClipEnable = true;
1287       clip.GuardbandClipTestEnable = true;
1288
1289       clip.ClipperViewportStatePointer =
1290          ro_bo(brw->batch.state.bo, brw->clip.vp_offset);
1291
1292       clip.ScreenSpaceViewportXMin = -1;
1293       clip.ScreenSpaceViewportXMax = 1;
1294       clip.ScreenSpaceViewportYMin = -1;
1295       clip.ScreenSpaceViewportYMax = 1;
1296
1297       clip.ViewportXYClipTestEnable = true;
1298       clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
1299
1300       /* _NEW_TRANSFORM */
1301       if (GEN_GEN == 5 || GEN_IS_G4X) {
1302          clip.UserClipDistanceClipTestEnableBitmask =
1303             ctx->Transform.ClipPlanesEnabled;
1304       } else {
1305          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
1306           * workaround.
1307           */
1308          clip.UserClipDistanceClipTestEnableBitmask =
1309             (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
1310       }
1311
1312       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1313          clip.APIMode = APIMODE_D3D;
1314       else
1315          clip.APIMode = APIMODE_OGL;
1316
1317       clip.GuardbandClipTestEnable = true;
1318
1319       clip.ClipMode = brw->clip.prog_data->clip_mode;
1320
1321 #if GEN_IS_G4X
1322       clip.NegativeWClipTestEnable = true;
1323 #endif
1324    }
1325 }
1326
1327 const struct brw_tracked_state genX(clip_state) = {
1328    .dirty = {
1329       .mesa  = _NEW_TRANSFORM |
1330                _NEW_VIEWPORT,
1331       .brw   = BRW_NEW_BATCH |
1332                BRW_NEW_BLORP |
1333                BRW_NEW_CLIP_PROG_DATA |
1334                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1335                BRW_NEW_PROGRAM_CACHE |
1336                BRW_NEW_URB_FENCE,
1337    },
1338    .emit = genX(upload_clip_state),
1339 };
1340
1341 #else
1342
1343 static void
1344 genX(upload_clip_state)(struct brw_context *brw)
1345 {
1346    struct gl_context *ctx = &brw->ctx;
1347
1348    /* _NEW_BUFFERS */
1349    struct gl_framebuffer *fb = ctx->DrawBuffer;
1350
1351    /* BRW_NEW_FS_PROG_DATA */
1352    struct brw_wm_prog_data *wm_prog_data =
1353       brw_wm_prog_data(brw->wm.base.prog_data);
1354
1355    brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
1356       clip.StatisticsEnable = !brw->meta_in_progress;
1357
1358       if (wm_prog_data->barycentric_interp_modes &
1359           BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
1360          clip.NonPerspectiveBarycentricEnable = true;
1361
1362 #if GEN_GEN >= 7
1363       clip.EarlyCullEnable = true;
1364 #endif
1365
1366 #if GEN_GEN == 7
1367       clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
1368
1369       if (ctx->Polygon.CullFlag) {
1370          switch (ctx->Polygon.CullFaceMode) {
1371          case GL_FRONT:
1372             clip.CullMode = CULLMODE_FRONT;
1373             break;
1374          case GL_BACK:
1375             clip.CullMode = CULLMODE_BACK;
1376             break;
1377          case GL_FRONT_AND_BACK:
1378             clip.CullMode = CULLMODE_BOTH;
1379             break;
1380          default:
1381             unreachable("Should not get here: invalid CullFlag");
1382          }
1383       } else {
1384          clip.CullMode = CULLMODE_NONE;
1385       }
1386 #endif
1387
1388 #if GEN_GEN < 8
1389       clip.UserClipDistanceCullTestEnableBitmask =
1390          brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
1391
1392       clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
1393 #endif
1394
1395       /* _NEW_LIGHT */
1396       if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
1397          clip.TriangleStripListProvokingVertexSelect = 0;
1398          clip.TriangleFanProvokingVertexSelect = 1;
1399          clip.LineStripListProvokingVertexSelect = 0;
1400       } else {
1401          clip.TriangleStripListProvokingVertexSelect = 2;
1402          clip.TriangleFanProvokingVertexSelect = 2;
1403          clip.LineStripListProvokingVertexSelect = 1;
1404       }
1405
1406       /* _NEW_TRANSFORM */
1407       clip.UserClipDistanceClipTestEnableBitmask =
1408          ctx->Transform.ClipPlanesEnabled;
1409
1410 #if GEN_GEN >= 8
1411       clip.ForceUserClipDistanceClipTestEnableBitmask = true;
1412 #endif
1413
1414       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1415          clip.APIMode = APIMODE_D3D;
1416       else
1417          clip.APIMode = APIMODE_OGL;
1418
1419       clip.GuardbandClipTestEnable = true;
1420
1421       /* BRW_NEW_VIEWPORT_COUNT */
1422       const unsigned viewport_count = brw->clip.viewport_count;
1423
1424       if (ctx->RasterDiscard) {
1425          clip.ClipMode = CLIPMODE_REJECT_ALL;
1426 #if GEN_GEN == 6
1427          perf_debug("Rasterizer discard is currently implemented via the "
1428                     "clipper; having the GS not write primitives would "
1429                     "likely be faster.\n");
1430 #endif
1431       } else {
1432          clip.ClipMode = CLIPMODE_NORMAL;
1433       }
1434
1435       clip.ClipEnable = true;
1436
1437       /* _NEW_POLYGON,
1438        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
1439        */
1440       if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
1441          clip.ViewportXYClipTestEnable = true;
1442
1443       clip.MinimumPointWidth = 0.125;
1444       clip.MaximumPointWidth = 255.875;
1445       clip.MaximumVPIndex = viewport_count - 1;
1446       if (_mesa_geometric_layers(fb) == 0)
1447          clip.ForceZeroRTAIndexEnable = true;
1448    }
1449 }
1450
1451 static const struct brw_tracked_state genX(clip_state) = {
1452    .dirty = {
1453       .mesa  = _NEW_BUFFERS |
1454                _NEW_LIGHT |
1455                _NEW_POLYGON |
1456                _NEW_TRANSFORM,
1457       .brw   = BRW_NEW_BLORP |
1458                BRW_NEW_CONTEXT |
1459                BRW_NEW_FS_PROG_DATA |
1460                BRW_NEW_GS_PROG_DATA |
1461                BRW_NEW_VS_PROG_DATA |
1462                BRW_NEW_META_IN_PROGRESS |
1463                BRW_NEW_PRIMITIVE |
1464                BRW_NEW_RASTERIZER_DISCARD |
1465                BRW_NEW_TES_PROG_DATA |
1466                BRW_NEW_VIEWPORT_COUNT,
1467    },
1468    .emit = genX(upload_clip_state),
1469 };
1470 #endif
1471
1472 /* ---------------------------------------------------------------------- */
1473
1474 static void
1475 genX(upload_sf)(struct brw_context *brw)
1476 {
1477    struct gl_context *ctx = &brw->ctx;
1478    float point_size;
1479
1480 #if GEN_GEN <= 7
1481    /* _NEW_BUFFERS */
1482    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
1483    UNUSED const bool multisampled_fbo =
1484       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1485 #endif
1486
1487 #if GEN_GEN < 6
1488    const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
1489
1490    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1491
1492    brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
1493       sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
1494       sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1495       sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
1496       sf.DispatchGRFStartRegisterForURBData = 3;
1497       sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
1498       sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
1499       sf.NumberofURBEntries = brw->urb.nr_sf_entries;
1500       sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
1501
1502       /* STATE_PREFETCH command description describes this state as being
1503        * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
1504        * domain.
1505        */
1506       sf.SetupViewportStateOffset =
1507          ro_bo(brw->batch.state.bo, brw->sf.vp_offset);
1508
1509       sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1510
1511       /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
1512       /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
1513
1514       sf.MaximumNumberofThreads =
1515          MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
1516
1517       sf.SpritePointEnable = ctx->Point.PointSprite;
1518
1519       sf.DestinationOriginHorizontalBias = 0.5;
1520       sf.DestinationOriginVerticalBias = 0.5;
1521 #else
1522    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
1523       sf.StatisticsEnable = true;
1524 #endif
1525       sf.ViewportTransformEnable = true;
1526
1527 #if GEN_GEN == 7
1528       /* _NEW_BUFFERS */
1529       sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
1530 #endif
1531
1532 #if GEN_GEN <= 7
1533       /* _NEW_POLYGON */
1534       sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
1535 #if GEN_GEN >= 6
1536       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
1537       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
1538       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
1539
1540       switch (ctx->Polygon.FrontMode) {
1541          case GL_FILL:
1542             sf.FrontFaceFillMode = FILL_MODE_SOLID;
1543             break;
1544          case GL_LINE:
1545             sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
1546             break;
1547          case GL_POINT:
1548             sf.FrontFaceFillMode = FILL_MODE_POINT;
1549             break;
1550          default:
1551             unreachable("not reached");
1552       }
1553
1554       switch (ctx->Polygon.BackMode) {
1555          case GL_FILL:
1556             sf.BackFaceFillMode = FILL_MODE_SOLID;
1557             break;
1558          case GL_LINE:
1559             sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
1560             break;
1561          case GL_POINT:
1562             sf.BackFaceFillMode = FILL_MODE_POINT;
1563             break;
1564          default:
1565             unreachable("not reached");
1566       }
1567
1568       if (multisampled_fbo && ctx->Multisample.Enabled)
1569          sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1570
1571       sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1572       sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1573       sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1574 #endif
1575
1576       sf.ScissorRectangleEnable = true;
1577
1578       if (ctx->Polygon.CullFlag) {
1579          switch (ctx->Polygon.CullFaceMode) {
1580             case GL_FRONT:
1581                sf.CullMode = CULLMODE_FRONT;
1582                break;
1583             case GL_BACK:
1584                sf.CullMode = CULLMODE_BACK;
1585                break;
1586             case GL_FRONT_AND_BACK:
1587                sf.CullMode = CULLMODE_BOTH;
1588                break;
1589             default:
1590                unreachable("not reached");
1591          }
1592       } else {
1593          sf.CullMode = CULLMODE_NONE;
1594       }
1595
1596 #if GEN_IS_HASWELL
1597       sf.LineStippleEnable = ctx->Line.StippleFlag;
1598 #endif
1599
1600 #endif
1601
1602       /* _NEW_LINE */
1603 #if GEN_GEN == 8
1604       const struct gen_device_info *devinfo = &brw->screen->devinfo;
1605
1606       if (devinfo->is_cherryview)
1607          sf.CHVLineWidth = brw_get_line_width(brw);
1608       else
1609          sf.LineWidth = brw_get_line_width(brw);
1610 #else
1611       sf.LineWidth = brw_get_line_width(brw);
1612 #endif
1613
1614       if (ctx->Line.SmoothFlag) {
1615          sf.LineEndCapAntialiasingRegionWidth = _10pixels;
1616 #if GEN_GEN <= 7
1617          sf.AntiAliasingEnable = true;
1618 #endif
1619       }
1620
1621       /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
1622       point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
1623       /* Clamp to the hardware limits */
1624       sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
1625
1626       /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
1627       if (use_state_point_size(brw))
1628          sf.PointWidthSource = State;
1629
1630 #if GEN_GEN >= 8
1631       /* _NEW_POINT | _NEW_MULTISAMPLE */
1632       if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
1633           !ctx->Point.PointSprite)
1634          sf.SmoothPointEnable = true;
1635 #endif
1636
1637 #if GEN_GEN == 10
1638       /* _NEW_BUFFERS
1639        * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
1640        */
1641       const bool multisampled_fbo =
1642          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1643       if (multisampled_fbo)
1644          sf.SmoothPointEnable = false;
1645 #endif
1646
1647 #if GEN_IS_G4X || GEN_GEN >= 5
1648       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
1649 #endif
1650
1651       /* _NEW_LIGHT */
1652       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
1653          sf.TriangleStripListProvokingVertexSelect = 2;
1654          sf.TriangleFanProvokingVertexSelect = 2;
1655          sf.LineStripListProvokingVertexSelect = 1;
1656       } else {
1657          sf.TriangleFanProvokingVertexSelect = 1;
1658       }
1659
1660 #if GEN_GEN == 6
1661       /* BRW_NEW_FS_PROG_DATA */
1662       const struct brw_wm_prog_data *wm_prog_data =
1663          brw_wm_prog_data(brw->wm.base.prog_data);
1664
1665       sf.AttributeSwizzleEnable = true;
1666       sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1667
1668       /*
1669        * Window coordinates in an FBO are inverted, which means point
1670        * sprite origin must be inverted, too.
1671        */
1672       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
1673          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
1674       } else {
1675          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
1676       }
1677
1678       /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
1679        * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
1680        */
1681       uint32_t urb_entry_read_length;
1682       uint32_t urb_entry_read_offset;
1683       uint32_t point_sprite_enables;
1684       genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
1685                                      &urb_entry_read_length,
1686                                      &urb_entry_read_offset);
1687       sf.VertexURBEntryReadLength = urb_entry_read_length;
1688       sf.VertexURBEntryReadOffset = urb_entry_read_offset;
1689       sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
1690       sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
1691 #endif
1692    }
1693 }
1694
/* Tracked-state entry for genX(upload_sf): the emit callback plus the Mesa
 * (.mesa) and driver (.brw) dirty flags it is registered under.  Terms
 * guarded by GEN_GEN are selected at compile time for the generation being
 * built.
 */
1695 static const struct brw_tracked_state genX(sf_state) = {
1696    .emit = genX(upload_sf),
1697    .dirty = {
1698       .brw   = (GEN_GEN == 6 ? BRW_NEW_FRAGMENT_PROGRAM |
1699                                BRW_NEW_FS_PROG_DATA
1700                              : 0) |
1701                (GEN_GEN >= 6 && GEN_GEN <= 7 ?
1702                                BRW_NEW_TES_PROG_DATA |
1703                                BRW_NEW_PRIMITIVE |
1704                                BRW_NEW_GS_PROG_DATA
1705                              : 0) |
1706                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
1707                (GEN_GEN <= 5 ? BRW_NEW_URB_FENCE |
1708                                BRW_NEW_SF_VP |
1709                                BRW_NEW_SF_PROG_DATA |
1710                                BRW_NEW_PROGRAM_CACHE |
1711                                BRW_NEW_BATCH
1712                              : 0) |
1713                BRW_NEW_VUE_MAP_GEOM_OUT |
1714                BRW_NEW_BLORP,
1715       .mesa  = (GEN_GEN == 10 ? _NEW_BUFFERS : 0) |
1716                (GEN_GEN <= 7 ? _NEW_POLYGON | _NEW_BUFFERS : 0) |
1717                (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
1718                _NEW_PROGRAM |
1719                _NEW_POINT |
1720                _NEW_LINE |
1721                _NEW_LIGHT,
1722    },
1723 };
1724
1725 /* ---------------------------------------------------------------------- */
1726
/**
 * Report whether the bound fragment program can write to at least one of
 * the draw framebuffer's colour buffers.
 *
 * A buffer counts when it is non-NULL, the program's outputs_written mask
 * contains FRAG_RESULT_COLOR or the FRAG_RESULT_DATA slot with the buffer's
 * index, and the colour mask for that index is non-zero.
 *
 * \return true as soon as one such buffer is found, false otherwise.
 */
1727 static bool
1728 brw_color_buffer_write_enabled(struct brw_context *brw)
1729 {
1730    struct gl_context *ctx = &brw->ctx;
1731    /* BRW_NEW_FRAGMENT_PROGRAM */
1732    const struct gl_program *frag_prog = brw->programs[MESA_SHADER_FRAGMENT];
1733
1734    /* _NEW_BUFFERS */
1735    for (unsigned buf = 0; buf < ctx->DrawBuffer->_NumColorDrawBuffers; buf++) {
1736       if (!ctx->DrawBuffer->_ColorDrawBuffers[buf])
1737          continue;
1738       /* Output slots that would land in this particular buffer. */
1739       const uint64_t slots = BITFIELD64_BIT(FRAG_RESULT_COLOR) |
1740                              BITFIELD64_BIT(FRAG_RESULT_DATA0 + buf);
1741       if (!(frag_prog->info.outputs_written & slots))
1742          continue;
1743
1744       /* _NEW_COLOR */
1745       if (GET_COLORMASK(ctx->Color.ColorMask, buf))
1746          return true;
1747    }
1748    return false;
1749 }
1750
/**
 * Upload the WM state derived from the current fragment-shader program data
 * and GL context.
 *
 * What gets emitted depends on the generation this file is compiled for:
 *  - GEN_GEN == 6: a 3DSTATE_CONSTANT_PS packet is emitted first.
 *  - GEN_GEN >= 6: a 3DSTATE_WM packet is emitted into the batch.
 *  - GEN_GEN <  6: a WM_STATE structure is emitted through brw_state_emit()
 *    and BRW_NEW_GEN4_UNIT_STATE is raised in ctx->NewDriverState.
 *  - GEN_GEN <= 5: 3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP is additionally emitted
 *    whenever the polygon offset clamp differs from the cached value.
 *
 * \param brw  driver context whose WM stage state is uploaded
 */
1751 static void
1752 genX(upload_wm)(struct brw_context *brw)
1753 {
1754    struct gl_context *ctx = &brw->ctx;
1755
1756    /* BRW_NEW_FS_PROG_DATA */
1757    const struct brw_wm_prog_data *wm_prog_data =
1758       brw_wm_prog_data(brw->wm.base.prog_data);
1759
        /* Each of the following is referenced only under some GEN_GEN
         * configurations below, hence the UNUSED markers.
         */
1760    UNUSED bool writes_depth =
1761       wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
1762    UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
1763    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
1764
1765 #if GEN_GEN == 6
1766    /* We can't fold this into gen6_upload_wm_push_constants(), because
1767     * according to the SNB PRM, vol 2 part 1 section 7.2.2
1768     * (3DSTATE_CONSTANT_PS [DevSNB]):
1769     *
1770     *     "[DevSNB]: This packet must be followed by WM_STATE."
1771     */
1772    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
1773       if (wm_prog_data->base.nr_params != 0) {
1774          wmcp.Buffer0Valid = true;
1775          /* Pointer to the WM constant buffer.  Covered by the set of
1776           * state flags from gen6_upload_wm_push_constants.
1777           */
1778          wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
1779          wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
1780       }
1781    }
1782 #endif
1783
        /* The two preprocessor branches below open the same `wm` block with
         * different packet types (3DSTATE_WM vs. WM_STATE).  The block is
         * closed only once, much further down, so everything between here
         * and that closing brace fills in whichever packet was opened.
         */
1784 #if GEN_GEN >= 6
1785    brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
1786       wm.LineAntialiasingRegionWidth = _10pixels;
1787       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1788
1789       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1790       wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
1791 #else
1792    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1793    brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
1794       if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
1795          /* These two fields should be the same pre-gen6, which is why we
1796           * only have one hardware field to program for both dispatch
1797           * widths.
1798           */
1799          assert(wm_prog_data->base.dispatch_grf_start_reg ==
1800                 wm_prog_data->dispatch_grf_start_reg_2);
1801       }
1802
1803       if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
1804          wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;
1805
1806       if (stage_state->sampler_count)
1807          wm.SamplerStatePointer =
1808             ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
1809 #if GEN_GEN == 5
1810       if (wm_prog_data->prog_offset_2)
1811          wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
1812 #endif
1813
1814       wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
1815       wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
1816       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1817       wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
1818       wm.EarlyDepthTestEnable = true;
1819       wm.LineAntialiasingRegionWidth = _05pixels;
1820       wm.LineEndCapAntialiasingRegionWidth = _10pixels;
1821
1822       /* _NEW_POLYGON */
1823       if (ctx->Polygon.OffsetFill) {
1824          wm.GlobalDepthOffsetEnable = true;
1825          /* Something weird going on with legacy_global_depth_bias,
1826           * offset_constant, scaling and MRD.  This value passes glean
1827           * but gives some odd results elsewhere (eg. the
1828           * quad-offset-units test).
1829           */
1830          wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1831
1832          /* This is the only value that passes glean:
1833          */
1834          wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1835       }
1836
1837       wm.DepthCoefficientURBReadOffset = 1;
1838 #endif
1839
1840       /* BRW_NEW_STATS_WM
            *
            * Unconditionally enabled on Gen6+; on older generations it follows
            * brw->stats_wm.
            */
1841       wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;
1842
           /* Kernel pointers, dispatch widths, scratch space and related
            * thread-dispatch fields are programmed through this packet only
            * when building for generations before Gen7.
            */
1843 #if GEN_GEN < 7
1844       if (wm_prog_data->base.use_alt_mode)
1845          wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1846
1847       wm.SamplerCount = GEN_GEN == 5 ?
1848          0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
1849
1850       wm.BindingTableEntryCount =
1851          wm_prog_data->base.binding_table.size_bytes / 4;
1852       wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
1853       wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
1854       wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
1855       wm.DispatchGRFStartRegisterForConstantSetupData0 =
1856          wm_prog_data->base.dispatch_grf_start_reg;
           /* Gen6 always programs the kernel start pointers; Gen4-5 only do so
            * when a dispatch width is enabled (pointer 0) or a second program
            * offset exists (pointer 2).
            */
1857       if (GEN_GEN == 6 ||
1858           wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
1859          wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
1860       }
1861
1862 #if GEN_GEN >= 5
1863       if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
1864          wm.KernelStartPointer2 =
1865             KSP(brw, stage_state->prog_offset + wm_prog_data->prog_offset_2);
1866       }
1867 #endif
1868
1869 #if GEN_GEN == 6
1870       wm.DualSourceBlendEnable =
1871          wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
1872          ctx->Color.Blend[0]._UsesDualSrc;
1873       wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1874       wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1875
1876       /* From the SNB PRM, volume 2 part 1, page 281:
1877        * "If the PS kernel does not need the Position XY Offsets
1878        * to compute a Position XY value, then this field should be
1879        * programmed to POSOFFSET_NONE."
1880        *
1881        * "SW Recommendation: If the PS kernel needs the Position Offsets
1882        * to compute a Position XY value, this field should match Position
1883        * ZW Interpolation Mode to ensure a consistent position.xyzw
1884        * computation."
1885        * We only require XY sample offsets. So, this recommendation doesn't
1886        * look useful at the moment. We might need this in future.
1887        */
1888       if (wm_prog_data->uses_pos_offset)
1889          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
1890       else
1891          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
1892
1893       wm.DispatchGRFStartRegisterForConstantSetupData2 =
1894          wm_prog_data->dispatch_grf_start_reg_2;
1895 #endif
1896
1897       if (wm_prog_data->base.total_scratch) {
1898          wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
              /* NOTE(review): ffs(1024) == 11, so this presumably encodes a
               * power-of-two per-thread size relative to 1KB -- confirm
               * against the packet definition.
               */
1899          wm.PerThreadScratchSpace =
1900             ffs(stage_state->per_thread_scratch) - 11;
1901       }
1902
1903       wm.PixelShaderComputedDepth = writes_depth;
1904 #endif
1905
1906       /* _NEW_LINE */
1907       wm.LineStippleEnable = ctx->Line.StippleFlag;
1908
1909       /* _NEW_POLYGON */
1910       wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
1911
1912 #if GEN_GEN < 8
1913
1914 #if GEN_GEN >= 6
1915       wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
1916
1917       /* _NEW_BUFFERS */
1918       const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1919
1920       if (multisampled_fbo) {
1921          /* _NEW_MULTISAMPLE */
1922          if (ctx->Multisample.Enabled)
1923             wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1924          else
1925             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1926
1927          if (wm_prog_data->persample_dispatch)
1928             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1929          else
1930             wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
1931       } else {
              /* Draw buffer reports at most one sample. */
1932          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1933          wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1934       }
1935 #endif
1936       wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
           /* Anything that can discard or mask out a pixel counts as a kill:
            * shader kill, alpha test, alpha-to-coverage, or (Gen6+) a shader
            * that writes oMask.
            */
1937       if (wm_prog_data->uses_kill ||
1938           _mesa_is_alpha_test_enabled(ctx) ||
1939           _mesa_is_alpha_to_coverage_enabled(ctx) ||
1940           (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
1941          wm.PixelShaderKillsPixel = true;
1942       }
1943
1944       /* _NEW_BUFFERS | _NEW_COLOR
            *
            * Threads are dispatched only when the shader has an observable
            * effect: a colour write, a depth write, a pixel kill, or (Gen6+)
            * side effects.
            */
1945       if (brw_color_buffer_write_enabled(brw) || writes_depth ||
1946           wm.PixelShaderKillsPixel ||
1947           (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
1948          wm.ThreadDispatchEnable = true;
1949       }
1950
1951 #if GEN_GEN >= 7
1952       wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
1953       wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
1954 #endif
1955
1956       /* The "UAV access enable" bits are unnecessary on HSW because they only
1957        * seem to have an effect on the HW-assisted coherency mechanism which we
1958        * don't need, and the rasterization-related UAV_ONLY flag and the
1959        * DISPATCH_ENABLE bit can be set independently from it.
1960        * C.f. gen8_upload_ps_extra().
1961        *
1962        * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
1963        * _NEW_COLOR
1964        */
1965 #if GEN_IS_HASWELL
1966       if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
1967           wm_prog_data->has_side_effects)
1968          wm.PSUAVonly = ON;
1969 #endif
1970 #endif
1971
1972 #if GEN_GEN >= 7
1973       /* BRW_NEW_FS_PROG_DATA */
1974       if (wm_prog_data->early_fragment_tests)
1975          wm.EarlyDepthStencilControl = EDSC_PREPS;
1976       else if (wm_prog_data->has_side_effects)
1977          wm.EarlyDepthStencilControl = EDSC_PSEXEC;
1978 #endif
        /* Closes the 3DSTATE_WM / WM_STATE block opened above. */
1979    }
1980
1981 #if GEN_GEN <= 5
        /* Emit the clamp packet only when the value differs from the one
         * cached in brw->wm.offset_clamp, then update the cache.
         */
1982    if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
1983       brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
1984          clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1985       }
1986
1987       brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
1988    }
1989 #endif
1990 }
1991
/* Tracked-state entry for genX(upload_wm): the emit callback plus the Mesa
 * (.mesa) and driver (.brw) dirty flags it is registered under.  Terms
 * guarded by GEN_GEN are selected at compile time for the generation being
 * built.
 */
1992 static const struct brw_tracked_state genX(wm_state) = {
1993    .emit = genX(upload_wm),
1994    .dirty = {
1995       .brw   = (GEN_GEN >= 7 ? BRW_NEW_CONTEXT : BRW_NEW_BATCH) |
1996                (GEN_GEN <= 5 ? BRW_NEW_STATS_WM |
1997                                BRW_NEW_SAMPLER_STATE_TABLE |
1998                                BRW_NEW_PROGRAM_CACHE |
1999                                BRW_NEW_FRAGMENT_PROGRAM |
2000                                BRW_NEW_PUSH_CONSTANT_ALLOCATION
2001                              : 0) |
2002                BRW_NEW_FS_PROG_DATA |
2003                BRW_NEW_BLORP,
2004       .mesa  = (GEN_GEN >= 6 && GEN_GEN <= 7 ? _NEW_MULTISAMPLE : 0) |
2005                (GEN_GEN <= 5 ? _NEW_POLYGONSTIPPLE : 0) |
2006                (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
2007                (GEN_GEN <= 7 ? _NEW_COLOR |
2008                                _NEW_BUFFERS
2009                              : 0) |
2010                _NEW_POLYGON |
2011                _NEW_LINE,
2012    },
2013 };
2014
2015 /* ---------------------------------------------------------------------- */
2016
/**
 * Fill in the thread-dispatch fields of a shader-stage packet.
 *
 * \param pkt     the packet structure being filled (used as an lvalue)
 * \param prefix  token pasted in front of URBEntryReadLength and
 *                URBEntryReadOffset to form the stage-specific field names
 *
 * The expansion refers to `brw`, `stage_state`, `stage_prog_data` and
 * `vue_prog_data`, which must all be in scope at the point of use.
 *
 * The macro expands to several statements (including an `if`) and is not
 * wrapped in do { } while (0), so it must be used as a statement of its own
 * inside a braced block, never as the unbraced body of an if/else/loop.
 * It leaves StatisticsEnable and Enable set to true; callers may override
 * any field afterwards.
 */
2017 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
2018    pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
2019    pkt.SamplerCount       =                                               \
2020       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
2021    pkt.BindingTableEntryCount =                                           \
2022       stage_prog_data->binding_table.size_bytes / 4;                      \
2023    pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
2024                                                                           \
2025    if (stage_prog_data->total_scratch) {                                  \
2026       pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);    \
2027       pkt.PerThreadScratchSpace =                                         \
2028          ffs(stage_state->per_thread_scratch) - 11;                       \
2029    }                                                                      \
2030                                                                           \
2031    pkt.DispatchGRFStartRegisterForURBData =                               \
2032       stage_prog_data->dispatch_grf_start_reg;                            \
2033    pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
2034    pkt.prefix##URBEntryReadOffset = 0;                                    \
2035                                                                           \
2036    pkt.StatisticsEnable = true;                                           \
2037    pkt.Enable           = true;
2038
/**
 * Upload the vertex-shader stage state for the current VS program data.
 *
 * What gets emitted depends on the generation this file is compiled for:
 *  - GEN_GEN == 6: a 3DSTATE_CONSTANT_VS packet is emitted before the VS
 *    state and a PIPE_CONTROL flush after it.
 *  - GEN_GEN == 7 on Ivybridge: gen7_emit_vs_workaround_flush() is called
 *    first.
 *  - GEN_GEN >= 6: a 3DSTATE_VS packet is emitted into the batch.
 *  - GEN_GEN <  6: a VS_STATE structure is emitted through brw_state_emit()
 *    and BRW_NEW_GEN4_UNIT_STATE is raised in ctx->NewDriverState.
 *
 * \param brw  driver context whose VS stage state is uploaded
 */
2039 static void
2040 genX(upload_vs_state)(struct brw_context *brw)
2041 {
2042    UNUSED struct gl_context *ctx = &brw->ctx;
2043    const struct gen_device_info *devinfo = &brw->screen->devinfo;
2044    struct brw_stage_state *stage_state = &brw->vs.base;
2045
2046    /* BRW_NEW_VS_PROG_DATA */
2047    const struct brw_vue_prog_data *vue_prog_data =
2048       brw_vue_prog_data(brw->vs.base.prog_data);
2049    const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
2050
2051    assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
2052           vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
2053
2054 #if GEN_GEN == 6
2055    /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
2056     * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
2057     *
2058     *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
2059     *   command that causes the VS Function Enable to toggle. Pipeline
2060     *   flush can be executed by sending a PIPE_CONTROL command with CS
2061     *   stall bit set and a post sync operation.
2062     *
2063     * We've already done such a flush at the start of state upload, so we
2064     * don't need to do another one here.
2065     */
2066    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
2067       if (stage_state->push_const_size != 0) {
2068          cvs.Buffer0Valid = true;
2069          cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
2070          cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2071       }
2072    }
2073 #endif
2074
        /* Only Ivybridge gets the VS workaround flush. */
2075    if (GEN_GEN == 7 && devinfo->is_ivybridge)
2076       gen7_emit_vs_workaround_flush(brw);
2077
        /* The two preprocessor branches open the same `vs` block with
         * different packet types (3DSTATE_VS vs. VS_STATE); the block is
         * closed once, after all generation-specific fields are set.
         */
2078 #if GEN_GEN >= 6
2079    brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
2080 #else
2081    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2082    brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
2083 #endif
           /* Relies on brw, stage_state, stage_prog_data and vue_prog_data
            * being in scope; sets the VertexURBEntryRead* fields.
            */
2084       INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
2085
2086       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
2087
2088 #if GEN_GEN < 6
2089       vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
2090       vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
2091       vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;
2092
2093       vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
2094       vs.URBEntryAllocationSize = brw->urb.vsize - 1;
2095
           /* Replaces the thread count set just above: before Gen6 it is
            * derived from half the number of VS URB entries, clamped to
            * [1, max_vs_threads].
            */
2096       vs.MaximumNumberofThreads =
2097          CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
2098
           /* Overrides the `true` written by INIT_THREAD_DISPATCH_FIELDS. */
2099       vs.StatisticsEnable = false;
2100       vs.SamplerStatePointer =
2101          ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
2102 #endif
2103
2104 #if GEN_GEN == 5
2105       /* Force single program flow on Ironlake.  We cannot reliably get
2106        * all applications working without it.  See:
2107        * https://bugs.freedesktop.org/show_bug.cgi?id=29172
2108        *
2109        * The most notable and reliably failing application is the Humus
2110        * demo "CelShading"
2111        */
2112       vs.SingleProgramFlow = true;
2113       vs.SamplerCount = 0; /* hardware requirement */
2114 #endif
2115
2116 #if GEN_GEN >= 8
2117       vs.SIMD8DispatchEnable =
2118          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
2119
2120       vs.UserClipDistanceCullTestEnableBitmask =
2121          vue_prog_data->cull_distance_mask;
2122 #endif
2123    }
2124
2125 #if GEN_GEN == 6
2126    /* Based on my reading of the simulator, the VS constants don't get
2127     * pulled into the VS FF unit until an appropriate pipeline flush
2128     * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
2129     * references to them into a little FIFO.  The flushes are common,
2130     * but don't reliably happen between this and a 3DPRIMITIVE, causing
2131     * the primitive to use the wrong constants.  Then the FIFO
2132     * containing the constant setup gets added to again on the next
2133     * constants change, and eventually when a flush does happen the
2134     * unit is overwhelmed by constant changes and dies.
2135     *
2136     * To avoid this, send a PIPE_CONTROL down the line that will
2137     * update the unit immediately loading the constants.  The flush
2138     * type bits here were those set by the STATE_BASE_ADDRESS whose
2139     * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
2140     * bug reports that led to this workaround, and may be more than
2141     * what is strictly required to avoid the issue.
2142     */
2143    brw_emit_pipe_control_flush(brw,
2144                                PIPE_CONTROL_DEPTH_STALL |
2145                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
2146                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
2147 #endif
2148 }
2149
/* Tracked-state entry for genX(upload_vs_state): the emit callback plus the
 * Mesa (.mesa) and driver (.brw) dirty flags it is registered under.  Terms
 * guarded by GEN_GEN are selected at compile time for the generation being
 * built.
 */
2150 static const struct brw_tracked_state genX(vs_state) = {
2151    .emit = genX(upload_vs_state),
2152    .dirty = {
2153       .brw   = (GEN_GEN < 6 ? BRW_NEW_URB_FENCE |
2154                               BRW_NEW_SAMPLER_STATE_TABLE |
2155                               BRW_NEW_PROGRAM_CACHE |
2156                               BRW_NEW_PUSH_CONSTANT_ALLOCATION
2157                             : 0) |
2158                (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
2159                BRW_NEW_VS_PROG_DATA |
2160                BRW_NEW_CONTEXT |
2161                BRW_NEW_BLORP |
2162                BRW_NEW_BATCH,
2163       .mesa  = (GEN_GEN == 6 ? _NEW_TRANSFORM | _NEW_PROGRAM_CONSTANTS : 0),
2164    },
2165 };
2166
2167 /* ---------------------------------------------------------------------- */
2168
/**
 * Pack one CC_VIEWPORT per active viewport into state space and make the
 * result current.
 *
 * Depending on GEN_GEN the offset of the packed array is handed over through
 * 3DSTATE_VIEWPORT_STATE_POINTERS_CC (Gen7+), through
 * 3DSTATE_VIEWPORT_STATE_POINTERS (Gen6), or stored in brw->cc.vp_offset
 * with BRW_NEW_CC_VP raised (earlier generations).
 *
 * \param brw  driver context
 */
2169 static void
2170 genX(upload_cc_viewport)(struct brw_context *brw)
2171 {
2172    struct gl_context *ctx = &brw->ctx;
2173    /* _NEW_TRANSFORM */
2174    const bool depth_clamp = ctx->Transform.DepthClamp;
2175    /* BRW_NEW_VIEWPORT_COUNT */
2176    const unsigned num_viewports = brw->clip.viewport_count;
2177    const unsigned vp_dwords = GENX(CC_VIEWPORT_length);
2178
2179    uint32_t cc_vp_offset;
2180    uint32_t *const cc_base =
2181       brw_state_batch(brw, 4 * vp_dwords * num_viewports, 32, &cc_vp_offset);
2182
2183    for (unsigned v = 0; v < num_viewports; v++) {
2184       /* _NEW_VIEWPORT */
2185       const struct gl_viewport_attrib *attr = &ctx->ViewportArray[v];
2186       struct GENX(CC_VIEWPORT) ccv;
2187
2188       /* With depth clamping the window is the (ordered) Near/Far pair;
2189        * without it the full [0, 1] range is used.
2190        */
2191       ccv.MinimumDepth = depth_clamp ? MIN2(attr->Near, attr->Far) : 0.0;
2192       ccv.MaximumDepth = depth_clamp ? MAX2(attr->Near, attr->Far) : 1.0;
2193
2194       GENX(CC_VIEWPORT_pack)(NULL, cc_base + v * vp_dwords, &ccv);
2195    }
2196
2197 #if GEN_GEN >= 7
2198    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
2199       ptr.CCViewportPointer = cc_vp_offset;
2200    }
2201 #elif GEN_GEN == 6
2202    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2203       vp.CCViewportStateChange = 1;
2204       vp.PointertoCC_VIEWPORT = cc_vp_offset;
2205    }
2206 #else
2207    brw->cc.vp_offset = cc_vp_offset;
2208    ctx->NewDriverState |= BRW_NEW_CC_VP;
2209 #endif
2210 }
2211
/* Tracked-state entry for genX(upload_cc_viewport): the emit callback plus
 * the Mesa (.mesa) and driver (.brw) dirty flags it is registered under.
 */
2212 const struct brw_tracked_state genX(cc_vp) = {
2213    .emit = genX(upload_cc_viewport),
2214    .dirty = {
2215       .brw = BRW_NEW_VIEWPORT_COUNT |
2216              BRW_NEW_BLORP |
2217              BRW_NEW_BATCH,
2218       .mesa = _NEW_VIEWPORT |
2219               _NEW_TRANSFORM,
2220    },
2221 };
2222
2223 /* ---------------------------------------------------------------------- */
2224
/**
 * Fill a SCISSOR_RECT for viewport \p i.
 *
 * The rectangle starts as viewport \p i limited to the framebuffer size and
 * is then passed through _mesa_intersect_scissor_bounding_box().  An empty
 * result is encoded as a min > max rectangle; otherwise the box is converted
 * to inclusive maxima and, when not rendering to a user FBO, flipped in Y.
 *
 * \param ctx            GL context supplying viewport and scissor state
 * \param i              viewport index
 * \param render_to_fbo  selects between the two Y orientations
 * \param fb_width       framebuffer width
 * \param fb_height      framebuffer height
 * \param sc             rectangle to fill; all four fields are written
 *
 * NOTE(review): the right/top edges are derived from the already clamped
 * left/bottom edges, so a viewport with a negative X or Y keeps its full
 * Width/Height rather than X + Width / Y + Height -- confirm that this
 * over-estimate is intended.
 */
2225 static void
2226 set_scissor_bits(const struct gl_context *ctx, int i,
2227                  bool render_to_fbo, unsigned fb_width, unsigned fb_height,
2228                  struct GENX(SCISSOR_RECT) *sc)
2229 {
2230    const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
2231    int bbox[4]; /* { xmin, xmax, ymin, ymax }, maxima exclusive */
2232
2233    bbox[0] = MAX2(vp->X, 0);
2234    bbox[1] = MIN2(bbox[0] + vp->Width, fb_width);
2235    bbox[2] = MAX2(vp->Y, 0);
2236    bbox[3] = MIN2(bbox[2] + vp->Height, fb_height);
2237    _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2238
2239    if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2240       /* Zero-sized box: subtracting 1 from the maxima could go negative
2241        * and clip nothing, so program min > max to get no rendering.
2242        */
2243       sc->ScissorRectangleXMin = 1;
2244       sc->ScissorRectangleXMax = 0;
2245       sc->ScissorRectangleYMin = 1;
2246       sc->ScissorRectangleYMax = 0;
2247       return;
2248    }
2249
2250    /* X is the same in both orientations; "- 1" makes maxima inclusive. */
2251    sc->ScissorRectangleXMin = bbox[0];
2252    sc->ScissorRectangleXMax = bbox[1] - 1;
2253    if (render_to_fbo) {
2254       /* texmemory: Y=0=bottom */
2255       sc->ScissorRectangleYMin = bbox[2];
2256       sc->ScissorRectangleYMax = bbox[3] - 1;
2257    } else {
2258       /* memory: Y=0=top */
2259       sc->ScissorRectangleYMin = fb_height - bbox[3];
2260       sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2261    }
2262 }
2263
2264 #if GEN_GEN >= 6
/**
 * Pack one SCISSOR_RECT per active viewport into state space and point the
 * hardware at the array with 3DSTATE_SCISSOR_STATE_POINTERS.
 *
 * \param brw  driver context
 */
2265 static void
2266 genX(upload_scissor_state)(struct brw_context *brw)
2267 {
2268    struct gl_context *ctx = &brw->ctx;
2269    /* BRW_NEW_VIEWPORT_COUNT */
2270    const unsigned num_rects = brw->clip.viewport_count;
2271    const unsigned rect_dwords = GENX(SCISSOR_RECT_length);
2272
2273    /* _NEW_BUFFERS */
2274    const bool user_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2275    const unsigned int width = _mesa_geometric_width(ctx->DrawBuffer);
2276    const unsigned int height = _mesa_geometric_height(ctx->DrawBuffer);
2277
2278    uint32_t rects_offset;
2279    uint32_t *const rects =
2280       brw_state_batch(brw, rect_dwords * sizeof(uint32_t) * num_rects,
2281                       32, &rects_offset);
2282
2283    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT
2284     *
2285     * Only the intersection of the drawable and the scissor rectangle has
2286     * to be expressed here; clipping to the boundaries of static shared
2287     * front/back/depth buffers is covered by the cliprect loop in
2288     * brw_draw.c.
2289     *
2290     * Hardware coordinates are inclusive, Mesa's max is exclusive.
2291     */
2292    for (unsigned r = 0; r < num_rects; r++) {
2293       struct GENX(SCISSOR_RECT) rect;
2294       set_scissor_bits(ctx, r, user_fbo, width, height, &rect);
2295       GENX(SCISSOR_RECT_pack)(NULL, rects + r * rect_dwords, &rect);
2296    }
2297
2298    brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2299       ptr.ScissorRectPointer = rects_offset;
2300    }
2301 }
2302
/* Tracked-state entry for genX(upload_scissor_state): the emit callback
 * plus the Mesa (.mesa) and driver (.brw) dirty flags it is registered
 * under.
 */
2303 static const struct brw_tracked_state genX(scissor_state) = {
2304    .emit = genX(upload_scissor_state),
2305    .dirty = {
2306       .brw = BRW_NEW_VIEWPORT_COUNT |
2307              BRW_NEW_BLORP |
2308              BRW_NEW_BATCH,
2309       .mesa = _NEW_VIEWPORT |
2310               _NEW_SCISSOR |
2311               _NEW_BUFFERS,
2312    },
2313 };
2314 #endif
2315
2316 /* ---------------------------------------------------------------------- */
2317
/* Per-axis part of brw_calculate_guardband_size(): given the framebuffer
 * extent and the viewport scale/offset along one axis, write the two ends
 * of the guardband in native device coordinates.  The two results are not
 * ordered when scale is negative.  scale must be non-zero.
 */
2318 static void
2319 guardband_axis_to_ndc(uint32_t fb_extent, float scale, float offset,
2320                       float gb_size, float *ndc_a, float *ndc_b)
2321 {
2322    /* Screen-space render area along this axis: the framebuffer extent
2323     * widened to include both edges of the viewport.
2324     */
2325    const float ss_ra_min = MIN3(        0, offset + scale, offset - scale);
2326    const float ss_ra_max = MAX3(fb_extent, offset + scale, offset - scale);
2327
2328    /* Centre the guardband on that area. */
2329    const float ss_gb_min = (ss_ra_min + ss_ra_max) / 2 - gb_size;
2330    const float ss_gb_max = (ss_ra_min + ss_ra_max) / 2 + gb_size;
2331
2332    /* Undo the viewport transform to get native device coordinates. */
2333    *ndc_a = (ss_gb_min - offset) / scale;
2334    *ndc_b = (ss_gb_max - offset) / scale;
2335 }
2336
2337 /**
2338  * Compute the clipper guardband, in native device coordinates, for a
2339  * viewport described by its scale (m00, m11) and translation (m30, m31).
2340  *
2341  * According to the "Vertex X,Y Clamping and Quantization" section of the
2342  * Strips and Fans documentation:
2343  *
2344  * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2345  *  fixed-point "guardband" range supported by the rasterization hardware"
2346  *
2347  * and objects whose vertices are actually modified by that clamping will
2348  * not render as intended, so software must clip them beforehand.  The
2349  * working assumption is that the rasterizer (SF/WM stages) can only cover
2350  * a limited number of pixels, and that anything beyond that limit has to
2351  * be dealt with by the clipper -- which makes the limit the guardband
2352  * size.
2353  *
2354  * The same section goes on to say:
2355  *
2356  * "In addition, in order to be correctly rendered, objects must have a
2357  *  screenspace bounding box not exceeding 8K in the X or Y direction.
2358  *  This additional restriction must also be comprehended by software,
2359  *  i.e., enforced by use of clipping."
2360  *
2361  * That cannot hold everywhere: Gen7+ hardware supports 16K render targets
2362  * and must be able to draw polygons that fill them.  We assume the 8K
2363  * figure applied to Sandybridge (8K surfaces only) and was raised to 16K
2364  * on Ivybridge and later, so the guardband is limited to 16K on Gen7+ and
2365  * 8K on Sandybridge.
2366  */
2367 static void
2368 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2369                              float m00, float m11, float m30, float m31,
2370                              float *xmin, float *xmax,
2371                              float *ymin, float *ymax)
2372 {
2373    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2374    float ndc_x0, ndc_x1, ndc_y0, ndc_y1;
2375
2376    if (m00 == 0 || m11 == 0) {
2377       /* The viewport scales to 0, so nothing will be rendered. */
2378       *xmin = 0.0f;
2379       *xmax = 0.0f;
2380       *ymin = 0.0f;
2381       *ymax = 0.0f;
2382       return;
2383    }
2384
2385    guardband_axis_to_ndc(fb_width, m00, m30, gb_size, &ndc_x0, &ndc_x1);
2386    guardband_axis_to_ndc(fb_height, m11, m31, gb_size, &ndc_y0, &ndc_y1);
2387
2388    /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y pair may come out
2389     * reversed and is therefore sorted; X is expected to be ordered already.
2390     */
2391    assert(ndc_x0 <= ndc_x1);
2392    *xmin = ndc_x0;
2393    *xmax = ndc_x1;
2394    *ymin = MIN2(ndc_y0, ndc_y1);
2395    *ymax = MAX2(ndc_y0, ndc_y1);
2396 }
2397
2398 static void
2399 genX(upload_sf_clip_viewport)(struct brw_context *brw)
2400 {
2401    struct gl_context *ctx = &brw->ctx;
2402    float y_scale, y_bias;
2403
2404    /* BRW_NEW_VIEWPORT_COUNT */
2405    const unsigned viewport_count = brw->clip.viewport_count;
2406
2407    /* _NEW_BUFFERS */
2408    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2409    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2410    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2411
2412 #if GEN_GEN >= 7
2413 #define clv sfv
2414    struct GENX(SF_CLIP_VIEWPORT) sfv;
2415    uint32_t sf_clip_vp_offset;
2416    uint32_t *sf_clip_map =
2417       brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2418                       64, &sf_clip_vp_offset);
2419 #else
2420    struct GENX(SF_VIEWPORT) sfv;
2421    struct GENX(CLIP_VIEWPORT) clv;
2422    uint32_t sf_vp_offset, clip_vp_offset;
2423    uint32_t *sf_map =
2424       brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2425                       32, &sf_vp_offset);
2426    uint32_t *clip_map =
2427       brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2428                       32, &clip_vp_offset);
2429 #endif
2430
2431    /* _NEW_BUFFERS */
2432    if (render_to_fbo) {
2433       y_scale = 1.0;
2434       y_bias = 0;
2435    } else {
2436       y_scale = -1.0;
2437       y_bias = (float)fb_height;
2438    }
2439
2440    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2441       /* _NEW_VIEWPORT: Guardband Clipping */
2442       float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2443       _mesa_get_viewport_xform(ctx, i, scale, translate);
2444
2445       sfv.ViewportMatrixElementm00 = scale[0];
2446       sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2447       sfv.ViewportMatrixElementm22 = scale[2],
2448       sfv.ViewportMatrixElementm30 = translate[0],
2449       sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2450       sfv.ViewportMatrixElementm32 = translate[2],
2451       brw_calculate_guardband_size(fb_width, fb_height,
2452                                    sfv.ViewportMatrixElementm00,
2453                                    sfv.ViewportMatrixElementm11,
2454                                    sfv.ViewportMatrixElementm30,
2455                                    sfv.ViewportMatrixElementm31,
2456                                    &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2457
2458
2459       clv.XMinClipGuardband = gb_xmin;
2460       clv.XMaxClipGuardband = gb_xmax;
2461       clv.YMinClipGuardband = gb_ymin;
2462       clv.YMaxClipGuardband = gb_ymax;
2463
2464 #if GEN_GEN < 6
2465       set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
2466                        &sfv.ScissorRectangle);
2467 #elif GEN_GEN >= 8
2468       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2469        * The hardware will take the intersection of the drawing rectangle,
2470        * scissor rectangle, and the viewport extents. We don't need to be
2471        * smart, and can therefore just program the viewport extents.
2472        */
2473       const float viewport_Xmax =
2474          ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
2475       const float viewport_Ymax =
2476          ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;
2477
2478       if (render_to_fbo) {
2479          sfv.XMinViewPort = ctx->ViewportArray[i].X;
2480          sfv.XMaxViewPort = viewport_Xmax - 1;
2481          sfv.YMinViewPort = ctx->ViewportArray[i].Y;
2482          sfv.YMaxViewPort = viewport_Ymax - 1;
2483       } else {
2484          sfv.XMinViewPort = ctx->ViewportArray[i].X;
2485          sfv.XMaxViewPort = viewport_Xmax - 1;
2486          sfv.YMinViewPort = fb_height - viewport_Ymax;
2487          sfv.YMaxViewPort = fb_height - ctx->ViewportArray[i].Y - 1;
2488       }
2489 #endif
2490
2491 #if GEN_GEN >= 7
2492       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2493       sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2494 #else
2495       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2496       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2497       sf_map += GENX(SF_VIEWPORT_length);
2498       clip_map += GENX(CLIP_VIEWPORT_length);
2499 #endif
2500    }
2501
2502 #if GEN_GEN >= 7
2503    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2504       ptr.SFClipViewportPointer = sf_clip_vp_offset;
2505    }
2506 #elif GEN_GEN == 6
2507    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2508       vp.SFViewportStateChange = 1;
2509       vp.CLIPViewportStateChange = 1;
2510       vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2511       vp.PointertoSF_VIEWPORT = sf_vp_offset;
2512    }
2513 #else
2514    brw->sf.vp_offset = sf_vp_offset;
2515    brw->clip.vp_offset = clip_vp_offset;
2516    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2517 #endif
2518 }
2519
/* State atom for the SF/CLIP viewport upload: lists the Mesa and driver
 * dirty bits under which genX(upload_sf_clip_viewport) is emitted.
 * _NEW_SCISSOR is only included on Gen5 and earlier, where the emit function
 * also programs the scissor rectangle.
 */
static const struct brw_tracked_state genX(sf_clip_viewport) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_VIEWPORT |
              (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_sf_clip_viewport),
};
2531
2532 /* ---------------------------------------------------------------------- */
2533
2534 static void
2535 genX(upload_gs_state)(struct brw_context *brw)
2536 {
2537    UNUSED struct gl_context *ctx = &brw->ctx;
2538    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
2539    const struct brw_stage_state *stage_state = &brw->gs.base;
2540    const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
2541    /* BRW_NEW_GEOMETRY_PROGRAM */
2542    bool active = GEN_GEN >= 6 && gs_prog;
2543
2544    /* BRW_NEW_GS_PROG_DATA */
2545    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
2546    UNUSED const struct brw_vue_prog_data *vue_prog_data =
2547       brw_vue_prog_data(stage_prog_data);
2548 #if GEN_GEN >= 7
2549    const struct brw_gs_prog_data *gs_prog_data =
2550       brw_gs_prog_data(stage_prog_data);
2551 #endif
2552
2553 #if GEN_GEN == 6
2554    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
2555       if (active && stage_state->push_const_size != 0) {
2556          cgs.Buffer0Valid = true;
2557          cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
2558          cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2559       }
2560    }
2561 #endif
2562
2563 #if GEN_GEN == 7 && !GEN_IS_HASWELL
2564    /**
2565     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2566     * Geometry > Geometry Shader > State:
2567     *
2568     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
2569     *     whole fixed function pipeline when the GS enable changes value in
2570     *     the 3DSTATE_GS."
2571     *
2572     * The hardware architects have clarified that in this context "flush the
2573     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2574     * Stall" bit set.
2575     */
2576    if (devinfo->gt == 2 && brw->gs.enabled != active)
2577       gen7_emit_cs_stall_flush(brw);
2578 #endif
2579
2580 #if GEN_GEN >= 6
2581    brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
2582 #else
2583    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2584    brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
2585 #endif
2586
2587 #if GEN_GEN >= 6
2588       if (active) {
2589          INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
2590
2591 #if GEN_GEN >= 7
2592          gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2593          gs.OutputTopology = gs_prog_data->output_topology;
2594          gs.ControlDataHeaderSize =
2595             gs_prog_data->control_data_header_size_hwords;
2596
2597          gs.InstanceControl = gs_prog_data->invocations - 1;
2598          gs.DispatchMode = vue_prog_data->dispatch_mode;
2599
2600          gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
2601
2602          gs.ControlDataFormat = gs_prog_data->control_data_format;
2603 #endif
2604
2605          /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
2606           * Ivy Bridge and Haswell.
2607           *
2608           * On Ivy Bridge, setting this bit causes the vertices of a triangle
2609           * strip to be delivered to the geometry shader in an order that does
2610           * not strictly follow the OpenGL spec, but preserves triangle
2611           * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
2612           * the geometry shader sees triangles:
2613           *
2614           * (1, 2, 3), (2, 4, 3), (3, 4, 5)
2615           *
2616           * (Clearing the bit is even worse, because it fails to preserve
2617           * orientation).
2618           *
2619           * Triangle strips with adjacency always ordered in a way that preserves
2620           * triangle orientation but does not strictly follow the OpenGL spec,
2621           * regardless of the setting of this bit.
2622           *
2623           * On Haswell, both triangle strips and triangle strips with adjacency
2624           * are always ordered in a way that preserves triangle orientation.
2625           * Setting this bit causes the ordering to strictly follow the OpenGL
2626           * spec.
2627           *
2628           * So in either case we want to set the bit.  Unfortunately on Ivy
2629           * Bridge this will get the order close to correct but not perfect.
2630           */
2631          gs.ReorderMode = TRAILING;
2632          gs.MaximumNumberofThreads =
2633             GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
2634                          : (devinfo->max_gs_threads - 1);
2635
2636 #if GEN_GEN < 7
2637          gs.SOStatisticsEnable = true;
2638          if (gs_prog->info.has_transform_feedback_varyings)
2639             gs.SVBIPayloadEnable = true;
2640
2641          /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
2642           * was previously done for gen6.
2643           *
2644           * TODO: test with both disabled to see if the HW is behaving
2645           * as expected, like in gen7.
2646           */
2647          gs.SingleProgramFlow = true;
2648          gs.VectorMaskEnable = true;
2649 #endif
2650
2651 #if GEN_GEN >= 8
2652          gs.ExpectedVertexCount = gs_prog_data->vertices_in;
2653
2654          if (gs_prog_data->static_vertex_count != -1) {
2655             gs.StaticOutput = true;
2656             gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
2657          }
2658          gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
2659
2660          gs.UserClipDistanceCullTestEnableBitmask =
2661             vue_prog_data->cull_distance_mask;
2662
2663          const int urb_entry_write_offset = 1;
2664          const uint32_t urb_entry_output_length =
2665             DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
2666             urb_entry_write_offset;
2667
2668          gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
2669          gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
2670 #endif
2671       }
2672 #endif
2673
2674 #if GEN_GEN <= 6
2675       if (!active && brw->ff_gs.prog_active) {
2676          /* In gen6, transform feedback for the VS stage is done with an
2677           * ad-hoc GS program. This function provides the needed 3DSTATE_GS
2678           * for this.
2679           */
2680          gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
2681          gs.SingleProgramFlow = true;
2682          gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
2683          gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
2684
2685 #if GEN_GEN <= 5
2686          gs.GRFRegisterCount =
2687             DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
2688          /* BRW_NEW_URB_FENCE */
2689          gs.NumberofURBEntries = brw->urb.nr_gs_entries;
2690          gs.URBEntryAllocationSize = brw->urb.vsize - 1;
2691          gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
2692          gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
2693 #else
2694          gs.Enable = true;
2695          gs.VectorMaskEnable = true;
2696          gs.SVBIPayloadEnable = true;
2697          gs.SVBIPostIncrementEnable = true;
2698          gs.SVBIPostIncrementValue =
2699             brw->ff_gs.prog_data->svbi_postincrement_value;
2700          gs.SOStatisticsEnable = true;
2701          gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
2702 #endif
2703       }
2704 #endif
2705       if (!active && !brw->ff_gs.prog_active) {
2706 #if GEN_GEN < 8
2707          gs.DispatchGRFStartRegisterForURBData = 1;
2708 #if GEN_GEN >= 7
2709          gs.IncludeVertexHandles = true;
2710 #endif
2711 #endif
2712       }
2713
2714 #if GEN_GEN >= 6
2715       gs.StatisticsEnable = true;
2716 #endif
2717 #if GEN_GEN == 5 || GEN_GEN == 6
2718       gs.RenderingEnabled = true;
2719 #endif
2720 #if GEN_GEN <= 5
2721       gs.MaximumVPIndex = brw->clip.viewport_count - 1;
2722 #endif
2723    }
2724
2725 #if GEN_GEN == 6
2726    brw->gs.enabled = active;
2727 #endif
2728 }
2729
/* State atom for the GS unit: dirty bits under which genX(upload_gs_state)
 * is emitted.  The set is generation dependent:
 *  - Gen6 only: _NEW_PROGRAM_CONSTANTS;
 *  - Gen5 and earlier: push-constant allocation, program cache, URB fence
 *    and viewport count;
 *  - Gen6+: context, geometry program and GS program data;
 *  - before Gen7: fixed-function GS program data.
 */
static const struct brw_tracked_state genX(gs_state) = {
   .dirty = {
      .mesa  = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                               BRW_NEW_PROGRAM_CACHE |
                               BRW_NEW_URB_FENCE |
                               BRW_NEW_VIEWPORT_COUNT
                             : 0) |
               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
                               BRW_NEW_GEOMETRY_PROGRAM |
                               BRW_NEW_GS_PROG_DATA
                             : 0) |
               (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
   },
   .emit = genX(upload_gs_state),
};
2748
2749 /* ---------------------------------------------------------------------- */
2750
2751 UNUSED static GLenum
2752 fix_dual_blend_alpha_to_one(GLenum function)
2753 {
2754    switch (function) {
2755    case GL_SRC1_ALPHA:
2756       return GL_ONE;
2757
2758    case GL_ONE_MINUS_SRC1_ALPHA:
2759       return GL_ZERO;
2760    }
2761
2762    return function;
2763 }
2764
/* Local shorthands for translating GL blend factors / blend equations into
 * their hardware encodings via the brw_translate_* helpers.
 */
#define blend_factor(x) brw_translate_blend_factor(x)
#define blend_eqn(x) brw_translate_blend_equation(x)
2767
2768 /**
2769  * Modify blend function to force destination alpha to 1.0
2770  *
2771  * If \c function specifies a blend function that uses destination alpha,
2772  * replace it with a function that hard-wires destination alpha to 1.0.  This
2773  * is used when rendering to xRGB targets.
2774  */
2775 static GLenum
2776 brw_fix_xRGB_alpha(GLenum function)
2777 {
2778    switch (function) {
2779    case GL_DST_ALPHA:
2780       return GL_ONE;
2781
2782    case GL_ONE_MINUS_DST_ALPHA:
2783    case GL_SRC_ALPHA_SATURATE:
2784       return GL_ZERO;
2785    }
2786
2787    return function;
2788 }
2789
/* Type of the structure that carries the per-render-target blend fields:
 * BLEND_STATE_ENTRY on Gen6+, COLOR_CALC_STATE on earlier generations.
 * set_blend_entry_bits() writes through a pointer of this type.
 */
#if GEN_GEN >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
2795
2796 UNUSED static bool
2797 set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
2798                      bool alpha_to_one)
2799 {
2800    struct gl_context *ctx = &brw->ctx;
2801
2802    /* _NEW_BUFFERS */
2803    const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
2804
2805    bool independent_alpha_blend = false;
2806
2807    /* Used for implementing the following bit of GL_EXT_texture_integer:
2808     * "Per-fragment operations that require floating-point color
2809     *  components, including multisample alpha operations, alpha test,
2810     *  blending, and dithering, have no effect when the corresponding
2811     *  colors are written to an integer color buffer."
2812     */
2813    const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
2814
2815    const unsigned blend_enabled = GEN_GEN >= 6 ?
2816       ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
2817
2818    /* _NEW_COLOR */
2819    if (ctx->Color.ColorLogicOpEnabled) {
2820       GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
2821          : GL_UNSIGNED_NORMALIZED;
2822       WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
2823                 rb_type != GL_UNSIGNED_NORMALIZED &&
2824                 rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
2825                 "renderbuffer\n",
2826                 _mesa_enum_to_string(ctx->Color.LogicOp),
2827                 _mesa_enum_to_string(rb_type));
2828       if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
2829          entry->LogicOpEnable = true;
2830          entry->LogicOpFunction = ctx->Color._LogicOp;
2831       }
2832    } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
2833               && (GEN_GEN <= 5 || !integer)) {
2834       GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
2835       GLenum eqA = ctx->Color.Blend[i].EquationA;
2836       GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
2837       GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
2838       GLenum srcA = ctx->Color.Blend[i].SrcA;
2839       GLenum dstA = ctx->Color.Blend[i].DstA;
2840
2841       if (eqRGB == GL_MIN || eqRGB == GL_MAX)
2842          srcRGB = dstRGB = GL_ONE;
2843
2844       if (eqA == GL_MIN || eqA == GL_MAX)
2845          srcA = dstA = GL_ONE;
2846
2847       /* Due to hardware limitations, the destination may have information
2848        * in an alpha channel even when the format specifies no alpha
2849        * channel. In order to avoid getting any incorrect blending due to
2850        * that alpha channel, coerce the blend factors to values that will
2851        * not read the alpha channel, but will instead use the correct
2852        * implicit value for alpha.
2853        */
2854       if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
2855                                                GL_TEXTURE_ALPHA_TYPE)) {
2856          srcRGB = brw_fix_xRGB_alpha(srcRGB);
2857          srcA = brw_fix_xRGB_alpha(srcA);
2858          dstRGB = brw_fix_xRGB_alpha(dstRGB);
2859          dstA = brw_fix_xRGB_alpha(dstA);
2860       }
2861
2862       /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
2863        * "If Dual Source Blending is enabled, this bit must be disabled."
2864        *
2865        * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
2866        * and leave it enabled anyway.
2867        */
2868       if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
2869          srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
2870          srcA = fix_dual_blend_alpha_to_one(srcA);
2871          dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
2872          dstA = fix_dual_blend_alpha_to_one(dstA);
2873       }
2874
2875       entry->ColorBufferBlendEnable = true;
2876       entry->DestinationBlendFactor = blend_factor(dstRGB);
2877       entry->SourceBlendFactor = blend_factor(srcRGB);
2878       entry->DestinationAlphaBlendFactor = blend_factor(dstA);
2879       entry->SourceAlphaBlendFactor = blend_factor(srcA);
2880       entry->ColorBlendFunction = blend_eqn(eqRGB);
2881       entry->AlphaBlendFunction = blend_eqn(eqA);
2882
2883       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
2884          independent_alpha_blend = true;
2885    }
2886
2887    return independent_alpha_blend;
2888 }
2889
2890 #if GEN_GEN >= 6
/**
 * Build the blend state for all color draw buffers and emit the packet that
 * points the hardware at it.
 *
 * The state is allocated with brw_state_batch() and its offset stored in
 * brw->cc.blend_state_offset.  On Gen8+ the buffer holds one BLEND_STATE
 * header followed by the per-buffer BLEND_STATE_ENTRY structures; on Gen6-7
 * it holds only the entries, each of which also carries the fields that live
 * in the header on Gen8+.  The pointer is emitted through
 * 3DSTATE_CC_STATE_POINTERS (Gen6) or 3DSTATE_BLEND_STATE_POINTERS (Gen7+).
 */
static void
genX(upload_blend_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   int size;

   /* We need at least one BLEND_STATE written, because we might do
    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
    * for computed depth or alpha test), which will do an FB write
    * with render target 0, which will reference BLEND_STATE[0] for
    * alpha test enable.
    */
   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
      nr_draw_buffers = 1;

   /* Lengths are in dwords; brw_state_batch() takes a size in bytes. */
   size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
#if GEN_GEN >= 8
   size += GENX(BLEND_STATE_length) * 4;
#endif

   uint32_t *blend_map;
   blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);

   /* The two generations share the code below but nest it differently:
    * Gen8+ computes the header bits once in a plain block and loops over the
    * entries inside it; Gen6-7 loops over the entries on the outside and
    * aliases "blend" to the current entry.
    */
#if GEN_GEN >= 8
   struct GENX(BLEND_STATE) blend = { 0 };
   {
#else
   for (int i = 0; i < nr_draw_buffers; i++) {
      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#define blend entry
#endif
      /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
       * "If drawbuffer zero is not NONE and the buffer it references has an
       * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
       * operations are skipped."
       */
      if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
         /* _NEW_MULTISAMPLE */
         if (_mesa_is_multisample_enabled(ctx)) {
            if (ctx->Multisample.SampleAlphaToCoverage) {
               blend.AlphaToCoverageEnable = true;
               blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
            }
            if (ctx->Multisample.SampleAlphaToOne)
               blend.AlphaToOneEnable = true;
         }

         /* _NEW_COLOR */
         if (ctx->Color.AlphaEnabled) {
            blend.AlphaTestEnable = true;
            blend.AlphaTestFunction =
               intel_translate_compare_func(ctx->Color.AlphaFunc);
         }

         if (ctx->Color.DitherFlag) {
            blend.ColorDitherEnable = true;
         }
      }

#if GEN_GEN >= 8
      for (int i = 0; i < nr_draw_buffers; i++) {
         struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#else
      {
#endif
         /* Accumulate with OR so that, on Gen8+, one buffer needing
          * independent alpha blending turns it on in the shared header.
          */
         blend.IndependentAlphaBlendEnable =
            set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
            blend.IndependentAlphaBlendEnable;

         /* See section 8.1.6 "Pre-Blend Color Clamping" of the
          * SandyBridge PRM Volume 2 Part 1 for HW requirements.
          *
          * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
          * clamping in the fragment shader.  For its clamping of
          * blending, the spec says:
          *
          *     "RESOLVED: For fixed-point color buffers, the inputs and
          *      the result of the blending equation are clamped.  For
          *      floating-point color buffers, no clamping occurs."
          *
          * So, generally, we want clamping to the render target's range.
          * And, good news, the hardware tables for both pre- and
          * post-blend color clamping are either ignored, or any are
          * allowed, or clamping is required but RT range clamping is a
          * valid option.
          */
         entry.PreBlendColorClampEnable = true;
         entry.PostBlendColorClampEnable = true;
         entry.ColorClampRange = COLORCLAMP_RTFORMAT;

         /* The hardware fields are write *disables*, hence the negation. */
         entry.WriteDisableRed   = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
         entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
         entry.WriteDisableBlue  = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
         entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);

         /* NOTE(review): the indexing assumes two dwords per entry and, on
          * Gen8+, a one-dword header in front of the entries -- confirm
          * against the generated BLEND_STATE*_length values.
          */
#if GEN_GEN >= 8
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
#else
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
#endif
      }
   }

#if GEN_GEN >= 8
   GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
#endif

#if GEN_GEN < 7
   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
      ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
      ptr.BLEND_STATEChange = true;
   }
#else
   brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
      ptr.BlendStatePointer = brw->cc.blend_state_offset;
#if GEN_GEN >= 8
      ptr.BlendStatePointerValid = true;
#endif
   }
#endif
}
3013
/* State atom for blend state: genX(upload_blend_state) is emitted when the
 * draw buffers, color state or multisample state change, and on a new batch,
 * after BLORP, or when the state base address changes.
 */
static const struct brw_tracked_state genX(blend_state) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_COLOR |
              _NEW_MULTISAMPLE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_STATE_BASE_ADDRESS,
   },
   .emit = genX(upload_blend_state),
};
3025 #endif
3026
3027 /* ---------------------------------------------------------------------- */
3028
3029 #if GEN_GEN >= 7
/* Per-stage 3D command sub-opcode, indexed by gl_shader_stage.
 * genX(upload_push_constant_packets) writes the selected value into
 * _3DCommandSubOpcode of a 3DSTATE_CONSTANT_VS template so that one packet
 * layout serves every stage.  The compute entry is 0 and is not reached by
 * that loop, which stops at MESA_SHADER_FRAGMENT.
 */
UNUSED static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX]                      = 21,
   [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
   [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
   [MESA_SHADER_GEOMETRY]                    = 22,
   [MESA_SHADER_FRAGMENT]                    = 23,
   [MESA_SHADER_COMPUTE]                     = 0,
};
3038
/**
 * Emit a 3DSTATE_CONSTANT_* packet for every render stage whose push
 * constants are flagged dirty.
 *
 * Stages are visited from vertex through fragment; a stage with
 * push_constants_dirty clear is skipped.  On Haswell and Gen8+ the packet can
 * carry up to four buffers: UBO ranges from the program data are placed
 * first, starting at the highest slot, and the regular push-constant buffer
 * takes the next lower slot.  On other Gen7 parts only buffer 0 is used.
 * After emitting, the stage's dirty flag is cleared and, on Gen9+,
 * BRW_NEW_SURFACES is flagged.
 */
static void
genX(upload_push_constant_packets)(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;

   UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;

   /* Order must match the stage index used to subscript this array below. */
   struct brw_stage_state *stage_states[] = {
      &brw->vs.base,
      &brw->tcs.base,
      &brw->tes.base,
      &brw->gs.base,
      &brw->wm.base,
   };

   /* Gen7 parts other than Haswell and Bay Trail get a workaround flush
    * before new VS push constants are emitted.
    */
   if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
       stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
      gen7_emit_vs_workaround_flush(brw);

   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      struct brw_stage_state *stage_state = stage_states[stage];
      UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];

      if (!stage_state->push_constants_dirty)
         continue;

      /* The VS packet template is reused for all stages; only the
       * sub-opcode differs.
       */
      brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
         pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
         if (stage_state->prog_data) {
#if GEN_GEN >= 8 || GEN_IS_HASWELL
            /* The Skylake PRM contains the following restriction:
             *
             *    "The driver must ensure The following case does not occur
             *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
             *     buffer 3 read length equal to zero committed followed by a
             *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
             *     zero committed."
             *
             * To avoid this, we program the buffers in the highest slots.
             * This way, slot 0 is only used if slot 3 is also used.
             */
            int n = 3;

            for (int i = 3; i >= 0; i--) {
               const struct brw_ubo_range *range =
                  &stage_state->prog_data->ubo_ranges[i];

               /* Unused range: consumes no packet slot. */
               if (range->length == 0)
                  continue;

               const struct gl_uniform_block *block =
                  prog->sh.UniformBlocks[range->block];
               const struct gl_buffer_binding *binding =
                  &ctx->UniformBufferBindings[block->Binding];

               /* No buffer bound at this binding point: report it and leave
                * the slot unprogrammed.
                */
               if (binding->BufferObject == ctx->Shared->NullBufferObj) {
                  static unsigned msg_id = 0;
                  _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
                                 MESA_DEBUG_TYPE_UNDEFINED,
                                 MESA_DEBUG_SEVERITY_HIGH,
                                 "UBO %d unbound, %s shader uniform data "
                                 "will be undefined.",
                                 range->block,
                                 _mesa_shader_stage_to_string(stage));
                  continue;
               }

               assert(binding->Offset % 32 == 0);

               /* range->start and range->length are scaled by 32 to get
                * byte quantities.
                */
               struct brw_bo *bo = intel_bufferobj_buffer(brw,
                  intel_buffer_object(binding->BufferObject),
                  binding->Offset, range->length * 32, false);

               pkt.ConstantBody.ReadLength[n] = range->length;
               pkt.ConstantBody.Buffer[n] =
                  ro_bo(bo, range->start * 32 + binding->Offset);
               n--;
            }

            /* Regular push constants take the next free (lower) slot. */
            if (stage_state->push_const_size > 0) {
               assert(n >= 0);
               pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
               pkt.ConstantBody.Buffer[n] =
                  ro_bo(stage_state->push_const_bo,
                        stage_state->push_const_offset);
            }
#else
            pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
            pkt.ConstantBody.Buffer[0].offset =
               stage_state->push_const_offset | mocs;
#endif
         }
      }

      stage_state->push_constants_dirty = false;
      brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
   }
}
3138
/* State atom for the push-constant packets.  It is keyed on
 * BRW_NEW_DRAW_CALL only; the emit function itself decides per stage, via
 * push_constants_dirty, whether a packet is actually written.  Unlike the
 * neighbouring atoms in this file it is not declared static.
 */
const struct brw_tracked_state genX(push_constant_packets) = {
   .dirty = {
      .mesa  = 0,
      .brw   = BRW_NEW_DRAW_CALL,
   },
   .emit = genX(upload_push_constant_packets),
};
3146 #endif
3147
3148 #if GEN_GEN >= 6
3149 static void
3150 genX(upload_vs_push_constants)(struct brw_context *brw)
3151 {
3152    struct brw_stage_state *stage_state = &brw->vs.base;
3153
3154    /* BRW_NEW_VERTEX_PROGRAM */
3155    const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3156    /* BRW_NEW_VS_PROG_DATA */
3157    const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3158
3159    gen6_upload_push_constants(brw, vp, prog_data, stage_state);
3160 }
3161
/* State atom for VS push constants: genX(upload_vs_push_constants) is
 * emitted when program constants or transform state change, on a new batch,
 * after BLORP, or when the vertex program or its program data change.
 */
static const struct brw_tracked_state genX(vs_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS |
               _NEW_TRANSFORM,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_VERTEX_PROGRAM |
               BRW_NEW_VS_PROG_DATA,
   },
   .emit = genX(upload_vs_push_constants),
};
3173
3174 static void
3175 genX(upload_gs_push_constants)(struct brw_context *brw)
3176 {
3177    struct brw_stage_state *stage_state = &brw->gs.base;
3178
3179    /* BRW_NEW_GEOMETRY_PROGRAM */
3180    const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3181
3182    /* BRW_NEW_GS_PROG_DATA */
3183    struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3184
3185    gen6_upload_push_constants(brw, gp, prog_data, stage_state);
3186 }
3187
/* State atom for GS push constants: genX(upload_gs_push_constants) is
 * emitted when program constants or transform state change, on a new batch,
 * after BLORP, or when the geometry program or its program data change.
 */
static const struct brw_tracked_state genX(gs_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS |
               _NEW_TRANSFORM,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_GEOMETRY_PROGRAM |
               BRW_NEW_GS_PROG_DATA,
   },
   .emit = genX(upload_gs_push_constants),
};
3199
3200 static void
3201 genX(upload_wm_push_constants)(struct brw_context *brw)
3202 {
3203    struct brw_stage_state *stage_state = &brw->wm.base;
3204    /* BRW_NEW_FRAGMENT_PROGRAM */
3205    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3206    /* BRW_NEW_FS_PROG_DATA */
3207    const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3208
3209    gen6_upload_push_constants(brw, fp, prog_data, stage_state);
3210 }
3211
3212 static const struct brw_tracked_state genX(wm_push_constants) = {
3213    .dirty = {
3214       .mesa  = _NEW_PROGRAM_CONSTANTS,
3215       .brw   = BRW_NEW_BATCH |
3216                BRW_NEW_BLORP |
3217                BRW_NEW_FRAGMENT_PROGRAM |
3218                BRW_NEW_FS_PROG_DATA,
3219    },
3220    .emit = genX(upload_wm_push_constants),
3221 };
3222 #endif
3223
3224 /* ---------------------------------------------------------------------- */
3225
3226 #if GEN_GEN >= 6
3227 static unsigned
3228 genX(determine_sample_mask)(struct brw_context *brw)
3229 {
3230    struct gl_context *ctx = &brw->ctx;
3231    float coverage = 1.0f;
3232    float coverage_invert = false;
3233    unsigned sample_mask = ~0u;
3234
3235    /* BRW_NEW_NUM_SAMPLES */
3236    unsigned num_samples = brw->num_samples;
3237
3238    if (_mesa_is_multisample_enabled(ctx)) {
3239       if (ctx->Multisample.SampleCoverage) {
3240          coverage = ctx->Multisample.SampleCoverageValue;
3241          coverage_invert = ctx->Multisample.SampleCoverageInvert;
3242       }
3243       if (ctx->Multisample.SampleMask) {
3244          sample_mask = ctx->Multisample.SampleMaskValue;
3245       }
3246    }
3247
3248    if (num_samples > 1) {
3249       int coverage_int = (int) (num_samples * coverage + 0.5f);
3250       uint32_t coverage_bits = (1 << coverage_int) - 1;
3251       if (coverage_invert)
3252          coverage_bits ^= (1 << num_samples) - 1;
3253       return coverage_bits & sample_mask;
3254    } else {
3255       return 1;
3256    }
3257 }
3258
3259 static void
3260 genX(emit_3dstate_multisample2)(struct brw_context *brw,
3261                                 unsigned num_samples)
3262 {
3263    unsigned log2_samples = ffs(num_samples) - 1;
3264
3265    brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
3266       multi.PixelLocation = CENTER;
3267       multi.NumberofMultisamples = log2_samples;
3268 #if GEN_GEN == 6
3269       GEN_SAMPLE_POS_4X(multi.Sample);
3270 #elif GEN_GEN == 7
3271       switch (num_samples) {
3272       case 1:
3273          GEN_SAMPLE_POS_1X(multi.Sample);
3274          break;
3275       case 2:
3276          GEN_SAMPLE_POS_2X(multi.Sample);
3277          break;
3278       case 4:
3279          GEN_SAMPLE_POS_4X(multi.Sample);
3280          break;
3281       case 8:
3282          GEN_SAMPLE_POS_8X(multi.Sample);
3283          break;
3284       default:
3285          break;
3286       }
3287 #endif
3288    }
3289 }
3290
3291 static void
3292 genX(upload_multisample_state)(struct brw_context *brw)
3293 {
3294    assert(brw->num_samples > 0 && brw->num_samples <= 16);
3295
3296    genX(emit_3dstate_multisample2)(brw, brw->num_samples);
3297
3298    brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
3299       sm.SampleMask = genX(determine_sample_mask)(brw);
3300    }
3301 }
3302
3303 static const struct brw_tracked_state genX(multisample_state) = {
3304    .dirty = {
3305       .mesa = _NEW_MULTISAMPLE |
3306               (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
3307       .brw = BRW_NEW_BLORP |
3308              BRW_NEW_CONTEXT |
3309              BRW_NEW_NUM_SAMPLES,
3310    },
3311    .emit = genX(upload_multisample_state)
3312 };
3313 #endif
3314
3315 /* ---------------------------------------------------------------------- */
3316
3317 static void
3318 genX(upload_color_calc_state)(struct brw_context *brw)
3319 {
3320    struct gl_context *ctx = &brw->ctx;
3321
3322    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
3323 #if GEN_GEN <= 5
3324       cc.IndependentAlphaBlendEnable =
3325          set_blend_entry_bits(brw, &cc, 0, false);
3326       set_depth_stencil_bits(brw, &cc);
3327
3328       if (ctx->Color.AlphaEnabled &&
3329           ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
3330          cc.AlphaTestEnable = true;
3331          cc.AlphaTestFunction =
3332             intel_translate_compare_func(ctx->Color.AlphaFunc);
3333       }
3334
3335       cc.ColorDitherEnable = ctx->Color.DitherFlag;
3336
3337       cc.StatisticsEnable = brw->stats_wm;
3338
3339       cc.CCViewportStatePointer =
3340          ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
3341 #else
3342       /* _NEW_COLOR */
3343       cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
3344       cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
3345       cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
3346       cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
3347
3348 #if GEN_GEN < 9
3349       /* _NEW_STENCIL */
3350       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
3351       cc.BackfaceStencilReferenceValue =
3352          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
3353 #endif
3354
3355 #endif
3356
3357       /* _NEW_COLOR */
3358       UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
3359                                ctx->Color.AlphaRef);
3360    }
3361
3362 #if GEN_GEN >= 6
3363    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3364       ptr.ColorCalcStatePointer = brw->cc.state_offset;
3365 #if GEN_GEN != 7
3366       ptr.ColorCalcStatePointerValid = true;
3367 #endif
3368    }
3369 #else
3370    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
3371 #endif
3372 }
3373
3374 static const struct brw_tracked_state genX(color_calc_state) = {
3375    .dirty = {
3376       .mesa = _NEW_COLOR |
3377               _NEW_STENCIL |
3378               (GEN_GEN <= 5 ? _NEW_BUFFERS |
3379                               _NEW_DEPTH
3380                             : 0),
3381       .brw = BRW_NEW_BATCH |
3382              BRW_NEW_BLORP |
3383              (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
3384                              BRW_NEW_STATS_WM
3385                            : BRW_NEW_CC_STATE |
3386                              BRW_NEW_STATE_BASE_ADDRESS),
3387    },
3388    .emit = genX(upload_color_calc_state),
3389 };
3390
3391
3392 /* ---------------------------------------------------------------------- */
3393
3394 #if GEN_GEN >= 7
3395 static void
3396 genX(upload_sbe)(struct brw_context *brw)
3397 {
3398    struct gl_context *ctx = &brw->ctx;
3399    /* BRW_NEW_FRAGMENT_PROGRAM */
3400    UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3401    /* BRW_NEW_FS_PROG_DATA */
3402    const struct brw_wm_prog_data *wm_prog_data =
3403       brw_wm_prog_data(brw->wm.base.prog_data);
3404 #if GEN_GEN >= 8
3405    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3406 #else
3407 #define attr_overrides sbe.Attribute
3408 #endif
3409    uint32_t urb_entry_read_length;
3410    uint32_t urb_entry_read_offset;
3411    uint32_t point_sprite_enables;
3412
3413    brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3414       sbe.AttributeSwizzleEnable = true;
3415       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3416
3417       /* _NEW_BUFFERS */
3418       bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3419
3420       /* _NEW_POINT
3421        *
3422        * Window coordinates in an FBO are inverted, which means point
3423        * sprite origin must be inverted.
3424        */
3425       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
3426          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3427       else
3428          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3429
3430       /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3431        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3432        * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3433        * BRW_NEW_VUE_MAP_GEOM_OUT
3434        */
3435       genX(calculate_attr_overrides)(brw,
3436                                      attr_overrides,
3437                                      &point_sprite_enables,
3438                                      &urb_entry_read_length,
3439                                      &urb_entry_read_offset);
3440
3441       /* Typically, the URB entry read length and offset should be programmed
3442        * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3443        * stage which produces geometry.  However, we don't know the proper
3444        * value until we call calculate_attr_overrides().
3445        *
3446        * To fit with our existing code, we override the inherited values and
3447        * specify it here directly, as we did on previous generations.
3448        */
3449       sbe.VertexURBEntryReadLength = urb_entry_read_length;
3450       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3451       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3452       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3453
3454 #if GEN_GEN >= 8
3455       sbe.ForceVertexURBEntryReadLength = true;
3456       sbe.ForceVertexURBEntryReadOffset = true;
3457 #endif
3458
3459 #if GEN_GEN >= 9
3460       /* prepare the active component dwords */
3461       const int num_inputs = urb_entry_read_length * 2;
3462       for (int input_index = 0; input_index < num_inputs; input_index++) {
3463          sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
3464       }
3465 #endif
3466    }
3467
3468 #if GEN_GEN >= 8
3469    brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3470       for (int i = 0; i < 16; i++)
3471          sbes.Attribute[i] = attr_overrides[i];
3472    }
3473 #endif
3474
3475 #undef attr_overrides
3476 }
3477
3478 static const struct brw_tracked_state genX(sbe_state) = {
3479    .dirty = {
3480       .mesa  = _NEW_BUFFERS |
3481                _NEW_LIGHT |
3482                _NEW_POINT |
3483                _NEW_POLYGON |
3484                _NEW_PROGRAM,
3485       .brw   = BRW_NEW_BLORP |
3486                BRW_NEW_CONTEXT |
3487                BRW_NEW_FRAGMENT_PROGRAM |
3488                BRW_NEW_FS_PROG_DATA |
3489                BRW_NEW_GS_PROG_DATA |
3490                BRW_NEW_TES_PROG_DATA |
3491                BRW_NEW_VUE_MAP_GEOM_OUT |
3492                (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
3493                              : 0),
3494    },
3495    .emit = genX(upload_sbe),
3496 };
3497 #endif
3498
3499 /* ---------------------------------------------------------------------- */
3500
3501 #if GEN_GEN >= 7
3502 /**
3503  * Outputs the 3DSTATE_SO_DECL_LIST command.
3504  *
3505  * The data output is a series of 64-bit entries containing a SO_DECL per
3506  * stream.  We only have one stream of rendering coming out of the GS unit, so
3507  * we only emit stream 0 (low 16 bits) SO_DECLs.
3508  */
3509 static void
3510 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3511                                   const struct brw_vue_map *vue_map)
3512 {
3513    struct gl_context *ctx = &brw->ctx;
3514    /* BRW_NEW_TRANSFORM_FEEDBACK */
3515    struct gl_transform_feedback_object *xfb_obj =
3516       ctx->TransformFeedback.CurrentObject;
3517    const struct gl_transform_feedback_info *linked_xfb_info =
3518       xfb_obj->program->sh.LinkedTransformFeedback;
3519    struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3520    int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3521    int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3522    int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3523    int max_decls = 0;
3524    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3525
3526    memset(so_decl, 0, sizeof(so_decl));
3527
3528    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3529     * command feels strange -- each dword pair contains a SO_DECL per stream.
3530     */
3531    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3532       const struct gl_transform_feedback_output *output =
3533          &linked_xfb_info->Outputs[i];
3534       const int buffer = output->OutputBuffer;
3535       const int varying = output->OutputRegister;
3536       const unsigned stream_id = output->StreamId;
3537       assert(stream_id < MAX_VERTEX_STREAMS);
3538
3539       buffer_mask[stream_id] |= 1 << buffer;
3540
3541       assert(vue_map->varying_to_slot[varying] >= 0);
3542
3543       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3544        * array.  Instead, it simply increments DstOffset for the following
3545        * input by the number of components that should be skipped.
3546        *
3547        * Our hardware is unusual in that it requires us to program SO_DECLs
3548        * for fake "hole" components, rather than simply taking the offset
3549        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3550        * program as many size = 4 holes as we can, then a final hole to
3551        * accommodate the final 1, 2, or 3 remaining.
3552        */
3553       int skip_components = output->DstOffset - next_offset[buffer];
3554
3555       while (skip_components > 0) {
3556          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3557             .HoleFlag = 1,
3558             .OutputBufferSlot = output->OutputBuffer,
3559             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3560          };
3561          skip_components -= 4;
3562       }
3563
3564       next_offset[buffer] = output->DstOffset + output->NumComponents;
3565
3566       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3567          .OutputBufferSlot = output->OutputBuffer,
3568          .RegisterIndex = vue_map->varying_to_slot[varying],
3569          .ComponentMask =
3570             ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3571       };
3572
3573       if (decls[stream_id] > max_decls)
3574          max_decls = decls[stream_id];
3575    }
3576
3577    uint32_t *dw;
3578    dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3579                         .StreamtoBufferSelects0 = buffer_mask[0],
3580                         .StreamtoBufferSelects1 = buffer_mask[1],
3581                         .StreamtoBufferSelects2 = buffer_mask[2],
3582                         .StreamtoBufferSelects3 = buffer_mask[3],
3583                         .NumEntries0 = decls[0],
3584                         .NumEntries1 = decls[1],
3585                         .NumEntries2 = decls[2],
3586                         .NumEntries3 = decls[3]);
3587
3588    for (int i = 0; i < max_decls; i++) {
3589       GENX(SO_DECL_ENTRY_pack)(
3590          brw, dw + 2 + i * 2,
3591          &(struct GENX(SO_DECL_ENTRY)) {
3592             .Stream0Decl = so_decl[0][i],
3593             .Stream1Decl = so_decl[1][i],
3594             .Stream2Decl = so_decl[2][i],
3595             .Stream3Decl = so_decl[3][i],
3596          });
3597    }
3598 }
3599
3600 static void
3601 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3602 {
3603    struct gl_context *ctx = &brw->ctx;
3604    /* BRW_NEW_TRANSFORM_FEEDBACK */
3605    struct gl_transform_feedback_object *xfb_obj =
3606       ctx->TransformFeedback.CurrentObject;
3607 #if GEN_GEN < 8
3608    const struct gl_transform_feedback_info *linked_xfb_info =
3609       xfb_obj->program->sh.LinkedTransformFeedback;
3610 #else
3611    struct brw_transform_feedback_object *brw_obj =
3612       (struct brw_transform_feedback_object *) xfb_obj;
3613    uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3614 #endif
3615
3616    /* Set up the up to 4 output buffers.  These are the ranges defined in the
3617     * gl_transform_feedback_object.
3618     */
3619    for (int i = 0; i < 4; i++) {
3620       struct intel_buffer_object *bufferobj =
3621          intel_buffer_object(xfb_obj->Buffers[i]);
3622
3623       if (!bufferobj) {
3624          brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3625             sob.SOBufferIndex = i;
3626          }
3627          continue;
3628       }
3629
3630       uint32_t start = xfb_obj->Offset[i];
3631       assert(start % 4 == 0);
3632       uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3633       struct brw_bo *bo =
3634          intel_bufferobj_buffer(brw, bufferobj, start, end - start, true);
3635       assert(end <= bo->size);
3636
3637       brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3638          sob.SOBufferIndex = i;
3639
3640          sob.SurfaceBaseAddress = rw_bo(bo, start);
3641 #if GEN_GEN < 8
3642          sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3643          sob.SurfaceEndAddress = rw_bo(bo, end);
3644 #else
3645          sob.SOBufferEnable = true;
3646          sob.StreamOffsetWriteEnable = true;
3647          sob.StreamOutputBufferOffsetAddressEnable = true;
3648          sob.SOBufferMOCS = mocs_wb;
3649
3650          sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
3651          sob.StreamOutputBufferOffsetAddress =
3652             rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3653
3654          if (brw_obj->zero_offsets) {
3655             /* Zero out the offset and write that to offset_bo */
3656             sob.StreamOffset = 0;
3657          } else {
3658             /* Use offset_bo as the "Stream Offset." */
3659             sob.StreamOffset = 0xFFFFFFFF;
3660          }
3661 #endif
3662       }
3663    }
3664
3665 #if GEN_GEN >= 8
3666    brw_obj->zero_offsets = false;
3667 #endif
3668 }
3669
3670 static bool
3671 query_active(struct gl_query_object *q)
3672 {
3673    return q && q->Active;
3674 }
3675
3676 static void
3677 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3678                                const struct brw_vue_map *vue_map)
3679 {
3680    struct gl_context *ctx = &brw->ctx;
3681    /* BRW_NEW_TRANSFORM_FEEDBACK */
3682    struct gl_transform_feedback_object *xfb_obj =
3683       ctx->TransformFeedback.CurrentObject;
3684
3685    brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3686       if (active) {
3687          int urb_entry_read_offset = 0;
3688          int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3689             urb_entry_read_offset;
3690
3691          sos.SOFunctionEnable = true;
3692          sos.SOStatisticsEnable = true;
3693
3694          /* BRW_NEW_RASTERIZER_DISCARD */
3695          if (ctx->RasterDiscard) {
3696             if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3697                sos.RenderingDisable = true;
3698             } else {
3699                perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3700                           "query active relies on the clipper.\n");
3701             }
3702          }
3703
3704          /* _NEW_LIGHT */
3705          if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3706             sos.ReorderMode = TRAILING;
3707
3708 #if GEN_GEN < 8
3709          sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3710          sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3711          sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3712          sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3713 #else
3714          const struct gl_transform_feedback_info *linked_xfb_info =
3715             xfb_obj->program->sh.LinkedTransformFeedback;
3716          /* Set buffer pitches; 0 means unbound. */
3717          if (xfb_obj->Buffers[0])
3718             sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3719          if (xfb_obj->Buffers[1])
3720             sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3721          if (xfb_obj->Buffers[2])
3722             sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3723          if (xfb_obj->Buffers[3])
3724             sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3725 #endif
3726
3727          /* We always read the whole vertex.  This could be reduced at some
3728           * point by reading less and offsetting the register index in the
3729           * SO_DECLs.
3730           */
3731          sos.Stream0VertexReadOffset = urb_entry_read_offset;
3732          sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3733          sos.Stream1VertexReadOffset = urb_entry_read_offset;
3734          sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3735          sos.Stream2VertexReadOffset = urb_entry_read_offset;
3736          sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3737          sos.Stream3VertexReadOffset = urb_entry_read_offset;
3738          sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3739       }
3740    }
3741 }
3742
3743 static void
3744 genX(upload_sol)(struct brw_context *brw)
3745 {
3746    struct gl_context *ctx = &brw->ctx;
3747    /* BRW_NEW_TRANSFORM_FEEDBACK */
3748    bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3749
3750    if (active) {
3751       genX(upload_3dstate_so_buffers)(brw);
3752
3753       /* BRW_NEW_VUE_MAP_GEOM_OUT */
3754       genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3755    }
3756
3757    /* Finally, set up the SOL stage.  This command must always follow updates to
3758     * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3759     * MMIO register updates (current performed by the kernel at each batch
3760     * emit).
3761     */
3762    genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3763 }
3764
3765 static const struct brw_tracked_state genX(sol_state) = {
3766    .dirty = {
3767       .mesa  = _NEW_LIGHT,
3768       .brw   = BRW_NEW_BATCH |
3769                BRW_NEW_BLORP |
3770                BRW_NEW_RASTERIZER_DISCARD |
3771                BRW_NEW_VUE_MAP_GEOM_OUT |
3772                BRW_NEW_TRANSFORM_FEEDBACK,
3773    },
3774    .emit = genX(upload_sol),
3775 };
3776 #endif
3777
3778 /* ---------------------------------------------------------------------- */
3779
3780 #if GEN_GEN >= 7
3781 static void
3782 genX(upload_ps)(struct brw_context *brw)
3783 {
3784    UNUSED const struct gl_context *ctx = &brw->ctx;
3785    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
3786
3787    /* BRW_NEW_FS_PROG_DATA */
3788    const struct brw_wm_prog_data *prog_data =
3789       brw_wm_prog_data(brw->wm.base.prog_data);
3790    const struct brw_stage_state *stage_state = &brw->wm.base;
3791
3792 #if GEN_GEN < 8
3793 #endif
3794
3795    brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
3796       /* Initialize the execution mask with VMask.  Otherwise, derivatives are
3797        * incorrect for subspans where some of the pixels are unlit.  We believe
3798        * the bit just didn't take effect in previous generations.
3799        */
3800       ps.VectorMaskEnable = GEN_GEN >= 8;
3801
3802       ps.SamplerCount =
3803          DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
3804
3805       /* BRW_NEW_FS_PROG_DATA */
3806       ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
3807
3808       if (prog_data->base.use_alt_mode)
3809          ps.FloatingPointMode = Alternate;
3810
3811       /* Haswell requires the sample mask to be set in this packet as well as
3812        * in 3DSTATE_SAMPLE_MASK; the values should match.
3813        */
3814
3815       /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
3816 #if GEN_IS_HASWELL
3817       ps.SampleMask = genX(determine_sample_mask(brw));
3818 #endif
3819
3820       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
3821        * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is
3822        * k, it implies 2(k+1) threads. It implicitly scales for different GT
3823        * levels (which have some # of PSDs).
3824        *
3825        * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
3826        */
3827 #if GEN_GEN >= 9
3828       ps.MaximumNumberofThreadsPerPSD = 64 - 1;
3829 #elif GEN_GEN >= 8
3830       ps.MaximumNumberofThreadsPerPSD = 64 - 2;
3831 #else
3832       ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
3833 #endif
3834
3835       if (prog_data->base.nr_params > 0 ||
3836           prog_data->base.ubo_ranges[0].length > 0)
3837          ps.PushConstantEnable = true;
3838
3839 #if GEN_GEN < 8
3840       /* From the IVB PRM, volume 2 part 1, page 287:
3841        * "This bit is inserted in the PS payload header and made available to
3842        * the DataPort (either via the message header or via header bypass) to
3843        * indicate that oMask data (one or two phases) is included in Render
3844        * Target Write messages. If present, the oMask data is used to mask off
3845        * samples."
3846        */
3847       ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
3848
3849       /* The hardware wedges if you have this bit set but don't turn on any
3850        * dual source blend factors.
3851        *
3852        * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
3853        */
3854       ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
3855                                  (ctx->Color.BlendEnabled & 1) &&
3856                                  ctx->Color.Blend[0]._UsesDualSrc;
3857
3858       /* BRW_NEW_FS_PROG_DATA */
3859       ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
3860 #endif
3861
3862       /* From the documentation for this packet:
3863        * "If the PS kernel does not need the Position XY Offsets to
3864        *  compute a Position Value, then this field should be programmed
3865        *  to POSOFFSET_NONE."
3866        *
3867        * "SW Recommendation: If the PS kernel needs the Position Offsets
3868        *  to compute a Position XY value, this field should match Position
3869        *  ZW Interpolation Mode to ensure a consistent position.xyzw
3870        *  computation."
3871        *
3872        * We only require XY sample offsets. So, this recommendation doesn't
3873        * look useful at the moment. We might need this in future.
3874        */
3875       if (prog_data->uses_pos_offset)
3876          ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
3877       else
3878          ps.PositionXYOffsetSelect = POSOFFSET_NONE;
3879
3880       ps._8PixelDispatchEnable = prog_data->dispatch_8;
3881       ps._16PixelDispatchEnable = prog_data->dispatch_16;
3882       ps.DispatchGRFStartRegisterForConstantSetupData0 =
3883          prog_data->base.dispatch_grf_start_reg;
3884       ps.DispatchGRFStartRegisterForConstantSetupData2 =
3885          prog_data->dispatch_grf_start_reg_2;
3886
3887       ps.KernelStartPointer0 = stage_state->prog_offset;
3888       ps.KernelStartPointer2 = stage_state->prog_offset +
3889          prog_data->prog_offset_2;
3890
3891       if (prog_data->base.total_scratch) {
3892          ps.ScratchSpaceBasePointer =
3893             rw_bo(stage_state->scratch_bo,
3894                   ffs(stage_state->per_thread_scratch) - 11);
3895       }
3896    }
3897 }
3898
3899 static const struct brw_tracked_state genX(ps_state) = {
3900    .dirty = {
3901       .mesa  = _NEW_MULTISAMPLE |
3902                (GEN_GEN < 8 ? _NEW_BUFFERS |
3903                               _NEW_COLOR
3904                             : 0),
3905       .brw   = BRW_NEW_BATCH |
3906                BRW_NEW_BLORP |
3907                BRW_NEW_FS_PROG_DATA,
3908    },
3909    .emit = genX(upload_ps),
3910 };
3911 #endif
3912
3913 /* ---------------------------------------------------------------------- */
3914
3915 #if GEN_GEN >= 7
3916 static void
3917 genX(upload_hs_state)(struct brw_context *brw)
3918 {
3919    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3920    struct brw_stage_state *stage_state = &brw->tcs.base;
3921    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
3922    const struct brw_vue_prog_data *vue_prog_data =
3923       brw_vue_prog_data(stage_prog_data);
3924
3925    /* BRW_NEW_TES_PROG_DATA */
3926    struct brw_tcs_prog_data *tcs_prog_data =
3927       brw_tcs_prog_data(stage_prog_data);
3928
3929    if (!tcs_prog_data) {
3930       brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
3931    } else {
3932       brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
3933          INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
3934
3935          hs.InstanceCount = tcs_prog_data->instances - 1;
3936          hs.IncludeVertexHandles = true;
3937
3938          hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
3939       }
3940    }
3941 }
3942
3943 static const struct brw_tracked_state genX(hs_state) = {
3944    .dirty = {
3945       .mesa  = 0,
3946       .brw   = BRW_NEW_BATCH |
3947                BRW_NEW_BLORP |
3948                BRW_NEW_TCS_PROG_DATA |
3949                BRW_NEW_TESS_PROGRAMS,
3950    },
3951    .emit = genX(upload_hs_state),
3952 };
3953
3954 static void
3955 genX(upload_ds_state)(struct brw_context *brw)
3956 {
3957    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3958    const struct brw_stage_state *stage_state = &brw->tes.base;
3959    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
3960
3961    /* BRW_NEW_TES_PROG_DATA */
3962    const struct brw_tes_prog_data *tes_prog_data =
3963       brw_tes_prog_data(stage_prog_data);
3964    const struct brw_vue_prog_data *vue_prog_data =
3965       brw_vue_prog_data(stage_prog_data);
3966
3967    if (!tes_prog_data) {
3968       brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
3969    } else {
3970       brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
3971          INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
3972
3973         ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
3974         ds.ComputeWCoordinateEnable =
3975            tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
3976
3977 #if GEN_GEN >= 8
3978         if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
3979            ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
3980         ds.UserClipDistanceCullTestEnableBitmask =
3981             vue_prog_data->cull_distance_mask;
3982 #endif
3983       }
3984    }
3985 }
3986
3987 static const struct brw_tracked_state genX(ds_state) = {
3988    .dirty = {
3989       .mesa  = 0,
3990       .brw   = BRW_NEW_BATCH |
3991                BRW_NEW_BLORP |
3992                BRW_NEW_TESS_PROGRAMS |
3993                BRW_NEW_TES_PROG_DATA,
3994    },
3995    .emit = genX(upload_ds_state),
3996 };
3997
3998 /* ---------------------------------------------------------------------- */
3999
4000 static void
4001 upload_te_state(struct brw_context *brw)
4002 {
4003    /* BRW_NEW_TESS_PROGRAMS */
4004    bool active = brw->programs[MESA_SHADER_TESS_EVAL];
4005
4006    /* BRW_NEW_TES_PROG_DATA */
4007    const struct brw_tes_prog_data *tes_prog_data =
4008       brw_tes_prog_data(brw->tes.base.prog_data);
4009
4010    if (active) {
4011       brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4012          te.Partitioning = tes_prog_data->partitioning;
4013          te.OutputTopology = tes_prog_data->output_topology;
4014          te.TEDomain = tes_prog_data->domain;
4015          te.TEEnable = true;
4016          te.MaximumTessellationFactorOdd = 63.0;
4017          te.MaximumTessellationFactorNotOdd = 64.0;
4018       }
4019    } else {
4020       brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4021    }
4022 }
4023
4024 static const struct brw_tracked_state genX(te_state) = {
4025    .dirty = {
4026       .mesa  = 0,
4027       .brw   = BRW_NEW_BLORP |
4028                BRW_NEW_CONTEXT |
4029                BRW_NEW_TES_PROG_DATA |
4030                BRW_NEW_TESS_PROGRAMS,
4031    },
4032    .emit = upload_te_state,
4033 };
4034
4035 /* ---------------------------------------------------------------------- */
4036
4037 static void
4038 genX(upload_tes_push_constants)(struct brw_context *brw)
4039 {
4040    struct brw_stage_state *stage_state = &brw->tes.base;
4041    /* BRW_NEW_TESS_PROGRAMS */
4042    const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4043
4044    /* BRW_NEW_TES_PROG_DATA */
4045    const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4046    gen6_upload_push_constants(brw, tep, prog_data, stage_state);
4047 }
4048
4049 static const struct brw_tracked_state genX(tes_push_constants) = {
4050    .dirty = {
4051       .mesa  = _NEW_PROGRAM_CONSTANTS,
4052       .brw   = BRW_NEW_BATCH |
4053                BRW_NEW_BLORP |
4054                BRW_NEW_TESS_PROGRAMS |
4055                BRW_NEW_TES_PROG_DATA,
4056    },
4057    .emit = genX(upload_tes_push_constants),
4058 };
4059
4060 static void
4061 genX(upload_tcs_push_constants)(struct brw_context *brw)
4062 {
4063    struct brw_stage_state *stage_state = &brw->tcs.base;
4064    /* BRW_NEW_TESS_PROGRAMS */
4065    const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4066
4067    /* BRW_NEW_TCS_PROG_DATA */
4068    const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4069
4070    gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
4071 }
4072
4073 static const struct brw_tracked_state genX(tcs_push_constants) = {
4074    .dirty = {
4075       .mesa  = _NEW_PROGRAM_CONSTANTS,
4076       .brw   = BRW_NEW_BATCH |
4077                BRW_NEW_BLORP |
4078                BRW_NEW_DEFAULT_TESS_LEVELS |
4079                BRW_NEW_TESS_PROGRAMS |
4080                BRW_NEW_TCS_PROG_DATA,
4081    },
4082    .emit = genX(upload_tcs_push_constants),
4083 };
4084
4085 #endif
4086
4087 /* ---------------------------------------------------------------------- */
4088
4089 #if GEN_GEN >= 7
4090 static void
4091 genX(upload_cs_push_constants)(struct brw_context *brw)
4092 {
4093    struct brw_stage_state *stage_state = &brw->cs.base;
4094
4095    /* BRW_NEW_COMPUTE_PROGRAM */
4096    const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4097
4098    if (cp) {
4099       /* BRW_NEW_CS_PROG_DATA */
4100       struct brw_cs_prog_data *cs_prog_data =
4101          brw_cs_prog_data(brw->cs.base.prog_data);
4102
4103       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4104       brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4105    }
4106 }
4107
4108 const struct brw_tracked_state genX(cs_push_constants) = {
4109    .dirty = {
4110       .mesa = _NEW_PROGRAM_CONSTANTS,
4111       .brw = BRW_NEW_BATCH |
4112              BRW_NEW_BLORP |
4113              BRW_NEW_COMPUTE_PROGRAM |
4114              BRW_NEW_CS_PROG_DATA,
4115    },
4116    .emit = genX(upload_cs_push_constants),
4117 };
4118
4119 /**
4120  * Creates a new CS constant buffer reflecting the current CS program's
4121  * constants, if needed by the CS program.
4122  */
4123 static void
4124 genX(upload_cs_pull_constants)(struct brw_context *brw)
4125 {
4126    struct brw_stage_state *stage_state = &brw->cs.base;
4127
4128    /* BRW_NEW_COMPUTE_PROGRAM */
4129    struct brw_program *cp =
4130       (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4131
4132    /* BRW_NEW_CS_PROG_DATA */
4133    const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4134
4135    _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4136    /* _NEW_PROGRAM_CONSTANTS */
4137    brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4138                              stage_state, prog_data);
4139 }
4140
4141 const struct brw_tracked_state genX(cs_pull_constants) = {
4142    .dirty = {
4143       .mesa = _NEW_PROGRAM_CONSTANTS,
4144       .brw = BRW_NEW_BATCH |
4145              BRW_NEW_BLORP |
4146              BRW_NEW_COMPUTE_PROGRAM |
4147              BRW_NEW_CS_PROG_DATA,
4148    },
4149    .emit = genX(upload_cs_pull_constants),
4150 };
4151
4152 static void
4153 genX(upload_cs_state)(struct brw_context *brw)
4154 {
4155    if (!brw->cs.base.prog_data)
4156       return;
4157
4158    uint32_t offset;
4159    uint32_t *desc = (uint32_t*) brw_state_batch(
4160       brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
4161       &offset);
4162
4163    struct brw_stage_state *stage_state = &brw->cs.base;
4164    struct brw_stage_prog_data *prog_data = stage_state->prog_data;
4165    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
4166    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4167
4168    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4169       brw_emit_buffer_surface_state(
4170          brw, &stage_state->surf_offset[
4171                  prog_data->binding_table.shader_time_start],
4172          brw->shader_time.bo, 0, ISL_FORMAT_RAW,
4173          brw->shader_time.bo->size, 1,
4174          RELOC_WRITE);
4175    }
4176
4177    uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
4178                                     32, &stage_state->bind_bo_offset);
4179
4180    /* The MEDIA_VFE_STATE documentation for Gen8+ says:
4181     *
4182     * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4183     *  the only bits that are changed are scoreboard related: Scoreboard
4184     *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4185     *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
4186     *
4187     * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
4188     * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
4189     */
4190    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
4191
4192    brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
4193       if (prog_data->total_scratch) {
4194          uint32_t per_thread_scratch_value;
4195
4196          if (GEN_GEN >= 8) {
4197             /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
4198              * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
4199              */
4200             per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
4201          } else if (GEN_IS_HASWELL) {
4202             /* Haswell's Per Thread Scratch Space is in the range [0, 10]
4203              * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
4204              */
4205             per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
4206          } else {
4207             /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
4208              * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
4209              */
4210             per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
4211          }
4212          vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
4213          vfe.PerThreadScratchSpace = per_thread_scratch_value;
4214       }
4215
4216       /* If brw->screen->subslice_total is greater than one, then
4217        * devinfo->max_cs_threads stores number of threads per sub-slice;
4218        * thus we need to multiply by that number by subslices to get
4219        * the actual maximum number of threads; the -1 is because the HW
4220        * has a bias of 1 (would not make sense to say the maximum number
4221        * of threads is 0).
4222        */
4223       const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
4224       vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
4225       vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
4226 #if GEN_GEN < 11
4227       vfe.ResetGatewayTimer =
4228          Resettingrelativetimerandlatchingtheglobaltimestamp;
4229 #endif
4230 #if GEN_GEN < 9
4231       vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
4232 #endif
4233 #if GEN_GEN == 7
4234       vfe.GPGPUMode = 1;
4235 #endif
4236
4237       /* We are uploading duplicated copies of push constant uniforms for each
4238        * thread. Although the local id data needs to vary per thread, it won't
4239        * change for other uniform data. Unfortunately this duplication is
4240        * required for gen7. As of Haswell, this duplication can be avoided,
4241        * but this older mechanism with duplicated data continues to work.
4242        *
4243        * FINISHME: As of Haswell, we could make use of the
4244        * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
4245        * field to only store one copy of uniform data.
4246        *
4247        * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
4248        * which is described in the GPGPU_WALKER command and in the Broadwell
4249        * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
4250        * Operations => GPGPU Mode => Indirect Payload Storage.
4251        *
4252        * Note: The constant data is built in brw_upload_cs_push_constants
4253        * below.
4254        */
4255       vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
4256
4257       const uint32_t vfe_curbe_allocation =
4258          ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
4259                cs_prog_data->push.cross_thread.regs, 2);
4260       vfe.CURBEAllocationSize = vfe_curbe_allocation;
4261    }
4262
4263    if (cs_prog_data->push.total.size > 0) {
4264       brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
4265          curbe.CURBETotalDataLength =
4266             ALIGN(cs_prog_data->push.total.size, 64);
4267          curbe.CURBEDataStartAddress = stage_state->push_const_offset;
4268       }
4269    }
4270
4271    /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
4272    memcpy(bind, stage_state->surf_offset,
4273           prog_data->binding_table.size_bytes);
4274    const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
4275       .KernelStartPointer = brw->cs.base.prog_offset,
4276       .SamplerStatePointer = stage_state->sampler_offset,
4277       .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
4278       .BindingTablePointer = stage_state->bind_bo_offset,
4279       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
4280       .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
4281       .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
4282                                                prog_data->total_shared),
4283       .BarrierEnable = cs_prog_data->uses_barrier,
4284 #if GEN_GEN >= 8 || GEN_IS_HASWELL
4285       .CrossThreadConstantDataReadLength =
4286          cs_prog_data->push.cross_thread.regs,
4287 #endif
4288    };
4289
4290    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
4291
4292    brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
4293       load.InterfaceDescriptorTotalLength =
4294          GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4295       load.InterfaceDescriptorDataStartAddress = offset;
4296    }
4297 }
4298
4299 static const struct brw_tracked_state genX(cs_state) = {
4300    .dirty = {
4301       .mesa = _NEW_PROGRAM_CONSTANTS,
4302       .brw = BRW_NEW_BATCH |
4303              BRW_NEW_BLORP |
4304              BRW_NEW_CS_PROG_DATA |
4305              BRW_NEW_SAMPLER_STATE_TABLE |
4306              BRW_NEW_SURFACES,
4307    },
4308    .emit = genX(upload_cs_state)
4309 };
4310
4311 #endif
4312
4313 /* ---------------------------------------------------------------------- */
4314
4315 #if GEN_GEN >= 8
4316 static void
4317 genX(upload_raster)(struct brw_context *brw)
4318 {
4319    const struct gl_context *ctx = &brw->ctx;
4320
4321    /* _NEW_BUFFERS */
4322    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
4323
4324    /* _NEW_POLYGON */
4325    const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4326
4327    /* _NEW_POINT */
4328    const struct gl_point_attrib *point = &ctx->Point;
4329
4330    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4331       if (brw->polygon_front_bit == render_to_fbo)
4332          raster.FrontWinding = CounterClockwise;
4333
4334       if (polygon->CullFlag) {
4335          switch (polygon->CullFaceMode) {
4336          case GL_FRONT:
4337             raster.CullMode = CULLMODE_FRONT;
4338             break;
4339          case GL_BACK:
4340             raster.CullMode = CULLMODE_BACK;
4341             break;
4342          case GL_FRONT_AND_BACK:
4343             raster.CullMode = CULLMODE_BOTH;
4344             break;
4345          default:
4346             unreachable("not reached");
4347          }
4348       } else {
4349          raster.CullMode = CULLMODE_NONE;
4350       }
4351
4352       raster.SmoothPointEnable = point->SmoothFlag;
4353
4354       raster.DXMultisampleRasterizationEnable =
4355          _mesa_is_multisample_enabled(ctx);
4356
4357       raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4358       raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4359       raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4360
4361       switch (polygon->FrontMode) {
4362       case GL_FILL:
4363          raster.FrontFaceFillMode = FILL_MODE_SOLID;
4364          break;
4365       case GL_LINE:
4366          raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4367          break;
4368       case GL_POINT:
4369          raster.FrontFaceFillMode = FILL_MODE_POINT;
4370          break;
4371       default:
4372          unreachable("not reached");
4373       }
4374
4375       switch (polygon->BackMode) {
4376       case GL_FILL:
4377          raster.BackFaceFillMode = FILL_MODE_SOLID;
4378          break;
4379       case GL_LINE:
4380          raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4381          break;
4382       case GL_POINT:
4383          raster.BackFaceFillMode = FILL_MODE_POINT;
4384          break;
4385       default:
4386          unreachable("not reached");
4387       }
4388
4389       /* _NEW_LINE */
4390       raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4391
4392 #if GEN_GEN == 10
4393       /* _NEW_BUFFERS
4394        * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4395        */
4396       const bool multisampled_fbo =
4397          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4398       if (multisampled_fbo)
4399          raster.AntialiasingEnable = false;
4400 #endif
4401
4402       /* _NEW_SCISSOR */
4403       raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4404
4405       /* _NEW_TRANSFORM */
4406       if (!ctx->Transform.DepthClamp) {
4407 #if GEN_GEN >= 9
4408          raster.ViewportZFarClipTestEnable = true;
4409          raster.ViewportZNearClipTestEnable = true;
4410 #else
4411          raster.ViewportZClipTestEnable = true;
4412 #endif
4413       }
4414
4415       /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4416 #if GEN_GEN >= 9
4417       raster.ConservativeRasterizationEnable =
4418          ctx->IntelConservativeRasterization;
4419 #endif
4420
4421       raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4422       raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4423
4424       raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4425    }
4426 }
4427
4428 static const struct brw_tracked_state genX(raster_state) = {
4429    .dirty = {
4430       .mesa  = _NEW_BUFFERS |
4431                _NEW_LINE |
4432                _NEW_MULTISAMPLE |
4433                _NEW_POINT |
4434                _NEW_POLYGON |
4435                _NEW_SCISSOR |
4436                _NEW_TRANSFORM,
4437       .brw   = BRW_NEW_BLORP |
4438                BRW_NEW_CONTEXT |
4439                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4440    },
4441    .emit = genX(upload_raster),
4442 };
4443 #endif
4444
4445 /* ---------------------------------------------------------------------- */
4446
4447 #if GEN_GEN >= 8
4448 static void
4449 genX(upload_ps_extra)(struct brw_context *brw)
4450 {
4451    UNUSED struct gl_context *ctx = &brw->ctx;
4452
4453    const struct brw_wm_prog_data *prog_data =
4454       brw_wm_prog_data(brw->wm.base.prog_data);
4455
4456    brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
4457       psx.PixelShaderValid = true;
4458       psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4459       psx.PixelShaderKillsPixel = prog_data->uses_kill;
4460       psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4461       psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4462       psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4463       psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4464
4465       /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4466       if (prog_data->uses_sample_mask) {
4467 #if GEN_GEN >= 9
4468          if (prog_data->post_depth_coverage)
4469             psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4470          else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4471             psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4472          else
4473             psx.InputCoverageMaskState = ICMS_NORMAL;
4474 #else
4475          psx.PixelShaderUsesInputCoverageMask = true;
4476 #endif
4477       }
4478
4479       psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4480 #if GEN_GEN >= 9
4481       psx.PixelShaderPullsBary = prog_data->pulls_bary;
4482       psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4483 #endif
4484
4485       /* The stricter cross-primitive coherency guarantees that the hardware
4486        * gives us with the "Accesses UAV" bit set for at least one shader stage
4487        * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4488        * are redundant within the current image, atomic counter and SSBO GL
4489        * APIs, which all have very loose ordering and coherency requirements
4490        * and generally rely on the application to insert explicit barriers when
4491        * a shader invocation is expected to see the memory writes performed by
4492        * the invocations of some previous primitive.  Regardless of the value
4493        * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4494        * cause an in most cases useless DC flush when the lowermost stage with
4495        * the bit set finishes execution.
4496        *
4497        * It would be nice to disable it, but in some cases we can't because on
4498        * Gen8+ it also has an influence on rasterization via the PS UAV-only
4499        * signal (which could be set independently from the coherency mechanism
4500        * in the 3DSTATE_WM command on Gen7), and because in some cases it will
4501        * determine whether the hardware skips execution of the fragment shader
4502        * or not via the ThreadDispatchEnable signal.  However if we know that
4503        * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4504        * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4505        * difference so we may just disable it here.
4506        *
4507        * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4508        * take into account KillPixels when no depth or stencil writes are
4509        * enabled.  In order for occlusion queries to work correctly with no
4510        * attachments, we need to force-enable here.
4511        *
4512        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4513        * _NEW_COLOR
4514        */
4515       if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4516           !brw_color_buffer_write_enabled(brw))
4517          psx.PixelShaderHasUAV = true;
4518    }
4519 }
4520
4521 const struct brw_tracked_state genX(ps_extra) = {
4522    .dirty = {
4523       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4524       .brw   = BRW_NEW_BLORP |
4525                BRW_NEW_CONTEXT |
4526                BRW_NEW_FRAGMENT_PROGRAM |
4527                BRW_NEW_FS_PROG_DATA |
4528                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4529    },
4530    .emit = genX(upload_ps_extra),
4531 };
4532 #endif
4533
4534 /* ---------------------------------------------------------------------- */
4535
4536 #if GEN_GEN >= 8
4537 static void
4538 genX(upload_ps_blend)(struct brw_context *brw)
4539 {
4540    struct gl_context *ctx = &brw->ctx;
4541
4542    /* _NEW_BUFFERS */
4543    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4544    const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4545
4546    /* _NEW_COLOR */
4547    struct gl_colorbuffer_attrib *color = &ctx->Color;
4548
4549    brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4550       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4551       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4552
4553       bool alpha_to_one = false;
4554
4555       if (!buffer0_is_integer) {
4556          /* _NEW_MULTISAMPLE */
4557
4558          if (_mesa_is_multisample_enabled(ctx)) {
4559             pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4560             alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4561          }
4562
4563          pb.AlphaTestEnable = color->AlphaEnabled;
4564       }
4565
4566       /* Used for implementing the following bit of GL_EXT_texture_integer:
4567        * "Per-fragment operations that require floating-point color
4568        *  components, including multisample alpha operations, alpha test,
4569        *  blending, and dithering, have no effect when the corresponding
4570        *  colors are written to an integer color buffer."
4571        *
4572        * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4573        * "If drawbuffer zero is not NONE and the buffer it references has an
4574        *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4575        *  operations are skipped."
4576        */
4577       if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4578          GLenum eqRGB = color->Blend[0].EquationRGB;
4579          GLenum eqA = color->Blend[0].EquationA;
4580          GLenum srcRGB = color->Blend[0].SrcRGB;
4581          GLenum dstRGB = color->Blend[0].DstRGB;
4582          GLenum srcA = color->Blend[0].SrcA;
4583          GLenum dstA = color->Blend[0].DstA;
4584
4585          if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4586             srcRGB = dstRGB = GL_ONE;
4587
4588          if (eqA == GL_MIN || eqA == GL_MAX)
4589             srcA = dstA = GL_ONE;
4590
4591          /* Due to hardware limitations, the destination may have information
4592           * in an alpha channel even when the format specifies no alpha
4593           * channel. In order to avoid getting any incorrect blending due to
4594           * that alpha channel, coerce the blend factors to values that will
4595           * not read the alpha channel, but will instead use the correct
4596           * implicit value for alpha.
4597           */
4598          if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4599                                             GL_TEXTURE_ALPHA_TYPE)) {
4600             srcRGB = brw_fix_xRGB_alpha(srcRGB);
4601             srcA = brw_fix_xRGB_alpha(srcA);
4602             dstRGB = brw_fix_xRGB_alpha(dstRGB);
4603             dstA = brw_fix_xRGB_alpha(dstA);
4604          }
4605
4606          /* Alpha to One doesn't work with Dual Color Blending.  Override
4607           * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4608           */
4609          if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
4610             srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4611             srcA = fix_dual_blend_alpha_to_one(srcA);
4612             dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4613             dstA = fix_dual_blend_alpha_to_one(dstA);
4614          }
4615
4616          pb.ColorBufferBlendEnable = true;
4617          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
4618          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
4619          pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
4620          pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
4621
4622          pb.IndependentAlphaBlendEnable =
4623             srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
4624       }
4625    }
4626 }
4627
4628 static const struct brw_tracked_state genX(ps_blend) = {
4629    .dirty = {
4630       .mesa = _NEW_BUFFERS |
4631               _NEW_COLOR |
4632               _NEW_MULTISAMPLE,
4633       .brw = BRW_NEW_BLORP |
4634              BRW_NEW_CONTEXT |
4635              BRW_NEW_FRAGMENT_PROGRAM,
4636    },
4637    .emit = genX(upload_ps_blend)
4638 };
4639 #endif
4640
4641 /* ---------------------------------------------------------------------- */
4642
4643 #if GEN_GEN >= 8
4644 static void
4645 genX(emit_vf_topology)(struct brw_context *brw)
4646 {
4647    brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
4648       vftopo.PrimitiveTopologyType = brw->primitive;
4649    }
4650 }
4651
4652 static const struct brw_tracked_state genX(vf_topology) = {
4653    .dirty = {
4654       .mesa = 0,
4655       .brw = BRW_NEW_BLORP |
4656              BRW_NEW_PRIMITIVE,
4657    },
4658    .emit = genX(emit_vf_topology),
4659 };
4660 #endif
4661
4662 /* ---------------------------------------------------------------------- */
4663
4664 #if GEN_GEN >= 7
4665 static void
4666 genX(emit_mi_report_perf_count)(struct brw_context *brw,
4667                                 struct brw_bo *bo,
4668                                 uint32_t offset_in_bytes,
4669                                 uint32_t report_id)
4670 {
4671    brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4672       mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
4673       mi_rpc.ReportID = report_id;
4674    }
4675 }
4676 #endif
4677
4678 /* ---------------------------------------------------------------------- */
4679
4680 /**
4681  * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
4682  */
4683 static void
4684 genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
4685                                      struct brw_stage_state *stage_state)
4686 {
4687 #if GEN_GEN >= 7
4688    static const uint16_t packet_headers[] = {
4689       [MESA_SHADER_VERTEX] = 43,
4690       [MESA_SHADER_TESS_CTRL] = 44,
4691       [MESA_SHADER_TESS_EVAL] = 45,
4692       [MESA_SHADER_GEOMETRY] = 46,
4693       [MESA_SHADER_FRAGMENT] = 47,
4694    };
4695
4696    /* Ivybridge requires a workaround flush before VS packets. */
4697    if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
4698        stage_state->stage == MESA_SHADER_VERTEX) {
4699       gen7_emit_vs_workaround_flush(brw);
4700    }
4701
4702    brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
4703       ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
4704       ptr.PointertoVSSamplerState = stage_state->sampler_offset;
4705    }
4706 #endif
4707 }
4708
4709 UNUSED static bool
4710 has_component(mesa_format format, int i)
4711 {
4712    if (_mesa_is_format_color_format(format))
4713       return _mesa_format_has_color_component(format, i);
4714
4715    /* depth and stencil have only one component */
4716    return i == 0;
4717 }
4718
4719 /**
4720  * Upload SAMPLER_BORDER_COLOR_STATE.
4721  */
4722 static void
4723 genX(upload_default_color)(struct brw_context *brw,
4724                            const struct gl_sampler_object *sampler,
4725                            mesa_format format, GLenum base_format,
4726                            bool is_integer_format, bool is_stencil_sampling,
4727                            uint32_t *sdc_offset)
4728 {
4729    union gl_color_union color;
4730
4731    switch (base_format) {
4732    case GL_DEPTH_COMPONENT:
4733       /* GL specs that border color for depth textures is taken from the
4734        * R channel, while the hardware uses A.  Spam R into all the
4735        * channels for safety.
4736        */
4737       color.ui[0] = sampler->BorderColor.ui[0];
4738       color.ui[1] = sampler->BorderColor.ui[0];
4739       color.ui[2] = sampler->BorderColor.ui[0];
4740       color.ui[3] = sampler->BorderColor.ui[0];
4741       break;
4742    case GL_ALPHA:
4743       color.ui[0] = 0u;
4744       color.ui[1] = 0u;
4745       color.ui[2] = 0u;
4746       color.ui[3] = sampler->BorderColor.ui[3];
4747       break;
4748    case GL_INTENSITY:
4749       color.ui[0] = sampler->BorderColor.ui[0];
4750       color.ui[1] = sampler->BorderColor.ui[0];
4751       color.ui[2] = sampler->BorderColor.ui[0];
4752       color.ui[3] = sampler->BorderColor.ui[0];
4753       break;
4754    case GL_LUMINANCE:
4755       color.ui[0] = sampler->BorderColor.ui[0];
4756       color.ui[1] = sampler->BorderColor.ui[0];
4757       color.ui[2] = sampler->BorderColor.ui[0];
4758       color.ui[3] = float_as_int(1.0);
4759       break;
4760    case GL_LUMINANCE_ALPHA:
4761       color.ui[0] = sampler->BorderColor.ui[0];
4762       color.ui[1] = sampler->BorderColor.ui[0];
4763       color.ui[2] = sampler->BorderColor.ui[0];
4764       color.ui[3] = sampler->BorderColor.ui[3];
4765       break;
4766    default:
4767       color.ui[0] = sampler->BorderColor.ui[0];
4768       color.ui[1] = sampler->BorderColor.ui[1];
4769       color.ui[2] = sampler->BorderColor.ui[2];
4770       color.ui[3] = sampler->BorderColor.ui[3];
4771       break;
4772    }
4773
4774    /* In some cases we use an RGBA surface format for GL RGB textures,
4775     * where we've initialized the A channel to 1.0.  We also have to set
4776     * the border color alpha to 1.0 in that case.
4777     */
4778    if (base_format == GL_RGB)
4779       color.ui[3] = float_as_int(1.0);
4780
4781    int alignment = 32;
4782    if (GEN_GEN >= 8) {
4783       alignment = 64;
4784    } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
4785       alignment = 512;
4786    }
4787
4788    uint32_t *sdc = brw_state_batch(
4789       brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
4790       alignment, sdc_offset);
4791
4792    struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
4793
4794 #define ASSIGN(dst, src) \
4795    do {                  \
4796       dst = src;         \
4797    } while (0)
4798
4799 #define ASSIGNu16(dst, src) \
4800    do {                     \
4801       dst = (uint16_t)src;  \
4802    } while (0)
4803
4804 #define ASSIGNu8(dst, src) \
4805    do {                    \
4806       dst = (uint8_t)src;  \
4807    } while (0)
4808
4809 #define BORDER_COLOR_ATTR(macro, _color_type, src)              \
4810    macro(state.BorderColor ## _color_type ## Red, src[0]);   \
4811    macro(state.BorderColor ## _color_type ## Green, src[1]);   \
4812    macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
4813    macro(state.BorderColor ## _color_type ## Alpha, src[3]);
4814
4815 #if GEN_GEN >= 8
4816    /* On Broadwell, the border color is represented as four 32-bit floats,
4817     * integers, or unsigned values, interpreted according to the surface
4818     * format.  This matches the sampler->BorderColor union exactly; just
4819     * memcpy the values.
4820     */
4821    BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
4822 #elif GEN_IS_HASWELL
4823    if (is_integer_format || is_stencil_sampling) {
4824       bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
4825       const int bits_per_channel =
4826          _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
4827
4828       /* From the Haswell PRM, "Command Reference: Structures", Page 36:
4829        * "If any color channel is missing from the surface format,
4830        *  corresponding border color should be programmed as zero and if
4831        *  alpha channel is missing, corresponding Alpha border color should
4832        *  be programmed as 1."
4833        */
4834       unsigned c[4] = { 0, 0, 0, 1 };
4835       for (int i = 0; i < 4; i++) {
4836          if (has_component(format, i))
4837             c[i] = color.ui[i];
4838       }
4839
4840       switch (bits_per_channel) {
4841       case 8:
4842          /* Copy RGBA in order. */
4843          BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
4844          break;
4845       case 10:
4846          /* R10G10B10A2_UINT is treated like a 16-bit format. */
4847       case 16:
4848          BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
4849          break;
4850       case 32:
4851          if (base_format == GL_RG) {
4852             /* Careful inspection of the tables reveals that for RG32 formats,
4853              * the green channel needs to go where blue normally belongs.
4854              */
4855             state.BorderColor32bitRed = c[0];
4856             state.BorderColor32bitBlue = c[1];
4857             state.BorderColor32bitAlpha = 1;
4858          } else {
4859             /* Copy RGBA in order. */
4860             BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
4861          }
4862          break;
4863       default:
4864          assert(!"Invalid number of bits per channel in integer format.");
4865          break;
4866       }
4867    } else {
4868       BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4869    }
4870 #elif GEN_GEN == 5 || GEN_GEN == 6
4871    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
4872    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
4873    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
4874
4875 #define MESA_FLOAT_TO_HALF(dst, src) \
4876    dst = _mesa_float_to_half(src);
4877
4878    BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
4879
4880 #undef MESA_FLOAT_TO_HALF
4881
4882    state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
4883    state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
4884    state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
4885    state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
4886
4887    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4888 #elif GEN_GEN == 4
4889    BORDER_COLOR_ATTR(ASSIGN, , color.f);
4890 #else
4891    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4892 #endif
4893
4894 #undef ASSIGN
4895 #undef BORDER_COLOR_ATTR
4896
4897    GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
4898 }
4899
4900 static uint32_t
4901 translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
4902 {
4903    switch (wrap) {
4904    case GL_REPEAT:
4905       return TCM_WRAP;
4906    case GL_CLAMP:
4907 #if GEN_GEN >= 8
4908       /* GL_CLAMP is the weird mode where coordinates are clamped to
4909        * [0.0, 1.0], so linear filtering of coordinates outside of
4910        * [0.0, 1.0] give you half edge texel value and half border
4911        * color.
4912        *
4913        * Gen8+ supports this natively.
4914        */
4915       return TCM_HALF_BORDER;
4916 #else
4917       /* On Gen4-7.5, we clamp the coordinates in the fragment shader
4918        * and set clamp_border here, which gets the result desired.
4919        * We just use clamp(_to_edge) for nearest, because for nearest
4920        * clamping to 1.0 gives border color instead of the desired
4921        * edge texels.
4922        */
4923       if (using_nearest)
4924          return TCM_CLAMP;
4925       else
4926          return TCM_CLAMP_BORDER;
4927 #endif
4928    case GL_CLAMP_TO_EDGE:
4929       return TCM_CLAMP;
4930    case GL_CLAMP_TO_BORDER:
4931       return TCM_CLAMP_BORDER;
4932    case GL_MIRRORED_REPEAT:
4933       return TCM_MIRROR;
4934    case GL_MIRROR_CLAMP_TO_EDGE:
4935       return TCM_MIRROR_ONCE;
4936    default:
4937       return TCM_WRAP;
4938    }
4939 }
4940
4941 /**
4942  * Return true if the given wrap mode requires the border color to exist.
4943  */
4944 static bool
4945 wrap_mode_needs_border_color(unsigned wrap_mode)
4946 {
4947 #if GEN_GEN >= 8
4948    return wrap_mode == TCM_CLAMP_BORDER ||
4949           wrap_mode == TCM_HALF_BORDER;
4950 #else
4951    return wrap_mode == TCM_CLAMP_BORDER;
4952 #endif
4953 }
4954
4955 /**
4956  * Sets the sampler state for a single unit based off of the sampler key
4957  * entry.
4958  */
4959 static void
4960 genX(update_sampler_state)(struct brw_context *brw,
4961                            GLenum target, bool tex_cube_map_seamless,
4962                            GLfloat tex_unit_lod_bias,
4963                            mesa_format format, GLenum base_format,
4964                            const struct gl_texture_object *texObj,
4965                            const struct gl_sampler_object *sampler,
4966                            uint32_t *sampler_state,
4967                            uint32_t batch_offset_for_sampler_state)
4968 {
4969    struct GENX(SAMPLER_STATE) samp_st = { 0 };
4970
4971    /* Select min and mip filters. */
4972    switch (sampler->MinFilter) {
4973    case GL_NEAREST:
4974       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4975       samp_st.MipModeFilter = MIPFILTER_NONE;
4976       break;
4977    case GL_LINEAR:
4978       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4979       samp_st.MipModeFilter = MIPFILTER_NONE;
4980       break;
4981    case GL_NEAREST_MIPMAP_NEAREST:
4982       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4983       samp_st.MipModeFilter = MIPFILTER_NEAREST;
4984       break;
4985    case GL_LINEAR_MIPMAP_NEAREST:
4986       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4987       samp_st.MipModeFilter = MIPFILTER_NEAREST;
4988       break;
4989    case GL_NEAREST_MIPMAP_LINEAR:
4990       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4991       samp_st.MipModeFilter = MIPFILTER_LINEAR;
4992       break;
4993    case GL_LINEAR_MIPMAP_LINEAR:
4994       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4995       samp_st.MipModeFilter = MIPFILTER_LINEAR;
4996       break;
4997    default:
4998       unreachable("not reached");
4999    }
5000
5001    /* Select mag filter. */
5002    samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
5003       MAPFILTER_LINEAR : MAPFILTER_NEAREST;
5004
5005    /* Enable anisotropic filtering if desired. */
5006    samp_st.MaximumAnisotropy = RATIO21;
5007
5008    if (sampler->MaxAnisotropy > 1.0f) {
5009       if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
5010          samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
5011       if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
5012          samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
5013
5014       if (sampler->MaxAnisotropy > 2.0f) {
5015          samp_st.MaximumAnisotropy =
5016             MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
5017       }
5018    }
5019
5020    /* Set address rounding bits if not using nearest filtering. */
5021    if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
5022       samp_st.UAddressMinFilterRoundingEnable = true;
5023       samp_st.VAddressMinFilterRoundingEnable = true;
5024       samp_st.RAddressMinFilterRoundingEnable = true;
5025    }
5026
5027    if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
5028       samp_st.UAddressMagFilterRoundingEnable = true;
5029       samp_st.VAddressMagFilterRoundingEnable = true;
5030       samp_st.RAddressMagFilterRoundingEnable = true;
5031    }
5032
5033    bool either_nearest =
5034       sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
5035    unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
5036    unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
5037    unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
5038
5039    if (target == GL_TEXTURE_CUBE_MAP ||
5040        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5041       /* Cube maps must use the same wrap mode for all three coordinate
5042        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
5043        *
5044        * Ivybridge and Baytrail seem to have problems with CUBE mode and
5045        * integer formats.  Fall back to CLAMP for now.
5046        */
5047       if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
5048           !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
5049          wrap_s = TCM_CUBE;
5050          wrap_t = TCM_CUBE;
5051          wrap_r = TCM_CUBE;
5052       } else {
5053          wrap_s = TCM_CLAMP;
5054          wrap_t = TCM_CLAMP;
5055          wrap_r = TCM_CLAMP;
5056       }
5057    } else if (target == GL_TEXTURE_1D) {
5058       /* There's a bug in 1D texture sampling - it actually pays
5059        * attention to the wrap_t value, though it should not.
5060        * Override the wrap_t value here to GL_REPEAT to keep
5061        * any nonexistent border pixels from floating in.
5062        */
5063       wrap_t = TCM_WRAP;
5064    }
5065
5066    samp_st.TCXAddressControlMode = wrap_s;
5067    samp_st.TCYAddressControlMode = wrap_t;
5068    samp_st.TCZAddressControlMode = wrap_r;
5069
5070    samp_st.ShadowFunction =
5071       sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5072       intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
5073
5074 #if GEN_GEN >= 7
5075    /* Set shadow function. */
5076    samp_st.AnisotropicAlgorithm =
5077       samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5078       EWAApproximation : LEGACY;
5079 #endif
5080
5081 #if GEN_GEN >= 6
5082    samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5083 #endif
5084
5085    const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
5086    samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
5087    samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
5088    samp_st.TextureLODBias =
5089       CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
5090
5091 #if GEN_GEN == 6
5092    samp_st.BaseMipLevel =
5093       CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
5094    samp_st.MinandMagStateNotEqual =
5095       samp_st.MinModeFilter != samp_st.MagModeFilter;
5096 #endif
5097
5098    /* Upload the border color if necessary.  If not, just point it at
5099     * offset 0 (the start of the batch) - the color should be ignored,
5100     * but that address won't fault in case something reads it anyway.
5101     */
5102    uint32_t border_color_offset = 0;
5103    if (wrap_mode_needs_border_color(wrap_s) ||
5104        wrap_mode_needs_border_color(wrap_t) ||
5105        wrap_mode_needs_border_color(wrap_r)) {
5106       genX(upload_default_color)(brw, sampler, format, base_format,
5107                                  texObj->_IsIntegerFormat,
5108                                  texObj->StencilSampling,
5109                                  &border_color_offset);
5110    }
5111 #if GEN_GEN < 6
5112       samp_st.BorderColorPointer =
5113          ro_bo(brw->batch.state.bo, border_color_offset);
5114 #else
5115       samp_st.BorderColorPointer = border_color_offset;
5116 #endif
5117
5118 #if GEN_GEN >= 8
5119    samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5120 #else
5121    samp_st.LODPreClampEnable = true;
5122 #endif
5123
5124    GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5125 }
5126
5127 static void
5128 update_sampler_state(struct brw_context *brw,
5129                      int unit,
5130                      uint32_t *sampler_state,
5131                      uint32_t batch_offset_for_sampler_state)
5132 {
5133    struct gl_context *ctx = &brw->ctx;
5134    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5135    const struct gl_texture_object *texObj = texUnit->_Current;
5136    const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5137
5138    /* These don't use samplers at all. */
5139    if (texObj->Target == GL_TEXTURE_BUFFER)
5140       return;
5141
5142    struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
5143    genX(update_sampler_state)(brw, texObj->Target,
5144                               ctx->Texture.CubeMapSeamless,
5145                               texUnit->LodBias,
5146                               firstImage->TexFormat, firstImage->_BaseFormat,
5147                               texObj, sampler,
5148                               sampler_state, batch_offset_for_sampler_state);
5149 }
5150
5151 static void
5152 genX(upload_sampler_state_table)(struct brw_context *brw,
5153                                  struct gl_program *prog,
5154                                  struct brw_stage_state *stage_state)
5155 {
5156    struct gl_context *ctx = &brw->ctx;
5157    uint32_t sampler_count = stage_state->sampler_count;
5158
5159    GLbitfield SamplersUsed = prog->SamplersUsed;
5160
5161    if (sampler_count == 0)
5162       return;
5163
5164    /* SAMPLER_STATE is 4 DWords on all platforms. */
5165    const int dwords = GENX(SAMPLER_STATE_length);
5166    const int size_in_bytes = dwords * sizeof(uint32_t);
5167
5168    uint32_t *sampler_state = brw_state_batch(brw,
5169                                              sampler_count * size_in_bytes,
5170                                              32, &stage_state->sampler_offset);
5171    /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
5172
5173    uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
5174
5175    for (unsigned s = 0; s < sampler_count; s++) {
5176       if (SamplersUsed & (1 << s)) {
5177          const unsigned unit = prog->SamplerUnits[s];
5178          if (ctx->Texture.Unit[unit]._Current) {
5179             update_sampler_state(brw, unit, sampler_state,
5180                                  batch_offset_for_sampler_state);
5181          }
5182       }
5183
5184       sampler_state += dwords;
5185       batch_offset_for_sampler_state += size_in_bytes;
5186    }
5187
5188    if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
5189       /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
5190       genX(emit_sampler_state_pointers_xs)(brw, stage_state);
5191    } else {
5192       /* Flag that the sampler state table pointer has changed; later atoms
5193        * will handle it.
5194        */
5195       brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
5196    }
5197 }
5198
5199 static void
5200 genX(upload_fs_samplers)(struct brw_context *brw)
5201 {
5202    /* BRW_NEW_FRAGMENT_PROGRAM */
5203    struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5204    genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5205 }
5206
5207 static const struct brw_tracked_state genX(fs_samplers) = {
5208    .dirty = {
5209       .mesa = _NEW_TEXTURE,
5210       .brw = BRW_NEW_BATCH |
5211              BRW_NEW_BLORP |
5212              BRW_NEW_FRAGMENT_PROGRAM,
5213    },
5214    .emit = genX(upload_fs_samplers),
5215 };
5216
5217 static void
5218 genX(upload_vs_samplers)(struct brw_context *brw)
5219 {
5220    /* BRW_NEW_VERTEX_PROGRAM */
5221    struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5222    genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5223 }
5224
5225 static const struct brw_tracked_state genX(vs_samplers) = {
5226    .dirty = {
5227       .mesa = _NEW_TEXTURE,
5228       .brw = BRW_NEW_BATCH |
5229              BRW_NEW_BLORP |
5230              BRW_NEW_VERTEX_PROGRAM,
5231    },
5232    .emit = genX(upload_vs_samplers),
5233 };
5234
#if GEN_GEN >= 6
/**
 * Upload the sampler state table for the geometry shader stage, if a
 * geometry program is present.
 */
static void
genX(upload_gs_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_GEOMETRY_PROGRAM */
   struct gl_program *geom_prog = brw->programs[MESA_SHADER_GEOMETRY];

   if (geom_prog)
      genX(upload_sampler_state_table)(brw, geom_prog, &brw->gs.base);
}

/* Tracked-state atom tying the geometry sampler upload to its dirty bits. */
static const struct brw_tracked_state genX(gs_samplers) = {
   .emit = genX(upload_gs_samplers),
   .dirty = {
      .brw = BRW_NEW_GEOMETRY_PROGRAM |
             BRW_NEW_BLORP |
             BRW_NEW_BATCH,
      .mesa = _NEW_TEXTURE,
   },
};
#endif
5258
#if GEN_GEN >= 7
/**
 * Upload the sampler state table for the tessellation control stage, if a
 * tessellation control program is present.
 */
static void
genX(upload_tcs_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_TESS_PROGRAMS */
   struct gl_program *tcs_prog = brw->programs[MESA_SHADER_TESS_CTRL];

   if (tcs_prog)
      genX(upload_sampler_state_table)(brw, tcs_prog, &brw->tcs.base);
}

/* Tracked-state atom tying the TCS sampler upload to its dirty bits. */
static const struct brw_tracked_state genX(tcs_samplers) = {
   .emit = genX(upload_tcs_samplers),
   .dirty = {
      .brw = BRW_NEW_TESS_PROGRAMS |
             BRW_NEW_BLORP |
             BRW_NEW_BATCH,
      .mesa = _NEW_TEXTURE,
   },
};
#endif
5281
#if GEN_GEN >= 7
/**
 * Upload the sampler state table for the tessellation evaluation stage, if
 * a tessellation evaluation program is present.
 */
static void
genX(upload_tes_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_TESS_PROGRAMS */
   struct gl_program *tes_prog = brw->programs[MESA_SHADER_TESS_EVAL];

   if (tes_prog)
      genX(upload_sampler_state_table)(brw, tes_prog, &brw->tes.base);
}

/* Tracked-state atom tying the TES sampler upload to its dirty bits. */
static const struct brw_tracked_state genX(tes_samplers) = {
   .emit = genX(upload_tes_samplers),
   .dirty = {
      .brw = BRW_NEW_TESS_PROGRAMS |
             BRW_NEW_BLORP |
             BRW_NEW_BATCH,
      .mesa = _NEW_TEXTURE,
   },
};
#endif
5304
#if GEN_GEN >= 7
/**
 * Upload the sampler state table for the compute stage, if a compute
 * program is present.
 */
static void
genX(upload_cs_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_COMPUTE_PROGRAM */
   struct gl_program *compute_prog = brw->programs[MESA_SHADER_COMPUTE];

   if (compute_prog)
      genX(upload_sampler_state_table)(brw, compute_prog, &brw->cs.base);
}

/* Tracked-state atom tying the compute sampler upload to its dirty bits.
 * Unlike the other sampler atoms in this file it has external linkage.
 */
const struct brw_tracked_state genX(cs_samplers) = {
   .emit = genX(upload_cs_samplers),
   .dirty = {
      .brw = BRW_NEW_COMPUTE_PROGRAM |
             BRW_NEW_BLORP |
             BRW_NEW_BATCH,
      .mesa = _NEW_TEXTURE,
   },
};
#endif
5327
5328 /* ---------------------------------------------------------------------- */
5329
5330 #if GEN_GEN <= 5
5331
5332 static void genX(upload_blend_constant_color)(struct brw_context *brw)
5333 {
5334    struct gl_context *ctx = &brw->ctx;
5335
5336    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
5337       blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
5338       blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
5339       blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
5340       blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
5341    }
5342 }
5343
5344 static const struct brw_tracked_state genX(blend_constant_color) = {
5345    .dirty = {
5346       .mesa = _NEW_COLOR,
5347       .brw = BRW_NEW_CONTEXT |
5348              BRW_NEW_BLORP,
5349    },
5350    .emit = genX(upload_blend_constant_color)
5351 };
5352 #endif
5353
5354 /* ---------------------------------------------------------------------- */
5355
/**
 * Register this hardware generation's state atom lists with the context.
 *
 * Exactly one render_atoms[] table is compiled in, selected by GEN_GEN
 * (< 6, == 6, == 7, >= 8), and copied into the render pipeline with
 * brw_copy_pipeline_atoms().  On Gen7+ a compute_atoms[] table is copied
 * into the compute pipeline as well, and the emit_mi_report_perf_count
 * vtable entry is set.
 *
 * The order of entries within each table is significant; see the ordering
 * notes next to individual entries.
 */
void
genX(init_atoms)(struct brw_context *brw)
{
#if GEN_GEN < 6
   static const struct brw_tracked_state *render_atoms[] =
   {
      /* Once all the programs are done, we know how large urb entry
       * sizes need to be and can decide if we need to change the urb
       * layout.
       */
      &brw_curbe_offsets,
      &brw_recalculate_urb_fence,

      &genX(cc_vp),
      &genX(color_calc_state),

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_wm_pull_constants,
      &brw_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &brw_vs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),

      /* These set up state for brw_psp_urb_cbs */
      &genX(wm_state),
      &genX(sf_clip_viewport),
      &genX(sf_state),
      &genX(vs_state), /* always required, enabled or not */
      &genX(clip_state),
      &genX(gs_state),

      /* Command packets:
       */
      &brw_binding_table_pointers,
      &genX(blend_constant_color),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &brw_psp_urb_cbs,

      &genX(drawing_rect),
      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

      &brw_constant_buffer
   };
#elif GEN_GEN == 6
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(sf_clip_viewport),

      /* Command packets: */

      &genX(cc_vp),

      &gen6_urb,
      &genX(blend_state),               /* must do before cc unit */
      &genX(color_calc_state),  /* must do before cc unit */
      &genX(depth_stencil_state),       /* must do before cc unit */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_state */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &gen6_sol_surface,
      &brw_vs_binding_table,
      &gen6_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(gs_samplers),
      &gen6_sampler_state,
      &genX(multisample_state),

      &genX(vs_state),
      &genX(gs_state),
      &genX(clip_state),
      &genX(sf_state),
      &genX(wm_state),

      &genX(scissor_state),

      &gen6_binding_table_pointers,

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),
   };
#elif GEN_GEN == 7
   static const struct brw_tracked_state *render_atoms[] =
   {
      /* Command packets: */

      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gen7_l3_state,
      &gen7_push_constant_space,
      &gen7_urb,
      &genX(blend_state),               /* must do before cc unit */
      &genX(color_calc_state),  /* must do before cc unit */
      &genX(depth_stencil_state),       /* must do before cc unit */

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(wm_state),
      &genX(ps_state),

      &genX(scissor_state),

      &gen7_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

      /* The cut index atom is only listed for Haswell within Gen7. */
#if GEN_IS_HASWELL
      &genX(cut_index),
#endif
   };
#elif GEN_GEN >= 8
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gen7_l3_state,
      &gen7_push_constant_space,
      &gen7_urb,
      &genX(blend_state),
      &genX(color_calc_state),

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(raster_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(ps_blend),
      &genX(ps_extra),
      &genX(ps_state),
      &genX(depth_stencil_state),
      &genX(wm_state),

      &genX(scissor_state),

      &gen7_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &genX(vf_topology),

      &brw_indices,
      &genX(index_buffer),
      &genX(vertices),

      &genX(cut_index),
      &gen8_pma_fix,
   };
#endif

   /* Compile-time check that the selected table fits in the context's
    * render atom array before copying it.
    */
   STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
   brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
                           render_atoms, ARRAY_SIZE(render_atoms));

#if GEN_GEN >= 7
   /* Atoms for the compute pipeline, registered on Gen7+ only. */
   static const struct brw_tracked_state *compute_atoms[] =
   {
      &gen7_l3_state,
      &brw_cs_image_surfaces,
      &genX(cs_push_constants),
      &genX(cs_pull_constants),
      &brw_cs_ubo_surfaces,
      &brw_cs_texture_surfaces,
      &brw_cs_work_groups_surface,
      &genX(cs_samplers),
      &genX(cs_state),
   };

   STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
   brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
                           compute_atoms, ARRAY_SIZE(compute_atoms));

   brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
#endif
}