src/mesa/drivers/dri/i965/genX_state_upload.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25
  26 #include "common/gen_device_info.h"
  27 #include "common/gen_sample_positions.h"
  28 #include "genxml/gen_macros.h"
  29
  30 #include "main/bufferobj.h"
  31 #include "main/context.h"
  32 #include "main/enums.h"
  33 #include "main/macros.h"
  34 #include "main/state.h"
  35
  36 #include "brw_context.h"
  37 #include "brw_draw.h"
  38 #include "brw_multisample_state.h"
  39 #include "brw_state.h"
  40 #include "brw_wm.h"
  41 #include "brw_util.h"
  42
  43 #include "intel_batchbuffer.h"
  44 #include "intel_buffer_objects.h"
  45 #include "intel_fbo.h"
  46
  47 #include "main/enums.h"
  48 #include "main/fbobject.h"
  49 #include "main/framebuffer.h"
  50 #include "main/glformats.h"
  51 #include "main/samplerobj.h"
  52 #include "main/shaderapi.h"
  53 #include "main/stencil.h"
  54 #include "main/transformfeedback.h"
  55 #include "main/varray.h"
  56 #include "main/viewport.h"
  57 #include "util/half_float.h"
  58
  59 UNUSED static void *
  60 emit_dwords(struct brw_context *brw, unsigned n)
  61 {
  62    intel_batchbuffer_begin(brw, n, RENDER_RING);
  63    uint32_t *map = brw->batch.map_next;
  64    brw->batch.map_next += n;
  65    intel_batchbuffer_advance(brw);
  66    return map;
  67 }
  68
  69 struct brw_address {
  70    struct brw_bo *bo;
  71    unsigned reloc_flags;
  72    uint32_t offset;
  73 };
  74
  75 #define __gen_address_type struct brw_address
  76 #define __gen_user_data struct brw_context
  77
  78 static uint64_t
  79 __gen_combine_address(struct brw_context *brw, void *location,
  80                       struct brw_address address, uint32_t delta)
  81 {
  82    struct intel_batchbuffer *batch = &brw->batch;
  83    uint32_t offset;
  84
  85    if (address.bo == NULL) {
  86       return address.offset + delta;
  87    } else {
  88       if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
  89          offset = (char *) location - (char *) brw->batch.state.map;
  90          return brw_state_reloc(batch, offset, address.bo,
  91                                 address.offset + delta,
  92                                 address.reloc_flags);
  93       }
  94
  95       assert(!brw_ptr_in_state_buffer(batch, location));
  96
  97       offset = (char *) location - (char *) brw->batch.batch.map;
  98       return brw_batch_reloc(batch, offset, address.bo,
  99                              address.offset + delta,
 100                              address.reloc_flags);
 101    }
 102 }
 103
 104 static struct brw_address
 105 rw_bo(struct brw_bo *bo, uint32_t offset)
 106 {
 107    return (struct brw_address) {
 108             .bo = bo,
 109             .offset = offset,
 110             .reloc_flags = RELOC_WRITE,
 111    };
 112 }
 113
 114 static struct brw_address
 115 ro_bo(struct brw_bo *bo, uint32_t offset)
 116 {
 117    return (struct brw_address) {
 118             .bo = bo,
 119             .offset = offset,
 120    };
 121 }
 122
 123 UNUSED static struct brw_address
 124 ggtt_bo(struct brw_bo *bo, uint32_t offset)
 125 {
 126    return (struct brw_address) {
 127             .bo = bo,
 128             .offset = offset,
 129             .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
 130    };
 131 }
 132
 133 #if GEN_GEN == 4
 134 static struct brw_address
 135 KSP(struct brw_context *brw, uint32_t offset)
 136 {
 137    return ro_bo(brw->cache.bo, offset);
 138 }
 139 #else
 140 static uint32_t
 141 KSP(struct brw_context *brw, uint32_t offset)
 142 {
 143    return offset;
 144 }
 145 #endif
 146
 147 #include "genxml/genX_pack.h"
 148
 149 #define _brw_cmd_length(cmd) cmd ## _length
 150 #define _brw_cmd_length_bias(cmd) cmd ## _length_bias
 151 #define _brw_cmd_header(cmd) cmd ## _header
 152 #define _brw_cmd_pack(cmd) cmd ## _pack
 153
 154 #define brw_batch_emit(brw, cmd, name)                  \
 155    for (struct cmd name = { _brw_cmd_header(cmd) },     \
 156         *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
 157         __builtin_expect(_dst != NULL, 1);              \
 158         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
 159         _dst = NULL)
 160
 161 #define brw_batch_emitn(brw, cmd, n, ...) ({           \
 162       uint32_t *_dw = emit_dwords(brw, n);             \
 163       struct cmd template = {                          \
 164          _brw_cmd_header(cmd),                         \
 165          .DWordLength = n - _brw_cmd_length_bias(cmd), \
 166          __VA_ARGS__                                   \
 167       };                                               \
 168       _brw_cmd_pack(cmd)(brw, _dw, &template);         \
 169       _dw + 1; /* Array starts at dw[1] */             \
 170    })
 171
 172 #define brw_state_emit(brw, cmd, align, offset, name)              \
 173    for (struct cmd name = {},                                      \
 174         *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
 175                                 align, offset);                    \
 176         __builtin_expect(_dst != NULL, 1);                         \
 177         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
 178         _dst = NULL)
 179
 180 /**
 181  * Polygon stipple packet
 182  */
 183 static void
 184 genX(upload_polygon_stipple)(struct brw_context *brw)
 185 {
 186    struct gl_context *ctx = &brw->ctx;
 187
 188    /* _NEW_POLYGON */
 189    if (!ctx->Polygon.StippleFlag)
 190       return;
 191
 192    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
 193       /* Polygon stipple is provided in OpenGL order, i.e. bottom
 194        * row first.  If we're rendering to a window (i.e. the
 195        * default frame buffer object, 0), then we need to invert
 196        * it to match our pixel layout.  But if we're rendering
 197        * to a FBO (i.e. any named frame buffer object), we *don't*
 198        * need to invert - we already match the layout.
 199        */
 200       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 201          for (unsigned i = 0; i < 32; i++)
 202             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
 203       } else {
 204          for (unsigned i = 0; i < 32; i++)
 205             poly.PatternRow[i] = ctx->PolygonStipple[i];
 206       }
 207    }
 208 }
 209
 210 static const struct brw_tracked_state genX(polygon_stipple) = {
 211    .dirty = {
 212       .mesa = _NEW_POLYGON |
 213               _NEW_POLYGONSTIPPLE,
 214       .brw = BRW_NEW_CONTEXT,
 215    },
 216    .emit = genX(upload_polygon_stipple),
 217 };
 218
 219 /**
 220  * Polygon stipple offset packet
 221  */
 222 static void
 223 genX(upload_polygon_stipple_offset)(struct brw_context *brw)
 224 {
 225    struct gl_context *ctx = &brw->ctx;
 226
 227    /* _NEW_POLYGON */
 228    if (!ctx->Polygon.StippleFlag)
 229       return;
 230
 231    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
 232       /* _NEW_BUFFERS
 233        *
 234        * If we're drawing to a system window we have to invert the Y axis
 235        * in order to match the OpenGL pixel coordinate system, and our
 236        * offset must be matched to the window position.  If we're drawing
 237        * to a user-created FBO then our native pixel coordinate system
 238        * works just fine, and there's no window system to worry about.
 239        */
 240       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 241          poly.PolygonStippleYOffset =
 242             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
 243       }
 244    }
 245 }
 246
 247 static const struct brw_tracked_state genX(polygon_stipple_offset) = {
 248    .dirty = {
 249       .mesa = _NEW_BUFFERS |
 250               _NEW_POLYGON,
 251       .brw = BRW_NEW_CONTEXT,
 252    },
 253    .emit = genX(upload_polygon_stipple_offset),
 254 };
 255
 256 /**
 257  * Line stipple packet
 258  */
 259 static void
 260 genX(upload_line_stipple)(struct brw_context *brw)
 261 {
 262    struct gl_context *ctx = &brw->ctx;
 263
 264    if (!ctx->Line.StippleFlag)
 265       return;
 266
 267    brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
 268       line.LineStipplePattern = ctx->Line.StipplePattern;
 269
 270       line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
 271       line.LineStippleRepeatCount = ctx->Line.StippleFactor;
 272    }
 273 }
 274
 275 static const struct brw_tracked_state genX(line_stipple) = {
 276    .dirty = {
 277       .mesa = _NEW_LINE,
 278       .brw = BRW_NEW_CONTEXT,
 279    },
 280    .emit = genX(upload_line_stipple),
 281 };
 282
 283 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 284 static void
 285 genX(upload_drawing_rect)(struct brw_context *brw)
 286 {
 287    struct gl_context *ctx = &brw->ctx;
 288    const struct gl_framebuffer *fb = ctx->DrawBuffer;
 289    const unsigned int fb_width = _mesa_geometric_width(fb);
 290    const unsigned int fb_height = _mesa_geometric_height(fb);
 291
 292    brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
 293       rect.ClippedDrawingRectangleXMax = fb_width - 1;
 294       rect.ClippedDrawingRectangleYMax = fb_height - 1;
 295    }
 296 }
 297
 298 static const struct brw_tracked_state genX(drawing_rect) = {
 299    .dirty = {
 300       .mesa = _NEW_BUFFERS,
 301       .brw = BRW_NEW_BLORP |
 302              BRW_NEW_CONTEXT,
 303    },
 304    .emit = genX(upload_drawing_rect),
 305 };
 306
 307 static uint32_t *
 308 genX(emit_vertex_buffer_state)(struct brw_context *brw,
 309                                uint32_t *dw,
 310                                unsigned buffer_nr,
 311                                struct brw_bo *bo,
 312                                unsigned start_offset,
 313                                unsigned end_offset,
 314                                unsigned stride,
 315                                unsigned step_rate)
 316 {
 317    struct GENX(VERTEX_BUFFER_STATE) buf_state = {
 318       .VertexBufferIndex = buffer_nr,
 319       .BufferPitch = stride,
 320       .BufferStartingAddress = ro_bo(bo, start_offset),
 321 #if GEN_GEN >= 8
 322       .BufferSize = end_offset - start_offset,
 323 #endif
 324
 325 #if GEN_GEN >= 7
 326       .AddressModifyEnable = true,
 327 #endif
 328
 329 #if GEN_GEN < 8
 330       .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
 331       .InstanceDataStepRate = step_rate,
 332 #if GEN_GEN >= 5
 333       .EndAddress = ro_bo(bo, end_offset - 1),
 334 #endif
 335 #endif
 336
 337 #if GEN_GEN == 11
 338       .VertexBufferMOCS = ICL_MOCS_WB,
 339 #elif GEN_GEN == 10
 340       .VertexBufferMOCS = CNL_MOCS_WB,
 341 #elif GEN_GEN == 9
 342       .VertexBufferMOCS = SKL_MOCS_WB,
 343 #elif GEN_GEN == 8
 344       .VertexBufferMOCS = BDW_MOCS_WB,
 345 #elif GEN_GEN == 7
 346       .VertexBufferMOCS = GEN7_MOCS_L3,
 347 #endif
 348    };
 349
 350    GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
 351    return dw + GENX(VERTEX_BUFFER_STATE_length);
 352 }
 353
 354 UNUSED static bool
 355 is_passthru_format(uint32_t format)
 356 {
 357    switch (format) {
 358    case ISL_FORMAT_R64_PASSTHRU:
 359    case ISL_FORMAT_R64G64_PASSTHRU:
 360    case ISL_FORMAT_R64G64B64_PASSTHRU:
 361    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 362       return true;
 363    default:
 364       return false;
 365    }
 366 }
 367
 368 UNUSED static int
 369 uploads_needed(uint32_t format,
 370                bool is_dual_slot)
 371 {
 372    if (!is_passthru_format(format))
 373       return 1;
 374
 375    if (is_dual_slot)
 376       return 2;
 377
 378    switch (format) {
 379    case ISL_FORMAT_R64_PASSTHRU:
 380    case ISL_FORMAT_R64G64_PASSTHRU:
 381       return 1;
 382    case ISL_FORMAT_R64G64B64_PASSTHRU:
 383    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 384       return 2;
 385    default:
 386       unreachable("not reached");
 387    }
 388 }
 389
 390 /*
 391  * Returns the format that we are finally going to use when upload a vertex
 392  * element. It will only change if we are using *64*PASSTHRU formats, as for
 393  * gen < 8 they need to be splitted on two *32*FLOAT formats.
 394  *
 395  * @upload points in which upload we are. Valid values are [0,1]
 396  */
 397 static uint32_t
 398 downsize_format_if_needed(uint32_t format,
 399                           int upload)
 400 {
 401    assert(upload == 0 || upload == 1);
 402
 403    if (!is_passthru_format(format))
 404       return format;
 405
 406    /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload ==
 407     * 1 means that we have been forced to do 2 uploads for a size <= 2. This
 408     * happens with gen < 8 and dvec3 or dvec4 vertex shader input
 409     * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
 410     * flagging that we want to fill with zeroes this second forced upload.
 411     */
 412    switch (format) {
 413    case ISL_FORMAT_R64_PASSTHRU:
 414       return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
 415                          : ISL_FORMAT_R32_FLOAT;
 416    case ISL_FORMAT_R64G64_PASSTHRU:
 417       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 418                          : ISL_FORMAT_R32_FLOAT;
 419    case ISL_FORMAT_R64G64B64_PASSTHRU:
 420       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 421                          : ISL_FORMAT_R32G32_FLOAT;
 422    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 423       return ISL_FORMAT_R32G32B32A32_FLOAT;
 424    default:
 425       unreachable("not reached");
 426    }
 427 }
 428
 429 /*
 430  * Returns the number of componentes associated with a format that is used on
 431  * a 64 to 32 format split. See downsize_format()
 432  */
 433 static int
 434 upload_format_size(uint32_t upload_format)
 435 {
 436    switch (upload_format) {
 437    case ISL_FORMAT_R32_FLOAT:
 438
 439       /* downsized_format has returned this one in order to flag that we are
 440        * performing a second upload which we want to have filled with
 441        * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4
 442        * vertex shader input variables.
 443        */
 444
 445       return 0;
 446    case ISL_FORMAT_R32G32_FLOAT:
 447       return 2;
 448    case ISL_FORMAT_R32G32B32A32_FLOAT:
 449       return 4;
 450    default:
 451       unreachable("not reached");
 452    }
 453 }
 454
 455 static void
 456 genX(emit_vertices)(struct brw_context *brw)
 457 {
 458    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 459    uint32_t *dw;
 460
 461    brw_prepare_vertices(brw);
 462    brw_prepare_shader_draw_parameters(brw);
 463
 464 #if GEN_GEN < 6
 465    brw_emit_query_begin(brw);
 466 #endif
 467
 468    const struct brw_vs_prog_data *vs_prog_data =
 469       brw_vs_prog_data(brw->vs.base.prog_data);
 470
 471 #if GEN_GEN >= 8
 472    struct gl_context *ctx = &brw->ctx;
 473    const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
 474                                 ctx->Polygon.BackMode != GL_FILL);
 475
 476    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
 477       unsigned vue = brw->vb.nr_enabled;
 478
 479       /* The element for the edge flags must always be last, so we have to
 480        * insert the SGVS before it in that case.
 481        */
 482       if (uses_edge_flag) {
 483          assert(vue > 0);
 484          vue--;
 485       }
 486
 487       WARN_ONCE(vue >= 33,
 488                 "Trying to insert VID/IID past 33rd vertex element, "
 489                 "need to reorder the vertex attrbutes.");
 490
 491       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
 492          if (vs_prog_data->uses_vertexid) {
 493             vfs.VertexIDEnable = true;
 494             vfs.VertexIDComponentNumber = 2;
 495             vfs.VertexIDElementOffset = vue;
 496          }
 497
 498          if (vs_prog_data->uses_instanceid) {
 499             vfs.InstanceIDEnable = true;
 500             vfs.InstanceIDComponentNumber = 3;
 501             vfs.InstanceIDElementOffset = vue;
 502          }
 503       }
 504
 505       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 506          vfi.InstancingEnable = true;
 507          vfi.VertexElementIndex = vue;
 508       }
 509    } else {
 510       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
 511    }
 512 #endif
 513
 514    const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
 515                                     vs_prog_data->uses_baseinstance ||
 516                                     vs_prog_data->uses_instanceid ||
 517                                     vs_prog_data->uses_vertexid);
 518
 519    unsigned nr_elements =
 520       brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
 521
 522 #if GEN_GEN < 8
 523    /* If any of the formats of vb.enabled needs more that one upload, we need
 524     * to add it to nr_elements
 525     */
 526    for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
 527       struct brw_vertex_element *input = brw->vb.enabled[i];
 528       uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
 529
 530       if (uploads_needed(format, input->is_dual_slot) > 1)
 531          nr_elements++;
 532    }
 533 #endif
 534
 535    /* If the VS doesn't read any inputs (calculating vertex position from
 536     * a state variable for some reason, for example), emit a single pad
 537     * VERTEX_ELEMENT struct and bail.
 538     *
 539     * The stale VB state stays in place, but they don't do anything unless
 540     * a VE loads from them.
 541     */
 542    if (nr_elements == 0) {
 543       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 544                            1 + GENX(VERTEX_ELEMENT_STATE_length));
 545       struct GENX(VERTEX_ELEMENT_STATE) elem = {
 546          .Valid = true,
 547          .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32B32A32_FLOAT,
 548          .Component0Control = VFCOMP_STORE_0,
 549          .Component1Control = VFCOMP_STORE_0,
 550          .Component2Control = VFCOMP_STORE_0,
 551          .Component3Control = VFCOMP_STORE_1_FP,
 552       };
 553       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
 554       return;
 555    }
 556
 557    /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
 558    const bool uses_draw_params =
 559       vs_prog_data->uses_basevertex ||
 560       vs_prog_data->uses_baseinstance;
 561    const unsigned nr_buffers = brw->vb.nr_buffers +
 562       uses_draw_params + vs_prog_data->uses_drawid;
 563
 564    if (nr_buffers) {
 565       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
 566
 567       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
 568                            1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);
 569
 570       for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
 571          const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
 572          /* Prior to Haswell and Bay Trail we have to use 4-component formats
 573           * to fake 3-component ones.  In particular, we do this for
 574           * half-float and 8 and 16-bit integer formats.  This means that the
 575           * vertex element may poke over the end of the buffer by 2 bytes.
 576           */
 577          const unsigned padding =
 578             (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
 579          const unsigned end = buffer->offset + buffer->size + padding;
 580          dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
 581                                              buffer->offset,
 582                                              end,
 583                                              buffer->stride,
 584                                              buffer->step_rate);
 585       }
 586
 587       if (uses_draw_params) {
 588          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
 589                                              brw->draw.draw_params_bo,
 590                                              brw->draw.draw_params_offset,
 591                                              brw->draw.draw_params_bo->size,
 592                                              0 /* stride */,
 593                                              0 /* step rate */);
 594       }
 595
 596       if (vs_prog_data->uses_drawid) {
 597          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
 598                                              brw->draw.draw_id_bo,
 599                                              brw->draw.draw_id_offset,
 600                                              brw->draw.draw_id_bo->size,
 601                                              0 /* stride */,
 602                                              0 /* step rate */);
 603       }
 604    }
 605
 606    /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
 607     * presumably for VertexID/InstanceID.
 608     */
 609 #if GEN_GEN >= 6
 610    assert(nr_elements <= 34);
 611    const struct brw_vertex_element *gen6_edgeflag_input = NULL;
 612 #else
 613    assert(nr_elements <= 18);
 614 #endif
 615
 616    dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 617                         1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
 618    unsigned i;
 619    for (i = 0; i < brw->vb.nr_enabled; i++) {
 620       const struct brw_vertex_element *input = brw->vb.enabled[i];
 621       uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
 622       uint32_t comp0 = VFCOMP_STORE_SRC;
 623       uint32_t comp1 = VFCOMP_STORE_SRC;
 624       uint32_t comp2 = VFCOMP_STORE_SRC;
 625       uint32_t comp3 = VFCOMP_STORE_SRC;
 626       const unsigned num_uploads = GEN_GEN < 8 ?
 627          uploads_needed(format, input->is_dual_slot) : 1;
 628
 629 #if GEN_GEN >= 8
 630       /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
 631        * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
 632        * element which has edge flag enabled."
 633        */
 634       assert(!(is_passthru_format(format) && uses_edge_flag));
 635 #endif
 636
 637       /* The gen4 driver expects edgeflag to come in as a float, and passes
 638        * that float on to the tests in the clipper.  Mesa's current vertex
 639        * attribute value for EdgeFlag is stored as a float, which works out.
 640        * glEdgeFlagPointer, on the other hand, gives us an unnormalized
 641        * integer ubyte.  Just rewrite that to convert to a float.
 642        *
 643        * Gen6+ passes edgeflag as sideband along with the vertex, instead
 644        * of in the VUE.  We have to upload it sideband as the last vertex
 645        * element according to the B-Spec.
 646        */
 647 #if GEN_GEN >= 6
 648       if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
 649          gen6_edgeflag_input = input;
 650          continue;
 651       }
 652 #endif
 653
 654       for (unsigned c = 0; c < num_uploads; c++) {
 655          const uint32_t upload_format = GEN_GEN >= 8 ? format :
 656             downsize_format_if_needed(format, c);
 657          /* If we need more that one upload, the offset stride would be 128
 658           * bits (16 bytes), as for previous uploads we are using the full
 659           * entry. */
 660          const unsigned offset = input->offset + c * 16;
 661
 662          const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
 663             upload_format_size(upload_format) : input->glarray->Size;
 664
 665          switch (size) {
 666             case 0: comp0 = VFCOMP_STORE_0;
 667             case 1: comp1 = VFCOMP_STORE_0;
 668             case 2: comp2 = VFCOMP_STORE_0;
 669             case 3:
 670                if (GEN_GEN >= 8 && input->glarray->Doubles) {
 671                   comp3 = VFCOMP_STORE_0;
 672                } else if (input->glarray->Integer) {
 673                   comp3 = VFCOMP_STORE_1_INT;
 674                } else {
 675                   comp3 = VFCOMP_STORE_1_FP;
 676                }
 677
 678                break;
 679          }
 680
 681 #if GEN_GEN >= 8
 682          /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
 683           *
 684           *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
 685           *     formats, 64-bit components are stored in the URB without any
 686           *     conversion. In this case, vertex elements must be written as 128
 687           *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
 688           *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
 689           *     component into the URB, Component 1 must be specified as
 690           *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
 691           *     order to output a 128-bit vertex element, or Components 1-3 must
 692           *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
 693           *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
 694           *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
 695           *     vertex element."
 696           */
 697          if (input->glarray->Doubles && !input->is_dual_slot) {
 698             /* Store vertex elements which correspond to double and dvec2 vertex
 699              * shader inputs as 128-bit vertex elements, instead of 256-bits.
 700              */
 701             comp2 = VFCOMP_NOSTORE;
 702             comp3 = VFCOMP_NOSTORE;
 703          }
 704 #endif
 705
 706          struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 707             .VertexBufferIndex = input->buffer,
 708             .Valid = true,
 709             .SourceElementFormat = upload_format,
 710             .SourceElementOffset = offset,
 711             .Component0Control = comp0,
 712             .Component1Control = comp1,
 713             .Component2Control = comp2,
 714             .Component3Control = comp3,
 715 #if GEN_GEN < 5
 716             .DestinationElementOffset = i * 4,
 717 #endif
 718          };
 719
 720          GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 721          dw += GENX(VERTEX_ELEMENT_STATE_length);
 722       }
 723    }
 724
 725    if (needs_sgvs_element) {
 726       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 727          .Valid = true,
 728          .Component0Control = VFCOMP_STORE_0,
 729          .Component1Control = VFCOMP_STORE_0,
 730          .Component2Control = VFCOMP_STORE_0,
 731          .Component3Control = VFCOMP_STORE_0,
 732 #if GEN_GEN < 5
 733          .DestinationElementOffset = i * 4,
 734 #endif
 735       };
 736
 737 #if GEN_GEN >= 8
 738       if (vs_prog_data->uses_basevertex ||
 739           vs_prog_data->uses_baseinstance) {
 740          elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 741          elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
 742          elem_state.Component0Control = VFCOMP_STORE_SRC;
 743          elem_state.Component1Control = VFCOMP_STORE_SRC;
 744       }
 745 #else
 746       elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 747       elem_state.SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32G32_UINT;
 748       if (vs_prog_data->uses_basevertex)
 749          elem_state.Component0Control = VFCOMP_STORE_SRC;
 750
 751       if (vs_prog_data->uses_baseinstance)
 752          elem_state.Component1Control = VFCOMP_STORE_SRC;
 753
 754       if (vs_prog_data->uses_vertexid)
 755          elem_state.Component2Control = VFCOMP_STORE_VID;
 756
 757       if (vs_prog_data->uses_instanceid)
 758          elem_state.Component3Control = VFCOMP_STORE_IID;
 759 #endif
 760
 761       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 762       dw += GENX(VERTEX_ELEMENT_STATE_length);
 763    }
 764
 765    if (vs_prog_data->uses_drawid) {
 766       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 767          .Valid = true,
 768          .VertexBufferIndex = brw->vb.nr_buffers + 1,
 769          .SourceElementFormat = (enum GENX(SURFACE_FORMAT)) ISL_FORMAT_R32_UINT,
 770          .Component0Control = VFCOMP_STORE_SRC,
 771          .Component1Control = VFCOMP_STORE_0,
 772          .Component2Control = VFCOMP_STORE_0,
 773          .Component3Control = VFCOMP_STORE_0,
 774 #if GEN_GEN < 5
 775          .DestinationElementOffset = i * 4,
 776 #endif
 777       };
 778
 779       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 780       dw += GENX(VERTEX_ELEMENT_STATE_length);
 781    }
 782
 783 #if GEN_GEN >= 6
 784    if (gen6_edgeflag_input) {
 785       const uint32_t format =
 786          brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
 787
 788       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 789          .Valid = true,
 790          .VertexBufferIndex = gen6_edgeflag_input->buffer,
 791          .EdgeFlagEnable = true,
 792          .SourceElementFormat = format,
 793          .SourceElementOffset = gen6_edgeflag_input->offset,
 794          .Component0Control = VFCOMP_STORE_SRC,
 795          .Component1Control = VFCOMP_STORE_0,
 796          .Component2Control = VFCOMP_STORE_0,
 797          .Component3Control = VFCOMP_STORE_0,
 798       };
 799
 800       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 801       dw += GENX(VERTEX_ELEMENT_STATE_length);
 802    }
 803 #endif
 804
 805 #if GEN_GEN >= 8
 806    for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
 807       const struct brw_vertex_element *input = brw->vb.enabled[i];
 808       const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
 809       unsigned element_index;
 810
 811       /* The edge flag element is reordered to be the last one in the code
 812        * above so we need to compensate for that in the element indices used
 813        * below.
 814        */
 815       if (input == gen6_edgeflag_input)
 816          element_index = nr_elements - 1;
 817       else
 818          element_index = j++;
 819
 820       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 821          vfi.VertexElementIndex = element_index;
 822          vfi.InstancingEnable = buffer->step_rate != 0;
 823          vfi.InstanceDataStepRate = buffer->step_rate;
 824       }
 825    }
 826
 827    if (vs_prog_data->uses_drawid) {
 828       const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
 829
 830       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 831          vfi.VertexElementIndex = element;
 832       }
 833    }
 834 #endif
 835 }
 836
 837 static const struct brw_tracked_state genX(vertices) = {
 838    .dirty = {
 839       .mesa = _NEW_POLYGON,
 840       .brw = BRW_NEW_BATCH |
 841              BRW_NEW_BLORP |
 842              BRW_NEW_VERTICES |
 843              BRW_NEW_VS_PROG_DATA,
 844    },
 845    .emit = genX(emit_vertices),
 846 };
 847
 848 static void
 849 genX(emit_index_buffer)(struct brw_context *brw)
 850 {
 851    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
 852
 853    if (index_buffer == NULL)
 854       return;
 855
 856    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 857 #if GEN_GEN < 8 && !GEN_IS_HASWELL
 858       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 859 #endif
 860       ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
 861       ib.BufferStartingAddress = ro_bo(brw->ib.bo, 0);
 862 #if GEN_GEN >= 8
 863       ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
 864       ib.BufferSize = brw->ib.size;
 865 #else
 866       ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
 867 #endif
 868    }
 869 }
 870
 871 static const struct brw_tracked_state genX(index_buffer) = {
 872    .dirty = {
 873       .mesa = 0,
 874       .brw = BRW_NEW_BATCH |
 875              BRW_NEW_BLORP |
 876              BRW_NEW_INDEX_BUFFER,
 877    },
 878    .emit = genX(emit_index_buffer),
 879 };
 880
 881 #if GEN_IS_HASWELL || GEN_GEN >= 8
 882 static void
 883 genX(upload_cut_index)(struct brw_context *brw)
 884 {
 885    const struct gl_context *ctx = &brw->ctx;
 886
 887    brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
 888       if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
 889          vf.IndexedDrawCutIndexEnable = true;
 890          vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
 891       }
 892    }
 893 }
 894
 895 const struct brw_tracked_state genX(cut_index) = {
 896    .dirty = {
 897       .mesa  = _NEW_TRANSFORM,
 898       .brw   = BRW_NEW_INDEX_BUFFER,
 899    },
 900    .emit = genX(upload_cut_index),
 901 };
 902 #endif
 903
 904 #if GEN_GEN >= 6
 905 /**
 906  * Determine the appropriate attribute override value to store into the
 907  * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
 908  * override value contains two pieces of information: the location of the
 909  * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
 910  * flag indicating whether to "swizzle" the attribute based on the direction
 911  * the triangle is facing.
 912  *
 913  * If an attribute is "swizzled", then the given VUE location is used for
 914  * front-facing triangles, and the VUE location that immediately follows is
 915  * used for back-facing triangles.  We use this to implement the mapping from
 916  * gl_FrontColor/gl_BackColor to gl_Color.
 917  *
 918  * urb_entry_read_offset is the offset into the VUE at which the SF unit is
 919  * being instructed to begin reading attribute data.  It can be set to a
 920  * nonzero value to prevent the SF unit from wasting time reading elements of
 921  * the VUE that are not needed by the fragment shader.  It is measured in
 922  * 256-bit increments.
 923  */
 924 static void
 925 genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
 926                         const struct brw_vue_map *vue_map,
 927                         int urb_entry_read_offset, int fs_attr,
 928                         bool two_side_color, uint32_t *max_source_attr)
 929 {
 930    /* Find the VUE slot for this attribute. */
 931    int slot = vue_map->varying_to_slot[fs_attr];
 932
 933    /* Viewport and Layer are stored in the VUE header.  We need to override
 934     * them to zero if earlier stages didn't write them, as GL requires that
 935     * they read back as zero when not explicitly set.
 936     */
 937    if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
 938       attr->ComponentOverrideX = true;
 939       attr->ComponentOverrideW = true;
 940       attr->ConstantSource = CONST_0000;
 941
 942       if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
 943          attr->ComponentOverrideY = true;
 944       if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
 945          attr->ComponentOverrideZ = true;
 946
 947       return;
 948    }
 949
 950    /* If there was only a back color written but not front, use back
 951     * as the color instead of undefined
 952     */
 953    if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
 954       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
 955    if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
 956       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
 957
 958    if (slot == -1) {
 959       /* This attribute does not exist in the VUE--that means that the vertex
 960        * shader did not write to it.  This means that either:
 961        *
 962        * (a) This attribute is a texture coordinate, and it is going to be
 963        * replaced with point coordinates (as a consequence of a call to
 964        * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
 965        * hardware will ignore whatever attribute override we supply.
 966        *
 967        * (b) This attribute is read by the fragment shader but not written by
 968        * the vertex shader, so its value is undefined.  Therefore the
 969        * attribute override we supply doesn't matter.
 970        *
 971        * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
 972        * previous shader stage.
 973        *
 974        * Note that we don't have to worry about the cases where the attribute
 975        * is gl_PointCoord or is undergoing point sprite coordinate
 976        * replacement, because in those cases, this function isn't called.
 977        *
 978        * In case (c), we need to program the attribute overrides so that the
 979        * primitive ID will be stored in this slot.  In every other case, the
 980        * attribute override we supply doesn't matter.  So just go ahead and
 981        * program primitive ID in every case.
 982        */
 983       attr->ComponentOverrideW = true;
 984       attr->ComponentOverrideX = true;
 985       attr->ComponentOverrideY = true;
 986       attr->ComponentOverrideZ = true;
 987       attr->ConstantSource = PRIM_ID;
 988       return;
 989    }
 990
 991    /* Compute the location of the attribute relative to urb_entry_read_offset.
 992     * Each increment of urb_entry_read_offset represents a 256-bit value, so
 993     * it counts for two 128-bit VUE slots.
 994     */
 995    int source_attr = slot - 2 * urb_entry_read_offset;
 996    assert(source_attr >= 0 && source_attr < 32);
 997
 998    /* If we are doing two-sided color, and the VUE slot following this one
 999     * represents a back-facing color, then we need to instruct the SF unit to
1000     * do back-facing swizzling.
1001     */
1002    bool swizzling = two_side_color &&
1003       ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
1004         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
1005        (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
1006         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
1007
1008    /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
1009    if (*max_source_attr < source_attr + swizzling)
1010       *max_source_attr = source_attr + swizzling;
1011
1012    attr->SourceAttribute = source_attr;
1013    if (swizzling)
1014       attr->SwizzleSelect = INPUTATTR_FACING;
1015 }
1016
1017
1018 static void
1019 genX(calculate_attr_overrides)(const struct brw_context *brw,
1020                                struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1021                                uint32_t *point_sprite_enables,
1022                                uint32_t *urb_entry_read_length,
1023                                uint32_t *urb_entry_read_offset)
1024 {
1025    const struct gl_context *ctx = &brw->ctx;
1026
1027    /* _NEW_POINT */
1028    const struct gl_point_attrib *point = &ctx->Point;
1029
1030    /* BRW_NEW_FRAGMENT_PROGRAM */
1031    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1032
1033    /* BRW_NEW_FS_PROG_DATA */
1034    const struct brw_wm_prog_data *wm_prog_data =
1035       brw_wm_prog_data(brw->wm.base.prog_data);
1036    uint32_t max_source_attr = 0;
1037
1038    *point_sprite_enables = 0;
1039
1040    int first_slot =
1041       brw_compute_first_urb_slot_required(fp->info.inputs_read,
1042                                           &brw->vue_map_geom_out);
1043
1044    /* Each URB offset packs two varying slots */
1045    assert(first_slot % 2 == 0);
1046    *urb_entry_read_offset = first_slot / 2;
1047
1048    /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1049     * description of dw10 Point Sprite Texture Coordinate Enable:
1050     *
1051     * "This field must be programmed to zero when non-point primitives
1052     * are rendered."
1053     *
1054     * The SandyBridge PRM doesn't explicitly say that point sprite enables
1055     * must be programmed to zero when rendering non-point primitives, but
1056     * the IvyBridge PRM does, and if we don't, we get garbage.
1057     *
1058     * This is not required on Haswell, as the hardware ignores this state
1059     * when drawing non-points -- although we do still need to be careful to
1060     * correctly set the attr overrides.
1061     *
1062     * _NEW_POLYGON
1063     * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1064     */
1065    bool drawing_points = brw_is_drawing_points(brw);
1066
1067    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1068       int input_index = wm_prog_data->urb_setup[attr];
1069
1070       if (input_index < 0)
1071          continue;
1072
1073       /* _NEW_POINT */
1074       bool point_sprite = false;
1075       if (drawing_points) {
1076          if (point->PointSprite &&
1077              (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1078              (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1079             point_sprite = true;
1080          }
1081
1082          if (attr == VARYING_SLOT_PNTC)
1083             point_sprite = true;
1084
1085          if (point_sprite)
1086             *point_sprite_enables |= (1 << input_index);
1087       }
1088
1089       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1090       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1091
1092       if (!point_sprite) {
1093          genX(get_attr_override)(&attribute,
1094                                  &brw->vue_map_geom_out,
1095                                  *urb_entry_read_offset, attr,
1096                                  _mesa_vertex_program_two_side_enabled(ctx),
1097                                  &max_source_attr);
1098       }
1099
1100       /* The hardware can only do the overrides on 16 overrides at a
1101        * time, and the other up to 16 have to be lined up so that the
1102        * input index = the output index.  We'll need to do some
1103        * tweaking to make sure that's the case.
1104        */
1105       if (input_index < 16)
1106          attr_overrides[input_index] = attribute;
1107       else
1108          assert(attribute.SourceAttribute == input_index);
1109    }
1110
1111    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1112     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1113     *
1114     * "This field should be set to the minimum length required to read the
1115     *  maximum source attribute.  The maximum source attribute is indicated
1116     *  by the maximum value of the enabled Attribute # Source Attribute if
1117     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1118     *  enable is not set.
1119     *  read_length = ceiling((max_source_attr + 1) / 2)
1120     *
1121     *  [errata] Corruption/Hang possible if length programmed larger than
1122     *  recommended"
1123     *
1124     * Similar text exists for Ivy Bridge.
1125     */
1126    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1127 }
1128 #endif
1129
1130 /* ---------------------------------------------------------------------- */
1131
1132 #if GEN_GEN >= 8
1133 typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
1134 #elif GEN_GEN >= 6
1135 typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
1136 #else
1137 typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
1138 #endif
1139
1140 static inline void
1141 set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
1142 {
1143    struct gl_context *ctx = &brw->ctx;
1144
1145    /* _NEW_BUFFERS */
1146    struct intel_renderbuffer *depth_irb =
1147       intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1148
1149    /* _NEW_DEPTH */
1150    struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1151
1152    /* _NEW_STENCIL */
1153    struct gl_stencil_attrib *stencil = &ctx->Stencil;
1154    const int b = stencil->_BackFace;
1155
1156    if (depth->Test && depth_irb) {
1157       ds->DepthTestEnable = true;
1158       ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1159       ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
1160    }
1161
1162    if (brw->stencil_enabled) {
1163       ds->StencilTestEnable = true;
1164       ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
1165       ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
1166
1167       ds->StencilTestFunction =
1168          intel_translate_compare_func(stencil->Function[0]);
1169       ds->StencilFailOp =
1170          intel_translate_stencil_op(stencil->FailFunc[0]);
1171       ds->StencilPassDepthPassOp =
1172          intel_translate_stencil_op(stencil->ZPassFunc[0]);
1173       ds->StencilPassDepthFailOp =
1174          intel_translate_stencil_op(stencil->ZFailFunc[0]);
1175
1176       ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
1177
1178       if (brw->stencil_two_sided) {
1179          ds->DoubleSidedStencilEnable = true;
1180          ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1181          ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1182
1183          ds->BackfaceStencilTestFunction =
1184             intel_translate_compare_func(stencil->Function[b]);
1185          ds->BackfaceStencilFailOp =
1186             intel_translate_stencil_op(stencil->FailFunc[b]);
1187          ds->BackfaceStencilPassDepthPassOp =
1188             intel_translate_stencil_op(stencil->ZPassFunc[b]);
1189          ds->BackfaceStencilPassDepthFailOp =
1190             intel_translate_stencil_op(stencil->ZFailFunc[b]);
1191       }
1192
1193 #if GEN_GEN <= 5 || GEN_GEN >= 9
1194       ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1195       ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1196 #endif
1197    }
1198 }
1199
1200 #if GEN_GEN >= 6
1201 static void
1202 genX(upload_depth_stencil_state)(struct brw_context *brw)
1203 {
1204 #if GEN_GEN >= 8
1205    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
1206       set_depth_stencil_bits(brw, &wmds);
1207    }
1208 #else
1209    uint32_t ds_offset;
1210    brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
1211       set_depth_stencil_bits(brw, &ds);
1212    }
1213
1214    /* Now upload a pointer to the indirect state */
1215 #if GEN_GEN == 6
1216    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
1217       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1218       ptr.DEPTH_STENCIL_STATEChange = true;
1219    }
1220 #else
1221    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
1222       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1223    }
1224 #endif
1225 #endif
1226 }
1227
1228 static const struct brw_tracked_state genX(depth_stencil_state) = {
1229    .dirty = {
1230       .mesa = _NEW_BUFFERS |
1231               _NEW_DEPTH |
1232               _NEW_STENCIL,
1233       .brw  = BRW_NEW_BLORP |
1234               (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
1235                             : BRW_NEW_BATCH |
1236                               BRW_NEW_STATE_BASE_ADDRESS),
1237    },
1238    .emit = genX(upload_depth_stencil_state),
1239 };
1240 #endif
1241
1242 /* ---------------------------------------------------------------------- */
1243
1244 #if GEN_GEN <= 5
1245
1246 static void
1247 genX(upload_clip_state)(struct brw_context *brw)
1248 {
1249    struct gl_context *ctx = &brw->ctx;
1250
1251    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1252    brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
1253       clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
1254       clip.GRFRegisterCount =
1255          DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
1256       clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1257       clip.SingleProgramFlow = true;
1258       clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
1259       clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
1260
1261       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1262       clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
1263       clip.DispatchGRFStartRegisterForURBData = 1;
1264       clip.VertexURBEntryReadOffset = 0;
1265
1266       /* BRW_NEW_URB_FENCE */
1267       clip.NumberofURBEntries = brw->urb.nr_clip_entries;
1268       clip.URBEntryAllocationSize = brw->urb.vsize - 1;
1269
1270       if (brw->urb.nr_clip_entries >= 10) {
1271          /* Half of the URB entries go to each thread, and it has to be an
1272           * even number.
1273           */
1274          assert(brw->urb.nr_clip_entries % 2 == 0);
1275
1276          /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
1277           * only 2 threads can output VUEs at a time.
1278           */
1279          clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
1280       } else {
1281          assert(brw->urb.nr_clip_entries >= 5);
1282          clip.MaximumNumberofThreads = 1 - 1;
1283       }
1284
1285       clip.VertexPositionSpace = VPOS_NDCSPACE;
1286       clip.UserClipFlagsMustClipEnable = true;
1287       clip.GuardbandClipTestEnable = true;
1288
1289       clip.ClipperViewportStatePointer =
1290          ro_bo(brw->batch.state.bo, brw->clip.vp_offset);
1291
1292       clip.ScreenSpaceViewportXMin = -1;
1293       clip.ScreenSpaceViewportXMax = 1;
1294       clip.ScreenSpaceViewportYMin = -1;
1295       clip.ScreenSpaceViewportYMax = 1;
1296
1297       clip.ViewportXYClipTestEnable = true;
1298       clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
1299
1300       /* _NEW_TRANSFORM */
1301       if (GEN_GEN == 5 || GEN_IS_G4X) {
1302          clip.UserClipDistanceClipTestEnableBitmask =
1303             ctx->Transform.ClipPlanesEnabled;
1304       } else {
1305          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
1306           * workaround.
1307           */
1308          clip.UserClipDistanceClipTestEnableBitmask =
1309             (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
1310       }
1311
1312       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1313          clip.APIMode = APIMODE_D3D;
1314       else
1315          clip.APIMode = APIMODE_OGL;
1316
1317       clip.GuardbandClipTestEnable = true;
1318
1319       clip.ClipMode = brw->clip.prog_data->clip_mode;
1320
1321 #if GEN_IS_G4X
1322       clip.NegativeWClipTestEnable = true;
1323 #endif
1324    }
1325 }
1326
1327 const struct brw_tracked_state genX(clip_state) = {
1328    .dirty = {
1329       .mesa  = _NEW_TRANSFORM |
1330                _NEW_VIEWPORT,
1331       .brw   = BRW_NEW_BATCH |
1332                BRW_NEW_BLORP |
1333                BRW_NEW_CLIP_PROG_DATA |
1334                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1335                BRW_NEW_PROGRAM_CACHE |
1336                BRW_NEW_URB_FENCE,
1337    },
1338    .emit = genX(upload_clip_state),
1339 };
1340
1341 #else
1342
1343 static void
1344 genX(upload_clip_state)(struct brw_context *brw)
1345 {
1346    struct gl_context *ctx = &brw->ctx;
1347
1348    /* _NEW_BUFFERS */
1349    struct gl_framebuffer *fb = ctx->DrawBuffer;
1350
1351    /* BRW_NEW_FS_PROG_DATA */
1352    struct brw_wm_prog_data *wm_prog_data =
1353       brw_wm_prog_data(brw->wm.base.prog_data);
1354
1355    brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
1356       clip.StatisticsEnable = !brw->meta_in_progress;
1357
1358       if (wm_prog_data->barycentric_interp_modes &
1359           BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
1360          clip.NonPerspectiveBarycentricEnable = true;
1361
1362 #if GEN_GEN >= 7
1363       clip.EarlyCullEnable = true;
1364 #endif
1365
1366 #if GEN_GEN == 7
1367       clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
1368
1369       if (ctx->Polygon.CullFlag) {
1370          switch (ctx->Polygon.CullFaceMode) {
1371          case GL_FRONT:
1372             clip.CullMode = CULLMODE_FRONT;
1373             break;
1374          case GL_BACK:
1375             clip.CullMode = CULLMODE_BACK;
1376             break;
1377          case GL_FRONT_AND_BACK:
1378             clip.CullMode = CULLMODE_BOTH;
1379             break;
1380          default:
1381             unreachable("Should not get here: invalid CullFlag");
1382          }
1383       } else {
1384          clip.CullMode = CULLMODE_NONE;
1385       }
1386 #endif
1387
1388 #if GEN_GEN < 8
1389       clip.UserClipDistanceCullTestEnableBitmask =
1390          brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
1391
1392       clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
1393 #endif
1394
1395       /* _NEW_LIGHT */
1396       if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
1397          clip.TriangleStripListProvokingVertexSelect = 0;
1398          clip.TriangleFanProvokingVertexSelect = 1;
1399          clip.LineStripListProvokingVertexSelect = 0;
1400       } else {
1401          clip.TriangleStripListProvokingVertexSelect = 2;
1402          clip.TriangleFanProvokingVertexSelect = 2;
1403          clip.LineStripListProvokingVertexSelect = 1;
1404       }
1405
1406       /* _NEW_TRANSFORM */
1407       clip.UserClipDistanceClipTestEnableBitmask =
1408          ctx->Transform.ClipPlanesEnabled;
1409
1410 #if GEN_GEN >= 8
1411       clip.ForceUserClipDistanceClipTestEnableBitmask = true;
1412 #endif
1413
1414       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1415          clip.APIMode = APIMODE_D3D;
1416       else
1417          clip.APIMode = APIMODE_OGL;
1418
1419       clip.GuardbandClipTestEnable = true;
1420
1421       /* BRW_NEW_VIEWPORT_COUNT */
1422       const unsigned viewport_count = brw->clip.viewport_count;
1423
1424       if (ctx->RasterDiscard) {
1425          clip.ClipMode = CLIPMODE_REJECT_ALL;
1426 #if GEN_GEN == 6
1427          perf_debug("Rasterizer discard is currently implemented via the "
1428                     "clipper; having the GS not write primitives would "
1429                     "likely be faster.\n");
1430 #endif
1431       } else {
1432          clip.ClipMode = CLIPMODE_NORMAL;
1433       }
1434
1435       clip.ClipEnable = true;
1436
1437       /* _NEW_POLYGON,
1438        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
1439        */
1440       if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
1441          clip.ViewportXYClipTestEnable = true;
1442
1443       clip.MinimumPointWidth = 0.125;
1444       clip.MaximumPointWidth = 255.875;
1445       clip.MaximumVPIndex = viewport_count - 1;
1446       if (_mesa_geometric_layers(fb) == 0)
1447          clip.ForceZeroRTAIndexEnable = true;
1448    }
1449 }
1450
1451 static const struct brw_tracked_state genX(clip_state) = {
1452    .dirty = {
1453       .mesa  = _NEW_BUFFERS |
1454                _NEW_LIGHT |
1455                _NEW_POLYGON |
1456                _NEW_TRANSFORM,
1457       .brw   = BRW_NEW_BLORP |
1458                BRW_NEW_CONTEXT |
1459                BRW_NEW_FS_PROG_DATA |
1460                BRW_NEW_GS_PROG_DATA |
1461                BRW_NEW_VS_PROG_DATA |
1462                BRW_NEW_META_IN_PROGRESS |
1463                BRW_NEW_PRIMITIVE |
1464                BRW_NEW_RASTERIZER_DISCARD |
1465                BRW_NEW_TES_PROG_DATA |
1466                BRW_NEW_VIEWPORT_COUNT,
1467    },
1468    .emit = genX(upload_clip_state),
1469 };
1470 #endif
1471
1472 /* ---------------------------------------------------------------------- */
1473
1474 static void
1475 genX(upload_sf)(struct brw_context *brw)
1476 {
1477    struct gl_context *ctx = &brw->ctx;
1478    float point_size;
1479
1480 #if GEN_GEN <= 7
1481    /* _NEW_BUFFERS */
1482    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
1483    UNUSED const bool multisampled_fbo =
1484       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1485 #endif
1486
1487 #if GEN_GEN < 6
1488    const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
1489
1490    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1491
1492    brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
1493       sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
1494       sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1495       sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
1496       sf.DispatchGRFStartRegisterForURBData = 3;
1497       sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
1498       sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
1499       sf.NumberofURBEntries = brw->urb.nr_sf_entries;
1500       sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
1501
1502       /* STATE_PREFETCH command description describes this state as being
1503        * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
1504        * domain.
1505        */
1506       sf.SetupViewportStateOffset =
1507          ro_bo(brw->batch.state.bo, brw->sf.vp_offset);
1508
1509       sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1510
1511       /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
1512       /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
1513
1514       sf.MaximumNumberofThreads =
1515          MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
1516
1517       sf.SpritePointEnable = ctx->Point.PointSprite;
1518
1519       sf.DestinationOriginHorizontalBias = 0.5;
1520       sf.DestinationOriginVerticalBias = 0.5;
1521 #else
1522    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
1523       sf.StatisticsEnable = true;
1524 #endif
1525       sf.ViewportTransformEnable = true;
1526
1527 #if GEN_GEN == 7
1528       /* _NEW_BUFFERS */
1529       sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
1530 #endif
1531
1532 #if GEN_GEN <= 7
1533       /* _NEW_POLYGON */
1534       sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
1535 #if GEN_GEN >= 6
1536       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
1537       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
1538       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
1539
1540       switch (ctx->Polygon.FrontMode) {
1541          case GL_FILL:
1542             sf.FrontFaceFillMode = FILL_MODE_SOLID;
1543             break;
1544          case GL_LINE:
1545             sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
1546             break;
1547          case GL_POINT:
1548             sf.FrontFaceFillMode = FILL_MODE_POINT;
1549             break;
1550          default:
1551             unreachable("not reached");
1552       }
1553
1554       switch (ctx->Polygon.BackMode) {
1555          case GL_FILL:
1556             sf.BackFaceFillMode = FILL_MODE_SOLID;
1557             break;
1558          case GL_LINE:
1559             sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
1560             break;
1561          case GL_POINT:
1562             sf.BackFaceFillMode = FILL_MODE_POINT;
1563             break;
1564          default:
1565             unreachable("not reached");
1566       }
1567
1568       if (multisampled_fbo && ctx->Multisample.Enabled)
1569          sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1570
1571       sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1572       sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1573       sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1574 #endif
1575
1576       sf.ScissorRectangleEnable = true;
1577
1578       if (ctx->Polygon.CullFlag) {
1579          switch (ctx->Polygon.CullFaceMode) {
1580             case GL_FRONT:
1581                sf.CullMode = CULLMODE_FRONT;
1582                break;
1583             case GL_BACK:
1584                sf.CullMode = CULLMODE_BACK;
1585                break;
1586             case GL_FRONT_AND_BACK:
1587                sf.CullMode = CULLMODE_BOTH;
1588                break;
1589             default:
1590                unreachable("not reached");
1591          }
1592       } else {
1593          sf.CullMode = CULLMODE_NONE;
1594       }
1595
1596 #if GEN_IS_HASWELL
1597       sf.LineStippleEnable = ctx->Line.StippleFlag;
1598 #endif
1599
1600 #endif
1601
1602       /* _NEW_LINE */
1603 #if GEN_GEN == 8
1604       const struct gen_device_info *devinfo = &brw->screen->devinfo;
1605
1606       if (devinfo->is_cherryview)
1607          sf.CHVLineWidth = brw_get_line_width(brw);
1608       else
1609          sf.LineWidth = brw_get_line_width(brw);
1610 #else
1611       sf.LineWidth = brw_get_line_width(brw);
1612 #endif
1613
1614       if (ctx->Line.SmoothFlag) {
1615          sf.LineEndCapAntialiasingRegionWidth = _10pixels;
1616 #if GEN_GEN <= 7
1617          sf.AntiAliasingEnable = true;
1618 #endif
1619       }
1620
1621       /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
1622       point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
1623       /* Clamp to the hardware limits */
1624       sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
1625
1626       /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
1627       if (use_state_point_size(brw))
1628          sf.PointWidthSource = State;
1629
1630 #if GEN_GEN >= 8
1631       /* _NEW_POINT | _NEW_MULTISAMPLE */
1632       if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
1633           !ctx->Point.PointSprite)
1634          sf.SmoothPointEnable = true;
1635 #endif
1636
1637 #if GEN_GEN == 10
1638       /* _NEW_BUFFERS
1639        * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
1640        */
1641       const bool multisampled_fbo =
1642          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1643       if (multisampled_fbo)
1644          sf.SmoothPointEnable = false;
1645 #endif
1646
1647 #if GEN_IS_G4X || GEN_GEN >= 5
1648       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
1649 #endif
1650
1651       /* _NEW_LIGHT */
1652       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
1653          sf.TriangleStripListProvokingVertexSelect = 2;
1654          sf.TriangleFanProvokingVertexSelect = 2;
1655          sf.LineStripListProvokingVertexSelect = 1;
1656       } else {
1657          sf.TriangleFanProvokingVertexSelect = 1;
1658       }
1659
1660 #if GEN_GEN == 6
1661       /* BRW_NEW_FS_PROG_DATA */
1662       const struct brw_wm_prog_data *wm_prog_data =
1663          brw_wm_prog_data(brw->wm.base.prog_data);
1664
1665       sf.AttributeSwizzleEnable = true;
1666       sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1667
1668       /*
1669        * Window coordinates in an FBO are inverted, which means point
1670        * sprite origin must be inverted, too.
1671        */
1672       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
1673          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
1674       } else {
1675          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
1676       }
1677
1678       /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
1679        * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
1680        */
1681       uint32_t urb_entry_read_length;
1682       uint32_t urb_entry_read_offset;
1683       uint32_t point_sprite_enables;
1684       genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
1685                                      &urb_entry_read_length,
1686                                      &urb_entry_read_offset);
1687       sf.VertexURBEntryReadLength = urb_entry_read_length;
1688       sf.VertexURBEntryReadOffset = urb_entry_read_offset;
1689       sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
1690       sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
1691 #endif
1692    }
1693 }
1694
1695 static const struct brw_tracked_state genX(sf_state) = {
1696    .dirty = {
1697       .mesa  = _NEW_LIGHT |
1698                _NEW_LINE |
1699                _NEW_POINT |
1700                _NEW_PROGRAM |
1701                (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
1702                (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) |
1703                (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
1704       .brw   = BRW_NEW_BLORP |
1705                BRW_NEW_VUE_MAP_GEOM_OUT |
1706                (GEN_GEN <= 5 ? BRW_NEW_BATCH |
1707                                BRW_NEW_PROGRAM_CACHE |
1708                                BRW_NEW_SF_PROG_DATA |
1709                                BRW_NEW_SF_VP |
1710                                BRW_NEW_URB_FENCE
1711                              : 0) |
1712                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
1713                (GEN_GEN >= 6 && GEN_GEN <= 7 ?
1714                                BRW_NEW_GS_PROG_DATA |
1715                                BRW_NEW_PRIMITIVE |
1716                                BRW_NEW_TES_PROG_DATA
1717                              : 0) |
1718                (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
1719                                BRW_NEW_FRAGMENT_PROGRAM
1720                              : 0),
1721    },
1722    .emit = genX(upload_sf),
1723 };
1724
1725 /* ---------------------------------------------------------------------- */
1726
1727 static bool
1728 brw_color_buffer_write_enabled(struct brw_context *brw)
1729 {
1730    struct gl_context *ctx = &brw->ctx;
1731    /* BRW_NEW_FRAGMENT_PROGRAM */
1732    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1733    unsigned i;
1734
1735    /* _NEW_BUFFERS */
1736    for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
1737       struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
1738       uint64_t outputs_written = fp->info.outputs_written;
1739
1740       /* _NEW_COLOR */
1741       if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
1742                  outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
1743           GET_COLORMASK(ctx->Color.ColorMask, i)) {
1744          return true;
1745       }
1746    }
1747
1748    return false;
1749 }
1750
1751 static void
1752 genX(upload_wm)(struct brw_context *brw)
1753 {
1754    struct gl_context *ctx = &brw->ctx;
1755
1756    /* BRW_NEW_FS_PROG_DATA */
1757    const struct brw_wm_prog_data *wm_prog_data =
1758       brw_wm_prog_data(brw->wm.base.prog_data);
1759
1760    UNUSED bool writes_depth =
1761       wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
1762    UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
1763    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
1764
1765 #if GEN_GEN == 6
1766    /* We can't fold this into gen6_upload_wm_push_constants(), because
1767     * according to the SNB PRM, vol 2 part 1 section 7.2.2
1768     * (3DSTATE_CONSTANT_PS [DevSNB]):
1769     *
1770     *     "[DevSNB]: This packet must be followed by WM_STATE."
1771     */
1772    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
1773       if (wm_prog_data->base.nr_params != 0) {
1774          wmcp.Buffer0Valid = true;
1775          /* Pointer to the WM constant buffer.  Covered by the set of
1776           * state flags from gen6_upload_wm_push_constants.
1777           */
1778          wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
1779          wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
1780       }
1781    }
1782 #endif
1783
1784 #if GEN_GEN >= 6
1785    brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
1786       wm.LineAntialiasingRegionWidth = _10pixels;
1787       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1788
1789       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1790       wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
1791 #else
1792    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1793    brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
1794       if (wm_prog_data->dispatch_8 && wm_prog_data->dispatch_16) {
1795          /* These two fields should be the same pre-gen6, which is why we
1796           * only have one hardware field to program for both dispatch
1797           * widths.
1798           */
1799          assert(wm_prog_data->base.dispatch_grf_start_reg ==
1800                 wm_prog_data->dispatch_grf_start_reg_2);
1801       }
1802
1803       if (wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16)
1804          wm.GRFRegisterCount0 = wm_prog_data->reg_blocks_0;
1805
1806       if (stage_state->sampler_count)
1807          wm.SamplerStatePointer =
1808             ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
1809 #if GEN_GEN == 5
1810       if (wm_prog_data->prog_offset_2)
1811          wm.GRFRegisterCount2 = wm_prog_data->reg_blocks_2;
1812 #endif
1813
1814       wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
1815       wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
1816       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1817       wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
1818       wm.EarlyDepthTestEnable = true;
1819       wm.LineAntialiasingRegionWidth = _05pixels;
1820       wm.LineEndCapAntialiasingRegionWidth = _10pixels;
1821
1822       /* _NEW_POLYGON */
1823       if (ctx->Polygon.OffsetFill) {
1824          wm.GlobalDepthOffsetEnable = true;
1825          /* Something weird going on with legacy_global_depth_bias,
1826           * offset_constant, scaling and MRD.  This value passes glean
1827           * but gives some odd results elsewere (eg. the
1828           * quad-offset-units test).
1829           */
1830          wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1831
1832          /* This is the only value that passes glean:
1833          */
1834          wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1835       }
1836
1837       wm.DepthCoefficientURBReadOffset = 1;
1838 #endif
1839
1840       /* BRW_NEW_STATS_WM */
1841       wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;
1842
1843 #if GEN_GEN < 7
1844       if (wm_prog_data->base.use_alt_mode)
1845          wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1846
1847       wm.SamplerCount = GEN_GEN == 5 ?
1848          0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
1849
1850       wm.BindingTableEntryCount =
1851          wm_prog_data->base.binding_table.size_bytes / 4;
1852       wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
1853       wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
1854       wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
1855       wm.DispatchGRFStartRegisterForConstantSetupData0 =
1856          wm_prog_data->base.dispatch_grf_start_reg;
1857       if (GEN_GEN == 6 ||
1858           wm_prog_data->dispatch_8 || wm_prog_data->dispatch_16) {
1859          wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
1860       }
1861
1862 #if GEN_GEN >= 5
1863       if (GEN_GEN == 6 || wm_prog_data->prog_offset_2) {
1864          wm.KernelStartPointer2 =
1865             KSP(brw, stage_state->prog_offset + wm_prog_data->prog_offset_2);
1866       }
1867 #endif
1868
1869 #if GEN_GEN == 6
1870       wm.DualSourceBlendEnable =
1871          wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
1872          ctx->Color.Blend[0]._UsesDualSrc;
1873       wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1874       wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1875
1876       /* From the SNB PRM, volume 2 part 1, page 281:
1877        * "If the PS kernel does not need the Position XY Offsets
1878        * to compute a Position XY value, then this field should be
1879        * programmed to POSOFFSET_NONE."
1880        *
1881        * "SW Recommendation: If the PS kernel needs the Position Offsets
1882        * to compute a Position XY value, this field should match Position
1883        * ZW Interpolation Mode to ensure a consistent position.xyzw
1884        * computation."
1885        * We only require XY sample offsets. So, this recommendation doesn't
1886        * look useful at the moment. We might need this in future.
1887        */
1888       if (wm_prog_data->uses_pos_offset)
1889          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
1890       else
1891          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
1892
1893       wm.DispatchGRFStartRegisterForConstantSetupData2 =
1894          wm_prog_data->dispatch_grf_start_reg_2;
1895 #endif
1896
1897       if (wm_prog_data->base.total_scratch) {
1898          wm.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
1899          wm.PerThreadScratchSpace =
1900             ffs(stage_state->per_thread_scratch) - 11;
1901       }
1902
1903       wm.PixelShaderComputedDepth = writes_depth;
1904 #endif
1905
1906       /* _NEW_LINE */
1907       wm.LineStippleEnable = ctx->Line.StippleFlag;
1908
1909       /* _NEW_POLYGON */
1910       wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
1911
1912 #if GEN_GEN < 8
1913
1914 #if GEN_GEN >= 6
1915       wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
1916
1917       /* _NEW_BUFFERS */
1918       const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1919
1920       if (multisampled_fbo) {
1921          /* _NEW_MULTISAMPLE */
1922          if (ctx->Multisample.Enabled)
1923             wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1924          else
1925             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1926
1927          if (wm_prog_data->persample_dispatch)
1928             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1929          else
1930             wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
1931       } else {
1932          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1933          wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1934       }
1935 #endif
1936       wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
1937       if (wm_prog_data->uses_kill ||
1938           _mesa_is_alpha_test_enabled(ctx) ||
1939           _mesa_is_alpha_to_coverage_enabled(ctx) ||
1940           (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
1941          wm.PixelShaderKillsPixel = true;
1942       }
1943
1944       /* _NEW_BUFFERS | _NEW_COLOR */
1945       if (brw_color_buffer_write_enabled(brw) || writes_depth ||
1946           wm.PixelShaderKillsPixel ||
1947           (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
1948          wm.ThreadDispatchEnable = true;
1949       }
1950
1951 #if GEN_GEN >= 7
1952       wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
1953       wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
1954 #endif
1955
1956       /* The "UAV access enable" bits are unnecessary on HSW because they only
1957        * seem to have an effect on the HW-assisted coherency mechanism which we
1958        * don't need, and the rasterization-related UAV_ONLY flag and the
1959        * DISPATCH_ENABLE bit can be set independently from it.
1960        * C.f. gen8_upload_ps_extra().
1961        *
1962        * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
1963        * _NEW_COLOR
1964        */
1965 #if GEN_IS_HASWELL
1966       if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
1967           wm_prog_data->has_side_effects)
1968          wm.PSUAVonly = ON;
1969 #endif
1970 #endif
1971
1972 #if GEN_GEN >= 7
1973       /* BRW_NEW_FS_PROG_DATA */
1974       if (wm_prog_data->early_fragment_tests)
1975          wm.EarlyDepthStencilControl = EDSC_PREPS;
1976       else if (wm_prog_data->has_side_effects)
1977          wm.EarlyDepthStencilControl = EDSC_PSEXEC;
1978 #endif
1979    }
1980
1981 #if GEN_GEN <= 5
1982    if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
1983       brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
1984          clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1985       }
1986
1987       brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
1988    }
1989 #endif
1990 }
1991
1992 static const struct brw_tracked_state genX(wm_state) = {
1993    .dirty = {
1994       .mesa  = _NEW_LINE |
1995                _NEW_POLYGON |
1996                (GEN_GEN < 8 ? _NEW_BUFFERS |
1997                               _NEW_COLOR :
1998                               0) |
1999                (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
2000                (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) |
2001                (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0),
2002       .brw   = BRW_NEW_BLORP |
2003                BRW_NEW_FS_PROG_DATA |
2004                (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2005                               BRW_NEW_FRAGMENT_PROGRAM |
2006                               BRW_NEW_PROGRAM_CACHE |
2007                               BRW_NEW_SAMPLER_STATE_TABLE |
2008                               BRW_NEW_STATS_WM
2009                             : 0) |
2010                (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
2011    },
2012    .emit = genX(upload_wm),
2013 };
2014
2015 /* ---------------------------------------------------------------------- */
2016
/*
 * Assign the thread-dispatch fields of a shader-stage packet.
 *
 * Expands to a plain sequence of statements that write fields of `pkt`;
 * `prefix` is token-pasted in front of the URBEntryReadLength and
 * URBEntryReadOffset field names.  The expansion site must have `brw`,
 * `stage_state`, `stage_prog_data` and `vue_prog_data` in scope.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
   pkt.Enable           = true;                                           \
   pkt.StatisticsEnable = true;                                           \
                                                                          \
   pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
   pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
   pkt.BindingTableEntryCount =                                           \
      stage_prog_data->binding_table.size_bytes / 4;                      \
   /* The sampler count is clamped to 16 and stored in units of four. */  \
   pkt.SamplerCount       =                                               \
      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
                                                                          \
   pkt.DispatchGRFStartRegisterForURBData =                               \
      stage_prog_data->dispatch_grf_start_reg;                            \
   pkt.prefix##URBEntryReadOffset = 0;                                    \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
                                                                          \
   /* Scratch fields are only programmed when the shader uses scratch. */ \
   if (stage_prog_data->total_scratch) {                                  \
      pkt.PerThreadScratchSpace =                                         \
         ffs(stage_state->per_thread_scratch) - 11;                       \
      pkt.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);    \
   }
2038
2039 static void
2040 genX(upload_vs_state)(struct brw_context *brw)
2041 {
2042    UNUSED struct gl_context *ctx = &brw->ctx;
2043    const struct gen_device_info *devinfo = &brw->screen->devinfo;
2044    struct brw_stage_state *stage_state = &brw->vs.base;
2045
2046    /* BRW_NEW_VS_PROG_DATA */
2047    const struct brw_vue_prog_data *vue_prog_data =
2048       brw_vue_prog_data(brw->vs.base.prog_data);
2049    const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
2050
2051    assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
2052           vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
2053    assert(GEN_GEN < 11 ||
2054           vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
2055
2056 #if GEN_GEN == 6
2057    /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
2058     * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
2059     *
2060     *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
2061     *   command that causes the VS Function Enable to toggle. Pipeline
2062     *   flush can be executed by sending a PIPE_CONTROL command with CS
2063     *   stall bit set and a post sync operation.
2064     *
2065     * We've already done such a flush at the start of state upload, so we
2066     * don't need to do another one here.
2067     */
2068    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
2069       if (stage_state->push_const_size != 0) {
2070          cvs.Buffer0Valid = true;
2071          cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
2072          cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2073       }
2074    }
2075 #endif
2076
2077    if (GEN_GEN == 7 && devinfo->is_ivybridge)
2078       gen7_emit_vs_workaround_flush(brw);
2079
2080 #if GEN_GEN >= 6
2081    brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
2082 #else
2083    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2084    brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
2085 #endif
2086       INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
2087
2088       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
2089
2090 #if GEN_GEN < 6
2091       vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
2092       vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
2093       vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;
2094
2095       vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
2096       vs.URBEntryAllocationSize = brw->urb.vsize - 1;
2097
2098       vs.MaximumNumberofThreads =
2099          CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
2100
2101       vs.StatisticsEnable = false;
2102       vs.SamplerStatePointer =
2103          ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
2104 #endif
2105
2106 #if GEN_GEN == 5
2107       /* Force single program flow on Ironlake.  We cannot reliably get
2108        * all applications working without it.  See:
2109        * https://bugs.freedesktop.org/show_bug.cgi?id=29172
2110        *
2111        * The most notable and reliably failing application is the Humus
2112        * demo "CelShading"
2113        */
2114       vs.SingleProgramFlow = true;
2115       vs.SamplerCount = 0; /* hardware requirement */
2116 #endif
2117
2118 #if GEN_GEN >= 8
2119       vs.SIMD8DispatchEnable =
2120          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
2121
2122       vs.UserClipDistanceCullTestEnableBitmask =
2123          vue_prog_data->cull_distance_mask;
2124 #endif
2125    }
2126
2127 #if GEN_GEN == 6
2128    /* Based on my reading of the simulator, the VS constants don't get
2129     * pulled into the VS FF unit until an appropriate pipeline flush
2130     * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
2131     * references to them into a little FIFO.  The flushes are common,
2132     * but don't reliably happen between this and a 3DPRIMITIVE, causing
2133     * the primitive to use the wrong constants.  Then the FIFO
2134     * containing the constant setup gets added to again on the next
2135     * constants change, and eventually when a flush does happen the
2136     * unit is overwhelmed by constant changes and dies.
2137     *
2138     * To avoid this, send a PIPE_CONTROL down the line that will
2139     * update the unit immediately loading the constants.  The flush
2140     * type bits here were those set by the STATE_BASE_ADDRESS whose
2141     * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
2142     * bug reports that led to this workaround, and may be more than
2143     * what is strictly required to avoid the issue.
2144     */
2145    brw_emit_pipe_control_flush(brw,
2146                                PIPE_CONTROL_DEPTH_STALL |
2147                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
2148                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
2149 #endif
2150 }
2151
2152 static const struct brw_tracked_state genX(vs_state) = {
2153    .dirty = {
2154       .mesa  = (GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
2155       .brw   = BRW_NEW_BATCH |
2156                BRW_NEW_BLORP |
2157                BRW_NEW_CONTEXT |
2158                BRW_NEW_VS_PROG_DATA |
2159                (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
2160                (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2161                                BRW_NEW_PROGRAM_CACHE |
2162                                BRW_NEW_SAMPLER_STATE_TABLE |
2163                                BRW_NEW_URB_FENCE
2164                              : 0),
2165    },
2166    .emit = genX(upload_vs_state),
2167 };
2168
2169 /* ---------------------------------------------------------------------- */
2170
2171 static void
2172 genX(upload_cc_viewport)(struct brw_context *brw)
2173 {
2174    struct gl_context *ctx = &brw->ctx;
2175
2176    /* BRW_NEW_VIEWPORT_COUNT */
2177    const unsigned viewport_count = brw->clip.viewport_count;
2178
2179    struct GENX(CC_VIEWPORT) ccv;
2180    uint32_t cc_vp_offset;
2181    uint32_t *cc_map =
2182       brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
2183                       32, &cc_vp_offset);
2184
2185    for (unsigned i = 0; i < viewport_count; i++) {
2186       /* _NEW_VIEWPORT | _NEW_TRANSFORM */
2187       const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
2188       if (ctx->Transform.DepthClamp) {
2189          ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2190          ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2191       } else {
2192          ccv.MinimumDepth = 0.0;
2193          ccv.MaximumDepth = 1.0;
2194       }
2195       GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
2196       cc_map += GENX(CC_VIEWPORT_length);
2197    }
2198
2199 #if GEN_GEN >= 7
2200    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
2201       ptr.CCViewportPointer = cc_vp_offset;
2202    }
2203 #elif GEN_GEN == 6
2204    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2205       vp.CCViewportStateChange = 1;
2206       vp.PointertoCC_VIEWPORT = cc_vp_offset;
2207    }
2208 #else
2209    brw->cc.vp_offset = cc_vp_offset;
2210    ctx->NewDriverState |= BRW_NEW_CC_VP;
2211 #endif
2212 }
2213
2214 const struct brw_tracked_state genX(cc_vp) = {
2215    .dirty = {
2216       .mesa = _NEW_TRANSFORM |
2217               _NEW_VIEWPORT,
2218       .brw = BRW_NEW_BATCH |
2219              BRW_NEW_BLORP |
2220              BRW_NEW_VIEWPORT_COUNT,
2221    },
2222    .emit = genX(upload_cc_viewport)
2223 };
2224
2225 /* ---------------------------------------------------------------------- */
2226
2227 static void
2228 set_scissor_bits(const struct gl_context *ctx, int i,
2229                  bool render_to_fbo, unsigned fb_width, unsigned fb_height,
2230                  struct GENX(SCISSOR_RECT) *sc)
2231 {
2232    int bbox[4];
2233
2234    bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2235    bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2236    bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
2237    bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2238    _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2239
2240    if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2241       /* If the scissor was out of bounds and got clamped to 0 width/height
2242        * at the bounds, the subtraction of 1 from maximums could produce a
2243        * negative number and thus not clip anything.  Instead, just provide
2244        * a min > max scissor inside the bounds, which produces the expected
2245        * no rendering.
2246        */
2247       sc->ScissorRectangleXMin = 1;
2248       sc->ScissorRectangleXMax = 0;
2249       sc->ScissorRectangleYMin = 1;
2250       sc->ScissorRectangleYMax = 0;
2251    } else if (render_to_fbo) {
2252       /* texmemory: Y=0=bottom */
2253       sc->ScissorRectangleXMin = bbox[0];
2254       sc->ScissorRectangleXMax = bbox[1] - 1;
2255       sc->ScissorRectangleYMin = bbox[2];
2256       sc->ScissorRectangleYMax = bbox[3] - 1;
2257    } else {
2258       /* memory: Y=0=top */
2259       sc->ScissorRectangleXMin = bbox[0];
2260       sc->ScissorRectangleXMax = bbox[1] - 1;
2261       sc->ScissorRectangleYMin = fb_height - bbox[3];
2262       sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2263    }
2264 }
2265
2266 #if GEN_GEN >= 6
2267 static void
2268 genX(upload_scissor_state)(struct brw_context *brw)
2269 {
2270    struct gl_context *ctx = &brw->ctx;
2271    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2272    struct GENX(SCISSOR_RECT) scissor;
2273    uint32_t scissor_state_offset;
2274    const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2275    const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2276    uint32_t *scissor_map;
2277
2278    /* BRW_NEW_VIEWPORT_COUNT */
2279    const unsigned viewport_count = brw->clip.viewport_count;
2280
2281    scissor_map = brw_state_batch(
2282       brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
2283       32, &scissor_state_offset);
2284
2285    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
2286
2287    /* The scissor only needs to handle the intersection of drawable and
2288     * scissor rect.  Clipping to the boundaries of static shared buffers
2289     * for front/back/depth is covered by looping over cliprects in brw_draw.c.
2290     *
2291     * Note that the hardware's coordinates are inclusive, while Mesa's min is
2292     * inclusive but max is exclusive.
2293     */
2294    for (unsigned i = 0; i < viewport_count; i++) {
2295       set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
2296       GENX(SCISSOR_RECT_pack)(
2297          NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
2298    }
2299
2300    brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2301       ptr.ScissorRectPointer = scissor_state_offset;
2302    }
2303 }
2304
2305 static const struct brw_tracked_state genX(scissor_state) = {
2306    .dirty = {
2307       .mesa = _NEW_BUFFERS |
2308               _NEW_SCISSOR |
2309               _NEW_VIEWPORT,
2310       .brw = BRW_NEW_BATCH |
2311              BRW_NEW_BLORP |
2312              BRW_NEW_VIEWPORT_COUNT,
2313    },
2314    .emit = genX(upload_scissor_state),
2315 };
2316 #endif
2317
2318 /* ---------------------------------------------------------------------- */
2319
2320 static void
2321 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2322                              float m00, float m11, float m30, float m31,
2323                              float *xmin, float *xmax,
2324                              float *ymin, float *ymax)
2325 {
2326    /* According to the "Vertex X,Y Clamping and Quantization" section of the
2327     * Strips and Fans documentation:
2328     *
2329     * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2330     *  fixed-point "guardband" range supported by the rasterization hardware"
2331     *
2332     * and
2333     *
2334     * "In almost all circumstances, if an object’s vertices are actually
2335     *  modified by this clamping (i.e., had X or Y coordinates outside of
2336     *  the guardband extent the rendered object will not match the intended
2337     *  result.  Therefore software should take steps to ensure that this does
2338     *  not happen - e.g., by clipping objects such that they do not exceed
2339     *  these limits after the Drawing Rectangle is applied."
2340     *
2341     * I believe the fundamental restriction is that the rasterizer (in
2342     * the SF/WM stages) have a limit on the number of pixels that can be
2343     * rasterized.  We need to ensure any coordinates beyond the rasterizer
2344     * limit are handled by the clipper.  So effectively that limit becomes
2345     * the clipper's guardband size.
2346     *
2347     * It goes on to say:
2348     *
2349     * "In addition, in order to be correctly rendered, objects must have a
2350     *  screenspace bounding box not exceeding 8K in the X or Y direction.
2351     *  This additional restriction must also be comprehended by software,
2352     *  i.e., enforced by use of clipping."
2353     *
2354     * This makes no sense.  Gen7+ hardware supports 16K render targets,
2355     * and you definitely need to be able to draw polygons that fill the
2356     * surface.  Our assumption is that the rasterizer was limited to 8K
2357     * on Sandybridge, which only supports 8K surfaces, and it was actually
2358     * increased to 16K on Ivybridge and later.
2359     *
2360     * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
2361     */
2362    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2363
2364    if (m00 != 0 && m11 != 0) {
2365       /* First, we compute the screen-space render area */
2366       const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
2367       const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
2368       const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
2369       const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
2370
2371       /* We want the guardband to be centered on that */
2372       const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
2373       const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
2374       const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
2375       const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
2376
2377       /* Now we need it in native device coordinates */
2378       const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
2379       const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
2380       const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
2381       const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
2382
2383       /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
2384        * flipped upside-down.  X should be fine though.
2385        */
2386       assert(ndc_gb_xmin <= ndc_gb_xmax);
2387       *xmin = ndc_gb_xmin;
2388       *xmax = ndc_gb_xmax;
2389       *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
2390       *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
2391    } else {
2392       /* The viewport scales to 0, so nothing will be rendered. */
2393       *xmin = 0.0f;
2394       *xmax = 0.0f;
2395       *ymin = 0.0f;
2396       *ymax = 0.0f;
2397    }
2398 }
2399
2400 static void
2401 genX(upload_sf_clip_viewport)(struct brw_context *brw)
2402 {
2403    struct gl_context *ctx = &brw->ctx;
2404    float y_scale, y_bias;
2405
2406    /* BRW_NEW_VIEWPORT_COUNT */
2407    const unsigned viewport_count = brw->clip.viewport_count;
2408
2409    /* _NEW_BUFFERS */
2410    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2411    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2412    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2413
2414 #if GEN_GEN >= 7
2415 #define clv sfv
2416    struct GENX(SF_CLIP_VIEWPORT) sfv;
2417    uint32_t sf_clip_vp_offset;
2418    uint32_t *sf_clip_map =
2419       brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2420                       64, &sf_clip_vp_offset);
2421 #else
2422    struct GENX(SF_VIEWPORT) sfv;
2423    struct GENX(CLIP_VIEWPORT) clv;
2424    uint32_t sf_vp_offset, clip_vp_offset;
2425    uint32_t *sf_map =
2426       brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2427                       32, &sf_vp_offset);
2428    uint32_t *clip_map =
2429       brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2430                       32, &clip_vp_offset);
2431 #endif
2432
2433    /* _NEW_BUFFERS */
2434    if (render_to_fbo) {
2435       y_scale = 1.0;
2436       y_bias = 0;
2437    } else {
2438       y_scale = -1.0;
2439       y_bias = (float)fb_height;
2440    }
2441
2442    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2443       /* _NEW_VIEWPORT: Guardband Clipping */
2444       float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2445       _mesa_get_viewport_xform(ctx, i, scale, translate);
2446
2447       sfv.ViewportMatrixElementm00 = scale[0];
2448       sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2449       sfv.ViewportMatrixElementm22 = scale[2],
2450       sfv.ViewportMatrixElementm30 = translate[0],
2451       sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2452       sfv.ViewportMatrixElementm32 = translate[2],
2453       brw_calculate_guardband_size(fb_width, fb_height,
2454                                    sfv.ViewportMatrixElementm00,
2455                                    sfv.ViewportMatrixElementm11,
2456                                    sfv.ViewportMatrixElementm30,
2457                                    sfv.ViewportMatrixElementm31,
2458                                    &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2459
2460
2461       clv.XMinClipGuardband = gb_xmin;
2462       clv.XMaxClipGuardband = gb_xmax;
2463       clv.YMinClipGuardband = gb_ymin;
2464       clv.YMaxClipGuardband = gb_ymax;
2465
2466 #if GEN_GEN < 6
2467       set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
2468                        &sfv.ScissorRectangle);
2469 #elif GEN_GEN >= 8
2470       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2471        * The hardware will take the intersection of the drawing rectangle,
2472        * scissor rectangle, and the viewport extents.  However, emitting
2473        * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full
2474        * pipeline stall so we're better off just being a little more clever
2475        * with our viewport so we can emit it once at context creation time.
2476        */
2477       const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
2478       const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
2479       const float viewport_Xmax =
2480          MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
2481       const float viewport_Ymax =
2482          MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
2483
2484       if (render_to_fbo) {
2485          sfv.XMinViewPort = viewport_Xmin;
2486          sfv.XMaxViewPort = viewport_Xmax - 1;
2487          sfv.YMinViewPort = viewport_Ymin;
2488          sfv.YMaxViewPort = viewport_Ymax - 1;
2489       } else {
2490          sfv.XMinViewPort = viewport_Xmin;
2491          sfv.XMaxViewPort = viewport_Xmax - 1;
2492          sfv.YMinViewPort = fb_height - viewport_Ymax;
2493          sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
2494       }
2495 #endif
2496
2497 #if GEN_GEN >= 7
2498       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2499       sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2500 #else
2501       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2502       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2503       sf_map += GENX(SF_VIEWPORT_length);
2504       clip_map += GENX(CLIP_VIEWPORT_length);
2505 #endif
2506    }
2507
2508 #if GEN_GEN >= 7
2509    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2510       ptr.SFClipViewportPointer = sf_clip_vp_offset;
2511    }
2512 #elif GEN_GEN == 6
2513    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2514       vp.SFViewportStateChange = 1;
2515       vp.CLIPViewportStateChange = 1;
2516       vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2517       vp.PointertoSF_VIEWPORT = sf_vp_offset;
2518    }
2519 #else
2520    brw->sf.vp_offset = sf_vp_offset;
2521    brw->clip.vp_offset = clip_vp_offset;
2522    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2523 #endif
2524 }
2525
2526 static const struct brw_tracked_state genX(sf_clip_viewport) = {
2527    .dirty = {
2528       .mesa = _NEW_BUFFERS |
2529               _NEW_VIEWPORT |
2530               (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
2531       .brw = BRW_NEW_BATCH |
2532              BRW_NEW_BLORP |
2533              BRW_NEW_VIEWPORT_COUNT,
2534    },
2535    .emit = genX(upload_sf_clip_viewport),
2536 };
2537
2538 /* ---------------------------------------------------------------------- */
2539
2540 static void
2541 genX(upload_gs_state)(struct brw_context *brw)
2542 {
2543    UNUSED struct gl_context *ctx = &brw->ctx;
2544    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
2545    const struct brw_stage_state *stage_state = &brw->gs.base;
2546    const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
2547    /* BRW_NEW_GEOMETRY_PROGRAM */
2548    bool active = GEN_GEN >= 6 && gs_prog;
2549
2550    /* BRW_NEW_GS_PROG_DATA */
2551    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
2552    UNUSED const struct brw_vue_prog_data *vue_prog_data =
2553       brw_vue_prog_data(stage_prog_data);
2554 #if GEN_GEN >= 7
2555    const struct brw_gs_prog_data *gs_prog_data =
2556       brw_gs_prog_data(stage_prog_data);
2557 #endif
2558
2559 #if GEN_GEN == 6
2560    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
2561       if (active && stage_state->push_const_size != 0) {
2562          cgs.Buffer0Valid = true;
2563          cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
2564          cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2565       }
2566    }
2567 #endif
2568
2569 #if GEN_GEN == 7 && !GEN_IS_HASWELL
2570    /**
2571     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2572     * Geometry > Geometry Shader > State:
2573     *
2574     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
2575     *     whole fixed function pipeline when the GS enable changes value in
2576     *     the 3DSTATE_GS."
2577     *
2578     * The hardware architects have clarified that in this context "flush the
2579     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2580     * Stall" bit set.
2581     */
2582    if (devinfo->gt == 2 && brw->gs.enabled != active)
2583       gen7_emit_cs_stall_flush(brw);
2584 #endif
2585
2586 #if GEN_GEN >= 6
2587    brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
2588 #else
2589    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
2590    brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
2591 #endif
2592
2593 #if GEN_GEN >= 6
2594       if (active) {
2595          INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
2596
2597 #if GEN_GEN >= 7
2598          gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2599          gs.OutputTopology = gs_prog_data->output_topology;
2600          gs.ControlDataHeaderSize =
2601             gs_prog_data->control_data_header_size_hwords;
2602
2603          gs.InstanceControl = gs_prog_data->invocations - 1;
2604          gs.DispatchMode = vue_prog_data->dispatch_mode;
2605
2606          gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
2607
2608          gs.ControlDataFormat = gs_prog_data->control_data_format;
2609 #endif
2610
2611          /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
2612           * Ivy Bridge and Haswell.
2613           *
2614           * On Ivy Bridge, setting this bit causes the vertices of a triangle
2615           * strip to be delivered to the geometry shader in an order that does
2616           * not strictly follow the OpenGL spec, but preserves triangle
2617           * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
2618           * the geometry shader sees triangles:
2619           *
2620           * (1, 2, 3), (2, 4, 3), (3, 4, 5)
2621           *
2622           * (Clearing the bit is even worse, because it fails to preserve
2623           * orientation).
2624           *
2625           * Triangle strips with adjacency always ordered in a way that preserves
2626           * triangle orientation but does not strictly follow the OpenGL spec,
2627           * regardless of the setting of this bit.
2628           *
2629           * On Haswell, both triangle strips and triangle strips with adjacency
2630           * are always ordered in a way that preserves triangle orientation.
2631           * Setting this bit causes the ordering to strictly follow the OpenGL
2632           * spec.
2633           *
2634           * So in either case we want to set the bit.  Unfortunately on Ivy
2635           * Bridge this will get the order close to correct but not perfect.
2636           */
2637          gs.ReorderMode = TRAILING;
2638          gs.MaximumNumberofThreads =
2639             GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
2640                          : (devinfo->max_gs_threads - 1);
2641
2642 #if GEN_GEN < 7
2643          gs.SOStatisticsEnable = true;
2644          if (gs_prog->info.has_transform_feedback_varyings)
2645             gs.SVBIPayloadEnable = true;
2646
2647          /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
2648           * was previously done for gen6.
2649           *
2650           * TODO: test with both disabled to see if the HW is behaving
2651           * as expected, like in gen7.
2652           */
2653          gs.SingleProgramFlow = true;
2654          gs.VectorMaskEnable = true;
2655 #endif
2656
2657 #if GEN_GEN >= 8
2658          gs.ExpectedVertexCount = gs_prog_data->vertices_in;
2659
2660          if (gs_prog_data->static_vertex_count != -1) {
2661             gs.StaticOutput = true;
2662             gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
2663          }
2664          gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
2665
2666          gs.UserClipDistanceCullTestEnableBitmask =
2667             vue_prog_data->cull_distance_mask;
2668
2669          const int urb_entry_write_offset = 1;
2670          const uint32_t urb_entry_output_length =
2671             DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
2672             urb_entry_write_offset;
2673
2674          gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
2675          gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
2676 #endif
2677       }
2678 #endif
2679
2680 #if GEN_GEN <= 6
2681       if (!active && brw->ff_gs.prog_active) {
2682          /* In gen6, transform feedback for the VS stage is done with an
2683           * ad-hoc GS program. This function provides the needed 3DSTATE_GS
2684           * for this.
2685           */
2686          gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
2687          gs.SingleProgramFlow = true;
2688          gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
2689          gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;
2690
2691 #if GEN_GEN <= 5
2692          gs.GRFRegisterCount =
2693             DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
2694          /* BRW_NEW_URB_FENCE */
2695          gs.NumberofURBEntries = brw->urb.nr_gs_entries;
2696          gs.URBEntryAllocationSize = brw->urb.vsize - 1;
2697          gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
2698          gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
2699 #else
2700          gs.Enable = true;
2701          gs.VectorMaskEnable = true;
2702          gs.SVBIPayloadEnable = true;
2703          gs.SVBIPostIncrementEnable = true;
2704          gs.SVBIPostIncrementValue =
2705             brw->ff_gs.prog_data->svbi_postincrement_value;
2706          gs.SOStatisticsEnable = true;
2707          gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
2708 #endif
2709       }
2710 #endif
2711       if (!active && !brw->ff_gs.prog_active) {
2712 #if GEN_GEN < 8
2713          gs.DispatchGRFStartRegisterForURBData = 1;
2714 #if GEN_GEN >= 7
2715          gs.IncludeVertexHandles = true;
2716 #endif
2717 #endif
2718       }
2719
2720 #if GEN_GEN >= 6
2721       gs.StatisticsEnable = true;
2722 #endif
2723 #if GEN_GEN == 5 || GEN_GEN == 6
2724       gs.RenderingEnabled = true;
2725 #endif
2726 #if GEN_GEN <= 5
2727       gs.MaximumVPIndex = brw->clip.viewport_count - 1;
2728 #endif
2729    }
2730
2731 #if GEN_GEN == 6
2732    brw->gs.enabled = active;
2733 #endif
2734 }
2735
2736 static const struct brw_tracked_state genX(gs_state) = {
2737    .dirty = {
2738       .mesa  = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
2739       .brw   = BRW_NEW_BATCH |
2740                BRW_NEW_BLORP |
2741                (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2742                                BRW_NEW_PROGRAM_CACHE |
2743                                BRW_NEW_URB_FENCE |
2744                                BRW_NEW_VIEWPORT_COUNT
2745                              : 0) |
2746                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
2747                                BRW_NEW_GEOMETRY_PROGRAM |
2748                                BRW_NEW_GS_PROG_DATA
2749                              : 0) |
2750                (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
2751    },
2752    .emit = genX(upload_gs_state),
2753 };
2754
2755 /* ---------------------------------------------------------------------- */
2756
2757 UNUSED static GLenum
2758 fix_dual_blend_alpha_to_one(GLenum function)
2759 {
2760    switch (function) {
2761    case GL_SRC1_ALPHA:
2762       return GL_ONE;
2763
2764    case GL_ONE_MINUS_SRC1_ALPHA:
2765       return GL_ZERO;
2766    }
2767
2768    return function;
2769 }
2770
/* Local shorthands for brw_translate_blend_factor() and
 * brw_translate_blend_equation().
 */
#define blend_factor(gl_factor) brw_translate_blend_factor(gl_factor)
#define blend_eqn(gl_equation) brw_translate_blend_equation(gl_equation)
2773
2774 /**
2775  * Modify blend function to force destination alpha to 1.0
2776  *
2777  * If \c function specifies a blend function that uses destination alpha,
2778  * replace it with a function that hard-wires destination alpha to 1.0.  This
2779  * is used when rendering to xRGB targets.
2780  */
2781 static GLenum
2782 brw_fix_xRGB_alpha(GLenum function)
2783 {
2784    switch (function) {
2785    case GL_DST_ALPHA:
2786       return GL_ONE;
2787
2788    case GL_ONE_MINUS_DST_ALPHA:
2789    case GL_SRC_ALPHA_SATURATE:
2790       return GL_ZERO;
2791    }
2792
2793    return function;
2794 }
2795
2796 #if GEN_GEN >= 6
2797 typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
2798 #else
2799 typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
2800 #endif
2801
2802 UNUSED static bool
2803 set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
2804                      bool alpha_to_one)
2805 {
2806    struct gl_context *ctx = &brw->ctx;
2807
2808    /* _NEW_BUFFERS */
2809    const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
2810
2811    bool independent_alpha_blend = false;
2812
2813    /* Used for implementing the following bit of GL_EXT_texture_integer:
2814     * "Per-fragment operations that require floating-point color
2815     *  components, including multisample alpha operations, alpha test,
2816     *  blending, and dithering, have no effect when the corresponding
2817     *  colors are written to an integer color buffer."
2818     */
2819    const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
2820
2821    const unsigned blend_enabled = GEN_GEN >= 6 ?
2822       ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
2823
2824    /* _NEW_COLOR */
2825    if (ctx->Color.ColorLogicOpEnabled) {
2826       GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
2827          : GL_UNSIGNED_NORMALIZED;
2828       WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
2829                 rb_type != GL_UNSIGNED_NORMALIZED &&
2830                 rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
2831                 "renderbuffer\n",
2832                 _mesa_enum_to_string(ctx->Color.LogicOp),
2833                 _mesa_enum_to_string(rb_type));
2834       if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
2835          entry->LogicOpEnable = true;
2836          entry->LogicOpFunction = ctx->Color._LogicOp;
2837       }
2838    } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
2839               && (GEN_GEN <= 5 || !integer)) {
2840       GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
2841       GLenum eqA = ctx->Color.Blend[i].EquationA;
2842       GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
2843       GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
2844       GLenum srcA = ctx->Color.Blend[i].SrcA;
2845       GLenum dstA = ctx->Color.Blend[i].DstA;
2846
2847       if (eqRGB == GL_MIN || eqRGB == GL_MAX)
2848          srcRGB = dstRGB = GL_ONE;
2849
2850       if (eqA == GL_MIN || eqA == GL_MAX)
2851          srcA = dstA = GL_ONE;
2852
2853       /* Due to hardware limitations, the destination may have information
2854        * in an alpha channel even when the format specifies no alpha
2855        * channel. In order to avoid getting any incorrect blending due to
2856        * that alpha channel, coerce the blend factors to values that will
2857        * not read the alpha channel, but will instead use the correct
2858        * implicit value for alpha.
2859        */
2860       if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
2861                                                GL_TEXTURE_ALPHA_TYPE)) {
2862          srcRGB = brw_fix_xRGB_alpha(srcRGB);
2863          srcA = brw_fix_xRGB_alpha(srcA);
2864          dstRGB = brw_fix_xRGB_alpha(dstRGB);
2865          dstA = brw_fix_xRGB_alpha(dstA);
2866       }
2867
2868       /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
2869        * "If Dual Source Blending is enabled, this bit must be disabled."
2870        *
2871        * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
2872        * and leave it enabled anyway.
2873        */
2874       if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
2875          srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
2876          srcA = fix_dual_blend_alpha_to_one(srcA);
2877          dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
2878          dstA = fix_dual_blend_alpha_to_one(dstA);
2879       }
2880
2881       entry->ColorBufferBlendEnable = true;
2882       entry->DestinationBlendFactor = blend_factor(dstRGB);
2883       entry->SourceBlendFactor = blend_factor(srcRGB);
2884       entry->DestinationAlphaBlendFactor = blend_factor(dstA);
2885       entry->SourceAlphaBlendFactor = blend_factor(srcA);
2886       entry->ColorBlendFunction = blend_eqn(eqRGB);
2887       entry->AlphaBlendFunction = blend_eqn(eqA);
2888
2889       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
2890          independent_alpha_blend = true;
2891    }
2892
2893    return independent_alpha_blend;
2894 }
2895
2896 #if GEN_GEN >= 6
2897 static void
2898 genX(upload_blend_state)(struct brw_context *brw)
2899 {
2900    struct gl_context *ctx = &brw->ctx;
2901    int size;
2902
2903    /* We need at least one BLEND_STATE written, because we might do
2904     * thread dispatch even if _NumColorDrawBuffers is 0 (for example
2905     * for computed depth or alpha test), which will do an FB write
2906     * with render target 0, which will reference BLEND_STATE[0] for
2907     * alpha test enable.
2908     */
2909    int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
2910    if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
2911       nr_draw_buffers = 1;
2912
2913    size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
2914 #if GEN_GEN >= 8
2915    size += GENX(BLEND_STATE_length) * 4;
2916 #endif
2917
2918    uint32_t *blend_map;
2919    blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
2920
2921 #if GEN_GEN >= 8
2922    struct GENX(BLEND_STATE) blend = { 0 };
2923    {
2924 #else
2925    for (int i = 0; i < nr_draw_buffers; i++) {
2926       struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
2927 #define blend entry
2928 #endif
2929       /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
2930        * "If drawbuffer zero is not NONE and the buffer it references has an
2931        * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
2932        * operations are skipped."
2933        */
2934       if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
2935          /* _NEW_MULTISAMPLE */
2936          if (_mesa_is_multisample_enabled(ctx)) {
2937             if (ctx->Multisample.SampleAlphaToCoverage) {
2938                blend.AlphaToCoverageEnable = true;
2939                blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
2940             }
2941             if (ctx->Multisample.SampleAlphaToOne)
2942                blend.AlphaToOneEnable = true;
2943          }
2944
2945          /* _NEW_COLOR */
2946          if (ctx->Color.AlphaEnabled) {
2947             blend.AlphaTestEnable = true;
2948             blend.AlphaTestFunction =
2949                intel_translate_compare_func(ctx->Color.AlphaFunc);
2950          }
2951
2952          if (ctx->Color.DitherFlag) {
2953             blend.ColorDitherEnable = true;
2954          }
2955       }
2956
2957 #if GEN_GEN >= 8
2958       for (int i = 0; i < nr_draw_buffers; i++) {
2959          struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
2960 #else
2961       {
2962 #endif
2963          blend.IndependentAlphaBlendEnable =
2964             set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
2965             blend.IndependentAlphaBlendEnable;
2966
2967          /* See section 8.1.6 "Pre-Blend Color Clamping" of the
2968           * SandyBridge PRM Volume 2 Part 1 for HW requirements.
2969           *
2970           * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
2971           * clamping in the fragment shader.  For its clamping of
2972           * blending, the spec says:
2973           *
2974           *     "RESOLVED: For fixed-point color buffers, the inputs and
2975           *      the result of the blending equation are clamped.  For
2976           *      floating-point color buffers, no clamping occurs."
2977           *
2978           * So, generally, we want clamping to the render target's range.
2979           * And, good news, the hardware tables for both pre- and
2980           * post-blend color clamping are either ignored, or any are
2981           * allowed, or clamping is required but RT range clamping is a
2982           * valid option.
2983           */
2984          entry.PreBlendColorClampEnable = true;
2985          entry.PostBlendColorClampEnable = true;
2986          entry.ColorClampRange = COLORCLAMP_RTFORMAT;
2987
2988          entry.WriteDisableRed   = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
2989          entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
2990          entry.WriteDisableBlue  = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
2991          entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);
2992
2993 #if GEN_GEN >= 8
2994          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
2995 #else
2996          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
2997 #endif
2998       }
2999    }
3000
3001 #if GEN_GEN >= 8
3002    GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
3003 #endif
3004
3005 #if GEN_GEN < 7
3006    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3007       ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
3008       ptr.BLEND_STATEChange = true;
3009    }
3010 #else
3011    brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
3012       ptr.BlendStatePointer = brw->cc.blend_state_offset;
3013 #if GEN_GEN >= 8
3014       ptr.BlendStatePointerValid = true;
3015 #endif
3016    }
3017 #endif
3018 }
3019
3020 static const struct brw_tracked_state genX(blend_state) = {
3021    .dirty = {
3022       .mesa = _NEW_BUFFERS |
3023               _NEW_COLOR |
3024               _NEW_MULTISAMPLE,
3025       .brw = BRW_NEW_BATCH |
3026              BRW_NEW_BLORP |
3027              BRW_NEW_STATE_BASE_ADDRESS,
3028    },
3029    .emit = genX(upload_blend_state),
3030 };
3031 #endif
3032
3033 /* ---------------------------------------------------------------------- */
3034
3035 #if GEN_GEN >= 7
3036 UNUSED static const uint32_t push_constant_opcodes[] = {
3037    [MESA_SHADER_VERTEX]                      = 21,
3038    [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3039    [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3040    [MESA_SHADER_GEOMETRY]                    = 22,
3041    [MESA_SHADER_FRAGMENT]                    = 23,
3042    [MESA_SHADER_COMPUTE]                     = 0,
3043 };
3044
3045 static void
3046 genX(upload_push_constant_packets)(struct brw_context *brw)
3047 {
3048    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3049    struct gl_context *ctx = &brw->ctx;
3050
3051    UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
3052
3053    struct brw_stage_state *stage_states[] = {
3054       &brw->vs.base,
3055       &brw->tcs.base,
3056       &brw->tes.base,
3057       &brw->gs.base,
3058       &brw->wm.base,
3059    };
3060
3061    if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
3062        stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
3063       gen7_emit_vs_workaround_flush(brw);
3064
3065    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
3066       struct brw_stage_state *stage_state = stage_states[stage];
3067       UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];
3068
3069       if (!stage_state->push_constants_dirty)
3070          continue;
3071
3072       brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
3073          pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
3074          if (stage_state->prog_data) {
3075 #if GEN_GEN >= 8 || GEN_IS_HASWELL
3076             /* The Skylake PRM contains the following restriction:
3077              *
3078              *    "The driver must ensure The following case does not occur
3079              *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3080              *     buffer 3 read length equal to zero committed followed by a
3081              *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3082              *     zero committed."
3083              *
3084              * To avoid this, we program the buffers in the highest slots.
3085              * This way, slot 0 is only used if slot 3 is also used.
3086              */
3087             int n = 3;
3088
3089             for (int i = 3; i >= 0; i--) {
3090                const struct brw_ubo_range *range =
3091                   &stage_state->prog_data->ubo_ranges[i];
3092
3093                if (range->length == 0)
3094                   continue;
3095
3096                const struct gl_uniform_block *block =
3097                   prog->sh.UniformBlocks[range->block];
3098                const struct gl_buffer_binding *binding =
3099                   &ctx->UniformBufferBindings[block->Binding];
3100
3101                if (binding->BufferObject == ctx->Shared->NullBufferObj) {
3102                   static unsigned msg_id = 0;
3103                   _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
3104                                  MESA_DEBUG_TYPE_UNDEFINED,
3105                                  MESA_DEBUG_SEVERITY_HIGH,
3106                                  "UBO %d unbound, %s shader uniform data "
3107                                  "will be undefined.",
3108                                  range->block,
3109                                  _mesa_shader_stage_to_string(stage));
3110                   continue;
3111                }
3112
3113                assert(binding->Offset % 32 == 0);
3114
3115                struct brw_bo *bo = intel_bufferobj_buffer(brw,
3116                   intel_buffer_object(binding->BufferObject),
3117                   binding->Offset, range->length * 32, false);
3118
3119                pkt.ConstantBody.ReadLength[n] = range->length;
3120                pkt.ConstantBody.Buffer[n] =
3121                   ro_bo(bo, range->start * 32 + binding->Offset);
3122                n--;
3123             }
3124
3125             if (stage_state->push_const_size > 0) {
3126                assert(n >= 0);
3127                pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
3128                pkt.ConstantBody.Buffer[n] =
3129                   ro_bo(stage_state->push_const_bo,
3130                         stage_state->push_const_offset);
3131             }
3132 #else
3133             pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
3134             pkt.ConstantBody.Buffer[0].offset =
3135                stage_state->push_const_offset | mocs;
3136 #endif
3137          }
3138       }
3139
3140       stage_state->push_constants_dirty = false;
3141       brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
3142    }
3143 }
3144
3145 const struct brw_tracked_state genX(push_constant_packets) = {
3146    .dirty = {
3147       .mesa  = 0,
3148       .brw   = BRW_NEW_DRAW_CALL,
3149    },
3150    .emit = genX(upload_push_constant_packets),
3151 };
3152 #endif
3153
3154 #if GEN_GEN >= 6
3155 static void
3156 genX(upload_vs_push_constants)(struct brw_context *brw)
3157 {
3158    struct brw_stage_state *stage_state = &brw->vs.base;
3159
3160    /* BRW_NEW_VERTEX_PROGRAM */
3161    const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3162    /* BRW_NEW_VS_PROG_DATA */
3163    const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3164
3165    gen6_upload_push_constants(brw, vp, prog_data, stage_state);
3166 }
3167
3168 static const struct brw_tracked_state genX(vs_push_constants) = {
3169    .dirty = {
3170       .mesa  = _NEW_PROGRAM_CONSTANTS |
3171                _NEW_TRANSFORM,
3172       .brw   = BRW_NEW_BATCH |
3173                BRW_NEW_BLORP |
3174                BRW_NEW_VERTEX_PROGRAM |
3175                BRW_NEW_VS_PROG_DATA,
3176    },
3177    .emit = genX(upload_vs_push_constants),
3178 };
3179
3180 static void
3181 genX(upload_gs_push_constants)(struct brw_context *brw)
3182 {
3183    struct brw_stage_state *stage_state = &brw->gs.base;
3184
3185    /* BRW_NEW_GEOMETRY_PROGRAM */
3186    const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3187
3188    /* BRW_NEW_GS_PROG_DATA */
3189    struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3190
3191    gen6_upload_push_constants(brw, gp, prog_data, stage_state);
3192 }
3193
3194 static const struct brw_tracked_state genX(gs_push_constants) = {
3195    .dirty = {
3196       .mesa  = _NEW_PROGRAM_CONSTANTS |
3197                _NEW_TRANSFORM,
3198       .brw   = BRW_NEW_BATCH |
3199                BRW_NEW_BLORP |
3200                BRW_NEW_GEOMETRY_PROGRAM |
3201                BRW_NEW_GS_PROG_DATA,
3202    },
3203    .emit = genX(upload_gs_push_constants),
3204 };
3205
3206 static void
3207 genX(upload_wm_push_constants)(struct brw_context *brw)
3208 {
3209    struct brw_stage_state *stage_state = &brw->wm.base;
3210    /* BRW_NEW_FRAGMENT_PROGRAM */
3211    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3212    /* BRW_NEW_FS_PROG_DATA */
3213    const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3214
3215    gen6_upload_push_constants(brw, fp, prog_data, stage_state);
3216 }
3217
3218 static const struct brw_tracked_state genX(wm_push_constants) = {
3219    .dirty = {
3220       .mesa  = _NEW_PROGRAM_CONSTANTS,
3221       .brw   = BRW_NEW_BATCH |
3222                BRW_NEW_BLORP |
3223                BRW_NEW_FRAGMENT_PROGRAM |
3224                BRW_NEW_FS_PROG_DATA,
3225    },
3226    .emit = genX(upload_wm_push_constants),
3227 };
3228 #endif
3229
3230 /* ---------------------------------------------------------------------- */
3231
3232 #if GEN_GEN >= 6
3233 static unsigned
3234 genX(determine_sample_mask)(struct brw_context *brw)
3235 {
3236    struct gl_context *ctx = &brw->ctx;
3237    float coverage = 1.0f;
3238    float coverage_invert = false;
3239    unsigned sample_mask = ~0u;
3240
3241    /* BRW_NEW_NUM_SAMPLES */
3242    unsigned num_samples = brw->num_samples;
3243
3244    if (_mesa_is_multisample_enabled(ctx)) {
3245       if (ctx->Multisample.SampleCoverage) {
3246          coverage = ctx->Multisample.SampleCoverageValue;
3247          coverage_invert = ctx->Multisample.SampleCoverageInvert;
3248       }
3249       if (ctx->Multisample.SampleMask) {
3250          sample_mask = ctx->Multisample.SampleMaskValue;
3251       }
3252    }
3253
3254    if (num_samples > 1) {
3255       int coverage_int = (int) (num_samples * coverage + 0.5f);
3256       uint32_t coverage_bits = (1 << coverage_int) - 1;
3257       if (coverage_invert)
3258          coverage_bits ^= (1 << num_samples) - 1;
3259       return coverage_bits & sample_mask;
3260    } else {
3261       return 1;
3262    }
3263 }
3264
3265 static void
3266 genX(emit_3dstate_multisample2)(struct brw_context *brw,
3267                                 unsigned num_samples)
3268 {
3269    unsigned log2_samples = ffs(num_samples) - 1;
3270
3271    brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
3272       multi.PixelLocation = CENTER;
3273       multi.NumberofMultisamples = log2_samples;
3274 #if GEN_GEN == 6
3275       GEN_SAMPLE_POS_4X(multi.Sample);
3276 #elif GEN_GEN == 7
3277       switch (num_samples) {
3278       case 1:
3279          GEN_SAMPLE_POS_1X(multi.Sample);
3280          break;
3281       case 2:
3282          GEN_SAMPLE_POS_2X(multi.Sample);
3283          break;
3284       case 4:
3285          GEN_SAMPLE_POS_4X(multi.Sample);
3286          break;
3287       case 8:
3288          GEN_SAMPLE_POS_8X(multi.Sample);
3289          break;
3290       default:
3291          break;
3292       }
3293 #endif
3294    }
3295 }
3296
3297 static void
3298 genX(upload_multisample_state)(struct brw_context *brw)
3299 {
3300    assert(brw->num_samples > 0 && brw->num_samples <= 16);
3301
3302    genX(emit_3dstate_multisample2)(brw, brw->num_samples);
3303
3304    brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
3305       sm.SampleMask = genX(determine_sample_mask)(brw);
3306    }
3307 }
3308
3309 static const struct brw_tracked_state genX(multisample_state) = {
3310    .dirty = {
3311       .mesa = _NEW_MULTISAMPLE |
3312               (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
3313       .brw = BRW_NEW_BLORP |
3314              BRW_NEW_CONTEXT |
3315              BRW_NEW_NUM_SAMPLES,
3316    },
3317    .emit = genX(upload_multisample_state)
3318 };
3319 #endif
3320
3321 /* ---------------------------------------------------------------------- */
3322
/**
 * Builds COLOR_CALC_STATE through brw_state_emit(), which stores the
 * resulting offset in brw->cc.state_offset.
 *
 * Gen4-5: fills in blend entry 0, depth/stencil bits, the alpha test,
 * dithering, the statistics enable and the CC viewport pointer, then flags
 * BRW_NEW_GEN4_UNIT_STATE.
 *
 * Gen6+: fills in the blend constant color (plus the stencil reference values
 * before Gen9) and emits 3DSTATE_CC_STATE_POINTERS pointing at the new state.
 *
 * The alpha reference value is written on every generation.
 */
3323 static void
3324 genX(upload_color_calc_state)(struct brw_context *brw)
3325 {
3326    struct gl_context *ctx = &brw->ctx;
3327
     /* NOTE(review): the literal 64 is presumably the required alignment of
      * the state structure -- confirm against brw_state_emit().
      */
3328    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
3329 #if GEN_GEN <= 5
3330       cc.IndependentAlphaBlendEnable =
3331          set_blend_entry_bits(brw, &cc, 0, false);
3332       set_depth_stencil_bits(brw, &cc);
3333
        /* The alpha test is only enabled here when at most one color draw
         * buffer is in use.
         */
3334       if (ctx->Color.AlphaEnabled &&
3335           ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
3336          cc.AlphaTestEnable = true;
3337          cc.AlphaTestFunction =
3338             intel_translate_compare_func(ctx->Color.AlphaFunc);
3339       }
3340
3341       cc.ColorDitherEnable = ctx->Color.DitherFlag;
3342
3343       cc.StatisticsEnable = brw->stats_wm;
3344
        /* Points at the CC viewport at brw->cc.vp_offset inside the batch's
         * state buffer.
         */
3345       cc.CCViewportStatePointer =
3346          ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
3347 #else
3348       /* _NEW_COLOR */
3349       cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
3350       cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
3351       cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
3352       cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
3353
3354 #if GEN_GEN < 9
3355       /* _NEW_STENCIL */
3356       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
3357       cc.BackfaceStencilReferenceValue =
3358          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
3359 #endif
3360
3361 #endif
3362
3363       /* _NEW_COLOR */
3364       UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
3365                                ctx->Color.AlphaRef);
3366    }
3367
3368 #if GEN_GEN >= 6
3369    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3370       ptr.ColorCalcStatePointer = brw->cc.state_offset;
        /* The valid bit is set on every generation except Gen7.
         * NOTE(review): presumably the Gen7 packet has no such field --
         * confirm against the genxml definition.
         */
3371 #if GEN_GEN != 7
3372       ptr.ColorCalcStatePointerValid = true;
3373 #endif
3374    }
3375 #else
     /* No pointer packet is emitted before Gen6; BRW_NEW_GEN4_UNIT_STATE is
      * raised instead.
      */
3376    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
3377 #endif
3378 }
3379
3380 static const struct brw_tracked_state genX(color_calc_state) = {
3381    .dirty = {
3382       .mesa = _NEW_COLOR |
3383               _NEW_STENCIL |
3384               (GEN_GEN <= 5 ? _NEW_BUFFERS |
3385                               _NEW_DEPTH
3386                             : 0),
3387       .brw = BRW_NEW_BATCH |
3388              BRW_NEW_BLORP |
3389              (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
3390                              BRW_NEW_STATS_WM
3391                            : BRW_NEW_CC_STATE |
3392                              BRW_NEW_STATE_BASE_ADDRESS),
3393    },
3394    .emit = genX(upload_color_calc_state),
3395 };
3396
3397
3398 /* ---------------------------------------------------------------------- */
3399
3400 #if GEN_GEN >= 7
/**
 * Emits 3DSTATE_SBE and, on Gen8+, 3DSTATE_SBE_SWIZ for the current fragment
 * shader inputs.
 *
 * The attribute overrides, point sprite enables and the URB entry read
 * length/offset are all produced by genX(calculate_attr_overrides).  Before
 * Gen8, `attr_overrides` is a macro alias for sbe.Attribute, so the overrides
 * are written straight into the 3DSTATE_SBE packet.  On Gen8+ they are staged
 * in a local 16-entry array and copied into 3DSTATE_SBE_SWIZ afterwards.
 */
3401 static void
3402 genX(upload_sbe)(struct brw_context *brw)
3403 {
3404    struct gl_context *ctx = &brw->ctx;
3405    /* BRW_NEW_FRAGMENT_PROGRAM */
3406    UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3407    /* BRW_NEW_FS_PROG_DATA */
3408    const struct brw_wm_prog_data *wm_prog_data =
3409       brw_wm_prog_data(brw->wm.base.prog_data);
3410 #if GEN_GEN >= 8
3411    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3412 #else
     /* Alias the packet's own attribute array; only valid inside the
      * brw_batch_emit() body below, where `sbe` is in scope.
      */
3413 #define attr_overrides sbe.Attribute
3414 #endif
     /* Outputs of genX(calculate_attr_overrides). */
3415    uint32_t urb_entry_read_length;
3416    uint32_t urb_entry_read_offset;
3417    uint32_t point_sprite_enables;
3418
3419    brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3420       sbe.AttributeSwizzleEnable = true;
3421       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3422
3423       /* _NEW_BUFFERS */
3424       bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3425
3426       /* _NEW_POINT
3427        *
3428        * Window coordinates in an FBO are inverted, which means point
3429        * sprite origin must be inverted.
3430        */
3431       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
3432          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3433       else
3434          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3435
3436       /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3437        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3438        * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3439        * BRW_NEW_VUE_MAP_GEOM_OUT
3440        */
3441       genX(calculate_attr_overrides)(brw,
3442                                      attr_overrides,
3443                                      &point_sprite_enables,
3444                                      &urb_entry_read_length,
3445                                      &urb_entry_read_offset);
3446
3447       /* Typically, the URB entry read length and offset should be programmed
3448        * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3449        * stage which produces geometry.  However, we don't know the proper
3450        * value until we call calculate_attr_overrides().
3451        *
3452        * To fit with our existing code, we override the inherited values and
3453        * specify it here directly, as we did on previous generations.
3454        */
3455       sbe.VertexURBEntryReadLength = urb_entry_read_length;
3456       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3457       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3458       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3459
3460 #if GEN_GEN >= 8
        /* Make the hardware use the length/offset programmed above. */
3461       sbe.ForceVertexURBEntryReadLength = true;
3462       sbe.ForceVertexURBEntryReadOffset = true;
3463 #endif
3464
3465 #if GEN_GEN >= 9
3466       /* prepare the active component dwords */
        /* Two inputs are marked per unit of URB read length, each as
         * ACTIVE_COMPONENT_XYZW.
         */
3467       const int num_inputs = urb_entry_read_length * 2;
3468       for (int input_index = 0; input_index < num_inputs; input_index++) {
3469          sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
3470       }
3471 #endif
3472    }
3473
3474 #if GEN_GEN >= 8
     /* Copy the staged overrides into the separate swizzle packet. */
3475    brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3476       for (int i = 0; i < 16; i++)
3477          sbes.Attribute[i] = attr_overrides[i];
3478    }
3479 #endif
3480
3481 #undef attr_overrides
3482 }
3483
3484 static const struct brw_tracked_state genX(sbe_state) = {
3485    .dirty = {
3486       .mesa  = _NEW_BUFFERS |
3487                _NEW_LIGHT |
3488                _NEW_POINT |
3489                _NEW_POLYGON |
3490                _NEW_PROGRAM,
3491       .brw   = BRW_NEW_BLORP |
3492                BRW_NEW_CONTEXT |
3493                BRW_NEW_FRAGMENT_PROGRAM |
3494                BRW_NEW_FS_PROG_DATA |
3495                BRW_NEW_GS_PROG_DATA |
3496                BRW_NEW_TES_PROG_DATA |
3497                BRW_NEW_VUE_MAP_GEOM_OUT |
3498                (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
3499                              : 0),
3500    },
3501    .emit = genX(upload_sbe),
3502 };
3503 #endif
3504
3505 /* ---------------------------------------------------------------------- */
3506
3507 #if GEN_GEN >= 7
3508 /**
3509  * Outputs the 3DSTATE_SO_DECL_LIST command.
3510  *
3511  * The data output is a series of 64-bit entries containing a SO_DECL per
3512  * stream.  Declarations are gathered separately for each of the
3513  * MAX_VERTEX_STREAMS vertex streams; entry i packs so_decl[0..3][i].
 *
 * \param brw      driver context; the linked transform feedback info is read
 *                 from the current transform feedback object's program.
 * \param vue_map  maps each output's varying (OutputRegister) to the slot
 *                 used as the SO_DECL register index.
3514  */
3515 static void
3516 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3517                                   const struct brw_vue_map *vue_map)
3518 {
3519    struct gl_context *ctx = &brw->ctx;
3520    /* BRW_NEW_TRANSFORM_FEEDBACK */
3521    struct gl_transform_feedback_object *xfb_obj =
3522       ctx->TransformFeedback.CurrentObject;
3523    const struct gl_transform_feedback_info *linked_xfb_info =
3524       xfb_obj->program->sh.LinkedTransformFeedback;
     /* Per-stream declaration lists, the buffers each stream writes
      * (bitmask), and the number of declarations recorded per stream.
      */
3525    struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3526    int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
     /* Next expected destination offset, indexed by output *buffer* (not by
      * stream).
      * NOTE(review): the array is sized by MAX_VERTEX_STREAMS, so this relies
      * on buffer indices staying below that value -- confirm.
      */
3527    int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3528    int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
     /* Largest per-stream declaration count; determines the packet length. */
3529    int max_decls = 0;
3530    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3531
3532    memset(so_decl, 0, sizeof(so_decl));
3533
3534    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3535     * command feels strange -- each dword pair contains a SO_DECL per stream.
3536     */
3537    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3538       const struct gl_transform_feedback_output *output =
3539          &linked_xfb_info->Outputs[i];
3540       const int buffer = output->OutputBuffer;
3541       const int varying = output->OutputRegister;
3542       const unsigned stream_id = output->StreamId;
3543       assert(stream_id < MAX_VERTEX_STREAMS);
3544
3545       buffer_mask[stream_id] |= 1 << buffer;
3546
3547       assert(vue_map->varying_to_slot[varying] >= 0);
3548
3549       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3550        * array.  Instead, it simply increments DstOffset for the following
3551        * input by the number of components that should be skipped.
3552        *
3553        * Our hardware is unusual in that it requires us to program SO_DECLs
3554        * for fake "hole" components, rather than simply taking the offset
3555        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3556        * program as many size = 4 holes as we can, then a final hole to
3557        * accommodate the final 1, 2, or 3 remaining.
3558        */
3559       int skip_components = output->DstOffset - next_offset[buffer];
3560
3561       while (skip_components > 0) {
3562          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3563             .HoleFlag = 1,
3564             .OutputBufferSlot = output->OutputBuffer,
3565             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3566          };
3567          skip_components -= 4;
3568       }
3569
3570       next_offset[buffer] = output->DstOffset + output->NumComponents;
3571
        /* The real declaration: NumComponents consecutive components
         * starting at ComponentOffset.
         */
3572       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3573          .OutputBufferSlot = output->OutputBuffer,
3574          .RegisterIndex = vue_map->varying_to_slot[varying],
3575          .ComponentMask =
3576             ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3577       };
3578
3579       if (decls[stream_id] > max_decls)
3580          max_decls = decls[stream_id];
3581    }
3582
     /* Packet length: 3 dwords plus two dwords per entry. */
3583    uint32_t *dw;
3584    dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3585                         .StreamtoBufferSelects0 = buffer_mask[0],
3586                         .StreamtoBufferSelects1 = buffer_mask[1],
3587                         .StreamtoBufferSelects2 = buffer_mask[2],
3588                         .StreamtoBufferSelects3 = buffer_mask[3],
3589                         .NumEntries0 = decls[0],
3590                         .NumEntries1 = decls[1],
3591                         .NumEntries2 = decls[2],
3592                         .NumEntries3 = decls[3]);
3593
     /* Streams with fewer than max_decls declarations contribute the
      * zero-filled entries left by the memset above.
      * NOTE(review): entries start at dw + 2; presumably brw_batch_emitn()
      * returns a pointer one dword past the start of the packet -- confirm.
      */
3594    for (int i = 0; i < max_decls; i++) {
3595       GENX(SO_DECL_ENTRY_pack)(
3596          brw, dw + 2 + i * 2,
3597          &(struct GENX(SO_DECL_ENTRY)) {
3598             .Stream0Decl = so_decl[0][i],
3599             .Stream1Decl = so_decl[1][i],
3600             .Stream2Decl = so_decl[2][i],
3601             .Stream3Decl = so_decl[3][i],
3602          });
3603    }
3604 }
3605
/**
 * Emits one 3DSTATE_SO_BUFFER packet for each of the four buffer slots of the
 * current transform feedback object.
 *
 * A slot without a buffer gets a packet carrying only its index.  For bound
 * slots, Gen7 programs the surface pitch and end address.  Gen8+ enables the
 * buffer, programs its size and the location in offset_bo used for the
 * stream offset, and clears brw_obj->zero_offsets once all slots are emitted.
 */
3606 static void
3607 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3608 {
3609    struct gl_context *ctx = &brw->ctx;
3610    /* BRW_NEW_TRANSFORM_FEEDBACK */
3611    struct gl_transform_feedback_object *xfb_obj =
3612       ctx->TransformFeedback.CurrentObject;
3613 #if GEN_GEN < 8
3614    const struct gl_transform_feedback_info *linked_xfb_info =
3615       xfb_obj->program->sh.LinkedTransformFeedback;
3616 #else
3617    struct brw_transform_feedback_object *brw_obj =
3618       (struct brw_transform_feedback_object *) xfb_obj;
3619    uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3620 #endif
3621
3622    /* Set up the up to 4 output buffers.  These are the ranges defined in the
3623     * gl_transform_feedback_object.
3624     */
3625    for (int i = 0; i < 4; i++) {
3626       struct intel_buffer_object *bufferobj =
3627          intel_buffer_object(xfb_obj->Buffers[i]);
3628
        /* Unbound slot: emit a packet with only the buffer index set. */
3629       if (!bufferobj) {
3630          brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3631             sob.SOBufferIndex = i;
3632          }
3633          continue;
3634       }
3635
        /* Byte range [start, end) of the binding; start must be a multiple
         * of 4 and end is rounded up to one.
         */
3636       uint32_t start = xfb_obj->Offset[i];
3637       assert(start % 4 == 0);
3638       uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3639       struct brw_bo *bo =
3640          intel_bufferobj_buffer(brw, bufferobj, start, end - start, true);
3641       assert(end <= bo->size);
3642
3643       brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3644          sob.SOBufferIndex = i;
3645
3646          sob.SurfaceBaseAddress = rw_bo(bo, start);
3647 #if GEN_GEN < 8
           /* Stride is multiplied by 4 here.
            * NOTE(review): presumably a dword-to-byte conversion -- confirm.
            */
3648          sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3649          sob.SurfaceEndAddress = rw_bo(bo, end);
3650 #else
3651          sob.SOBufferEnable = true;
3652          sob.StreamOffsetWriteEnable = true;
3653          sob.StreamOutputBufferOffsetAddressEnable = true;
3654          sob.SOBufferMOCS = mocs_wb;
3655
           /* Size[i] / 4 (at least 1), minus one.
            * NOTE(review): presumably the field is a dword count minus one --
            * confirm against the packet definition.
            */
3656          sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
           /* Each slot owns one uint32_t inside offset_bo. */
3657          sob.StreamOutputBufferOffsetAddress =
3658             rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3659
3660          if (brw_obj->zero_offsets) {
3661             /* Zero out the offset and write that to offset_bo */
3662             sob.StreamOffset = 0;
3663          } else {
3664             /* Use offset_bo as the "Stream Offset." */
3665             sob.StreamOffset = 0xFFFFFFFF;
3666          }
3667 #endif
3668       }
3669    }
3670
3671 #if GEN_GEN >= 8
     /* The zeroing request has been consumed by the packets above. */
3672    brw_obj->zero_offsets = false;
3673 #endif
3674 }
3675
3676 static bool
3677 query_active(struct gl_query_object *q)
3678 {
3679    return q && q->Active;
3680 }
3681
/**
 * Emits 3DSTATE_STREAMOUT.
 *
 * When \p active is false, no field is set by this function.  When it is
 * true, stream output and its statistics are enabled and rendering is
 * disabled for rasterizer discard, unless a primitives-generated query is
 * running.  Buffer enables (Gen7) or buffer pitches (Gen8+) are then
 * programmed, and all four streams are set to read the whole vertex.
 *
 * \param brw      driver context.
 * \param active   whether stream output should be enabled.
 * \param vue_map  VUE map whose num_slots determines the vertex read length.
 */
3682 static void
3683 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3684                                const struct brw_vue_map *vue_map)
3685 {
3686    struct gl_context *ctx = &brw->ctx;
3687    /* BRW_NEW_TRANSFORM_FEEDBACK */
3688    struct gl_transform_feedback_object *xfb_obj =
3689       ctx->TransformFeedback.CurrentObject;
3690
3691    brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3692       if (active) {
           /* Read length is num_slots rounded up to pairs of slots, starting
            * at offset 0.
            */
3693          int urb_entry_read_offset = 0;
3694          int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3695             urb_entry_read_offset;
3696
3697          sos.SOFunctionEnable = true;
3698          sos.SOStatisticsEnable = true;
3699
3700          /* BRW_NEW_RASTERIZER_DISCARD */
3701          if (ctx->RasterDiscard) {
3702             if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3703                sos.RenderingDisable = true;
3704             } else {
3705                perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3706                           "query active relies on the clipper.\n");
3707             }
3708          }
3709
3710          /* _NEW_LIGHT */
3711          if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3712             sos.ReorderMode = TRAILING;
3713
3714 #if GEN_GEN < 8
           /* Gen7: one enable bit per bound buffer slot. */
3715          sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3716          sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3717          sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3718          sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3719 #else
3720          const struct gl_transform_feedback_info *linked_xfb_info =
3721             xfb_obj->program->sh.LinkedTransformFeedback;
3722          /* Set buffer pitches; 0 means unbound. */
3723          if (xfb_obj->Buffers[0])
3724             sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3725          if (xfb_obj->Buffers[1])
3726             sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3727          if (xfb_obj->Buffers[2])
3728             sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3729          if (xfb_obj->Buffers[3])
3730             sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3731 #endif
3732
3733          /* We always read the whole vertex.  This could be reduced at some
3734           * point by reading less and offsetting the register index in the
3735           * SO_DECLs.
3736           */
           /* The length fields are programmed as (length - 1). */
3737          sos.Stream0VertexReadOffset = urb_entry_read_offset;
3738          sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3739          sos.Stream1VertexReadOffset = urb_entry_read_offset;
3740          sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3741          sos.Stream2VertexReadOffset = urb_entry_read_offset;
3742          sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3743          sos.Stream3VertexReadOffset = urb_entry_read_offset;
3744          sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3745       }
3746    }
3747 }
3748
3749 static void
3750 genX(upload_sol)(struct brw_context *brw)
3751 {
3752    struct gl_context *ctx = &brw->ctx;
3753    /* BRW_NEW_TRANSFORM_FEEDBACK */
3754    bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3755
3756    if (active) {
3757       genX(upload_3dstate_so_buffers)(brw);
3758
3759       /* BRW_NEW_VUE_MAP_GEOM_OUT */
3760       genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3761    }
3762
3763    /* Finally, set up the SOL stage.  This command must always follow updates to
3764     * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3765     * MMIO register updates (current performed by the kernel at each batch
3766     * emit).
3767     */
3768    genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3769 }
3770
3771 static const struct brw_tracked_state genX(sol_state) = {
3772    .dirty = {
3773       .mesa  = _NEW_LIGHT,
3774       .brw   = BRW_NEW_BATCH |
3775                BRW_NEW_BLORP |
3776                BRW_NEW_RASTERIZER_DISCARD |
3777                BRW_NEW_VUE_MAP_GEOM_OUT |
3778                BRW_NEW_TRANSFORM_FEEDBACK,
3779    },
3780    .emit = genX(upload_sol),
3781 };
3782 #endif
3783
3784 /* ---------------------------------------------------------------------- */
3785
3786 #if GEN_GEN >= 7
3787 static void
3788 genX(upload_ps)(struct brw_context *brw)
3789 {
3790    UNUSED const struct gl_context *ctx = &brw->ctx;
3791    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
3792
3793    /* BRW_NEW_FS_PROG_DATA */
3794    const struct brw_wm_prog_data *prog_data =
3795       brw_wm_prog_data(brw->wm.base.prog_data);
3796    const struct brw_stage_state *stage_state = &brw->wm.base;
3797
3798 #if GEN_GEN < 8
3799 #endif
3800
3801    brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
3802       /* Initialize the execution mask with VMask.  Otherwise, derivatives are
3803        * incorrect for subspans where some of the pixels are unlit.  We believe
3804        * the bit just didn't take effect in previous generations.
3805        */
3806       ps.VectorMaskEnable = GEN_GEN >= 8;
3807
3808       ps.SamplerCount =
3809          DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
3810
3811       /* BRW_NEW_FS_PROG_DATA */
3812       ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
3813
3814       if (prog_data->base.use_alt_mode)
3815          ps.FloatingPointMode = Alternate;
3816
3817       /* Haswell requires the sample mask to be set in this packet as well as
3818        * in 3DSTATE_SAMPLE_MASK; the values should match.
3819        */
3820
3821       /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
3822 #if GEN_IS_HASWELL
3823       ps.SampleMask = genX(determine_sample_mask(brw));
3824 #endif
3825
3826       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
3827        * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is
3828        * k, it implies 2(k+1) threads. It implicitly scales for different GT
3829        * levels (which have some # of PSDs).
3830        *
3831        * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
3832        */
3833 #if GEN_GEN >= 9
3834       ps.MaximumNumberofThreadsPerPSD = 64 - 1;
3835 #elif GEN_GEN >= 8
3836       ps.MaximumNumberofThreadsPerPSD = 64 - 2;
3837 #else
3838       ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
3839 #endif
3840
3841       if (prog_data->base.nr_params > 0 ||
3842           prog_data->base.ubo_ranges[0].length > 0)
3843          ps.PushConstantEnable = true;
3844
3845 #if GEN_GEN < 8
3846       /* From the IVB PRM, volume 2 part 1, page 287:
3847        * "This bit is inserted in the PS payload header and made available to
3848        * the DataPort (either via the message header or via header bypass) to
3849        * indicate that oMask data (one or two phases) is included in Render
3850        * Target Write messages. If present, the oMask data is used to mask off
3851        * samples."
3852        */
3853       ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
3854
3855       /* The hardware wedges if you have this bit set but don't turn on any
3856        * dual source blend factors.
3857        *
3858        * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
3859        */
3860       ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
3861                                  (ctx->Color.BlendEnabled & 1) &&
3862                                  ctx->Color.Blend[0]._UsesDualSrc;
3863
3864       /* BRW_NEW_FS_PROG_DATA */
3865       ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
3866 #endif
3867
3868       /* From the documentation for this packet:
3869        * "If the PS kernel does not need the Position XY Offsets to
3870        *  compute a Position Value, then this field should be programmed
3871        *  to POSOFFSET_NONE."
3872        *
3873        * "SW Recommendation: If the PS kernel needs the Position Offsets
3874        *  to compute a Position XY value, this field should match Position
3875        *  ZW Interpolation Mode to ensure a consistent position.xyzw
3876        *  computation."
3877        *
3878        * We only require XY sample offsets. So, this recommendation doesn't
3879        * look useful at the moment. We might need this in future.
3880        */
3881       if (prog_data->uses_pos_offset)
3882          ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
3883       else
3884          ps.PositionXYOffsetSelect = POSOFFSET_NONE;
3885
3886       ps._8PixelDispatchEnable = prog_data->dispatch_8;
3887       ps._16PixelDispatchEnable = prog_data->dispatch_16;
3888       ps.DispatchGRFStartRegisterForConstantSetupData0 =
3889          prog_data->base.dispatch_grf_start_reg;
3890       ps.DispatchGRFStartRegisterForConstantSetupData2 =
3891          prog_data->dispatch_grf_start_reg_2;
3892
3893       ps.KernelStartPointer0 = stage_state->prog_offset;
3894       ps.KernelStartPointer2 = stage_state->prog_offset +
3895          prog_data->prog_offset_2;
3896
3897       if (prog_data->base.total_scratch) {
3898          ps.ScratchSpaceBasePointer =
3899             rw_bo(stage_state->scratch_bo,
3900                   ffs(stage_state->per_thread_scratch) - 11);
3901       }
3902    }
3903 }
3904
3905 static const struct brw_tracked_state genX(ps_state) = {
3906    .dirty = {
3907       .mesa  = _NEW_MULTISAMPLE |
3908                (GEN_GEN < 8 ? _NEW_BUFFERS |
3909                               _NEW_COLOR
3910                             : 0),
3911       .brw   = BRW_NEW_BATCH |
3912                BRW_NEW_BLORP |
3913                BRW_NEW_FS_PROG_DATA,
3914    },
3915    .emit = genX(upload_ps),
3916 };
3917 #endif
3918
3919 /* ---------------------------------------------------------------------- */
3920
3921 #if GEN_GEN >= 7
/**
 * Emits 3DSTATE_HS for the tessellation control stage.
 *
 * Without TCS program data, the packet is emitted with no fields set by this
 * function.  Otherwise the common thread dispatch fields, the instance count,
 * vertex handle inclusion and the thread limit are programmed.
 */
3922 static void
3923 genX(upload_hs_state)(struct brw_context *brw)
3924 {
3925    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3926    struct brw_stage_state *stage_state = &brw->tcs.base;
3927    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
     /* NOTE(review): not referenced directly below; presumably consumed by
      * name inside INIT_THREAD_DISPATCH_FIELDS -- confirm against the macro.
      */
3928    const struct brw_vue_prog_data *vue_prog_data =
3929       brw_vue_prog_data(stage_prog_data);
3930
3931    /* BRW_NEW_TCS_PROG_DATA */
3932    struct brw_tcs_prog_data *tcs_prog_data =
3933       brw_tcs_prog_data(stage_prog_data);
3934
3935    if (!tcs_prog_data) {
3936       brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
3937    } else {
3938       brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
3939          INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
3940
           /* Count fields are programmed as (value - 1). */
3941          hs.InstanceCount = tcs_prog_data->instances - 1;
3942          hs.IncludeVertexHandles = true;
3943
3944          hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
3945       }
3946    }
3947 }
3948
3949 static const struct brw_tracked_state genX(hs_state) = {
3950    .dirty = {
3951       .mesa  = 0,
3952       .brw   = BRW_NEW_BATCH |
3953                BRW_NEW_BLORP |
3954                BRW_NEW_TCS_PROG_DATA |
3955                BRW_NEW_TESS_PROGRAMS,
3956    },
3957    .emit = genX(upload_hs_state),
3958 };
3959
/**
 * Emits 3DSTATE_DS for the tessellation evaluation stage.
 *
 * Without TES program data, the packet is emitted with no fields set by this
 * function.  Otherwise the common thread dispatch fields, the thread limit
 * and the W-coordinate enable (triangle domain only) are programmed.  Gen8+
 * additionally programs the dispatch mode and the cull distance mask.
 */
3960 static void
3961 genX(upload_ds_state)(struct brw_context *brw)
3962 {
3963    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3964    const struct brw_stage_state *stage_state = &brw->tes.base;
3965    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
3966
3967    /* BRW_NEW_TES_PROG_DATA */
3968    const struct brw_tes_prog_data *tes_prog_data =
3969       brw_tes_prog_data(stage_prog_data);
3970    const struct brw_vue_prog_data *vue_prog_data =
3971       brw_vue_prog_data(stage_prog_data);
3972
3973    if (!tes_prog_data) {
3974       brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
3975    } else {
        /* From Gen11 on, only SIMD8 dispatch is accepted here. */
3976       assert(GEN_GEN < 11 ||
3977              vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
3978
3979       brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
           /* NOTE(review): presumably reads stage_state, stage_prog_data and
            * vue_prog_data by name -- confirm against the macro.
            */
3980          INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
3981
3982         ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
3983         ds.ComputeWCoordinateEnable =
3984            tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
3985
3986 #if GEN_GEN >= 8
          /* SIMD8 programs are dispatched in single-patch mode; any other
           * dispatch mode leaves the field at whatever value it already has.
           */
3987         if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
3988            ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
3989         ds.UserClipDistanceCullTestEnableBitmask =
3990             vue_prog_data->cull_distance_mask;
3991 #endif
3992       }
3993    }
3994 }
3995
3996 static const struct brw_tracked_state genX(ds_state) = {
3997    .dirty = {
3998       .mesa  = 0,
3999       .brw   = BRW_NEW_BATCH |
4000                BRW_NEW_BLORP |
4001                BRW_NEW_TESS_PROGRAMS |
4002                BRW_NEW_TES_PROG_DATA,
4003    },
4004    .emit = genX(upload_ds_state),
4005 };
4006
4007 /* ---------------------------------------------------------------------- */
4008
4009 static void
4010 upload_te_state(struct brw_context *brw)
4011 {
4012    /* BRW_NEW_TESS_PROGRAMS */
4013    bool active = brw->programs[MESA_SHADER_TESS_EVAL];
4014
4015    /* BRW_NEW_TES_PROG_DATA */
4016    const struct brw_tes_prog_data *tes_prog_data =
4017       brw_tes_prog_data(brw->tes.base.prog_data);
4018
4019    if (active) {
4020       brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4021          te.Partitioning = tes_prog_data->partitioning;
4022          te.OutputTopology = tes_prog_data->output_topology;
4023          te.TEDomain = tes_prog_data->domain;
4024          te.TEEnable = true;
4025          te.MaximumTessellationFactorOdd = 63.0;
4026          te.MaximumTessellationFactorNotOdd = 64.0;
4027       }
4028    } else {
4029       brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4030    }
4031 }
4032
4033 static const struct brw_tracked_state genX(te_state) = {
4034    .dirty = {
4035       .mesa  = 0,
4036       .brw   = BRW_NEW_BLORP |
4037                BRW_NEW_CONTEXT |
4038                BRW_NEW_TES_PROG_DATA |
4039                BRW_NEW_TESS_PROGRAMS,
4040    },
4041    .emit = upload_te_state,
4042 };
4043
4044 /* ---------------------------------------------------------------------- */
4045
4046 static void
4047 genX(upload_tes_push_constants)(struct brw_context *brw)
4048 {
4049    struct brw_stage_state *stage_state = &brw->tes.base;
4050    /* BRW_NEW_TESS_PROGRAMS */
4051    const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4052
4053    /* BRW_NEW_TES_PROG_DATA */
4054    const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4055    gen6_upload_push_constants(brw, tep, prog_data, stage_state);
4056 }
4057
4058 static const struct brw_tracked_state genX(tes_push_constants) = {
4059    .dirty = {
4060       .mesa  = _NEW_PROGRAM_CONSTANTS,
4061       .brw   = BRW_NEW_BATCH |
4062                BRW_NEW_BLORP |
4063                BRW_NEW_TESS_PROGRAMS |
4064                BRW_NEW_TES_PROG_DATA,
4065    },
4066    .emit = genX(upload_tes_push_constants),
4067 };
4068
4069 static void
4070 genX(upload_tcs_push_constants)(struct brw_context *brw)
4071 {
4072    struct brw_stage_state *stage_state = &brw->tcs.base;
4073    /* BRW_NEW_TESS_PROGRAMS */
4074    const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4075
4076    /* BRW_NEW_TCS_PROG_DATA */
4077    const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4078
4079    gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
4080 }
4081
4082 static const struct brw_tracked_state genX(tcs_push_constants) = {
4083    .dirty = {
4084       .mesa  = _NEW_PROGRAM_CONSTANTS,
4085       .brw   = BRW_NEW_BATCH |
4086                BRW_NEW_BLORP |
4087                BRW_NEW_DEFAULT_TESS_LEVELS |
4088                BRW_NEW_TESS_PROGRAMS |
4089                BRW_NEW_TCS_PROG_DATA,
4090    },
4091    .emit = genX(upload_tcs_push_constants),
4092 };
4093
4094 #endif
4095
4096 /* ---------------------------------------------------------------------- */
4097
4098 #if GEN_GEN >= 7
4099 static void
4100 genX(upload_cs_push_constants)(struct brw_context *brw)
4101 {
4102    struct brw_stage_state *stage_state = &brw->cs.base;
4103
4104    /* BRW_NEW_COMPUTE_PROGRAM */
4105    const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4106
4107    if (cp) {
4108       /* BRW_NEW_CS_PROG_DATA */
4109       struct brw_cs_prog_data *cs_prog_data =
4110          brw_cs_prog_data(brw->cs.base.prog_data);
4111
4112       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4113       brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4114    }
4115 }
4116
4117 const struct brw_tracked_state genX(cs_push_constants) = {
4118    .dirty = {
4119       .mesa = _NEW_PROGRAM_CONSTANTS,
4120       .brw = BRW_NEW_BATCH |
4121              BRW_NEW_BLORP |
4122              BRW_NEW_COMPUTE_PROGRAM |
4123              BRW_NEW_CS_PROG_DATA,
4124    },
4125    .emit = genX(upload_cs_push_constants),
4126 };
4127
4128 /**
4129  * Creates a new CS constant buffer reflecting the current CS program's
4130  * constants, if needed by the CS program.
4131  */
4132 static void
4133 genX(upload_cs_pull_constants)(struct brw_context *brw)
4134 {
4135    struct brw_stage_state *stage_state = &brw->cs.base;
4136
4137    /* BRW_NEW_COMPUTE_PROGRAM */
4138    struct brw_program *cp =
4139       (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4140
4141    /* BRW_NEW_CS_PROG_DATA */
4142    const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4143
4144    _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4145    /* _NEW_PROGRAM_CONSTANTS */
4146    brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4147                              stage_state, prog_data);
4148 }
4149
4150 const struct brw_tracked_state genX(cs_pull_constants) = {
4151    .dirty = {
4152       .mesa = _NEW_PROGRAM_CONSTANTS,
4153       .brw = BRW_NEW_BATCH |
4154              BRW_NEW_BLORP |
4155              BRW_NEW_COMPUTE_PROGRAM |
4156              BRW_NEW_CS_PROG_DATA,
4157    },
4158    .emit = genX(upload_cs_pull_constants),
4159 };
4160
/**
 * Emit the compute pipeline state for the current CS program.
 *
 * Does nothing when no CS program data is present.  Otherwise it reserves
 * space for an INTERFACE_DESCRIPTOR_DATA and a binding table through
 * brw_state_batch(), emits a CS-stall PIPE_CONTROL followed by
 * MEDIA_VFE_STATE, emits MEDIA_CURBE_LOAD when there is push-constant data,
 * fills in the binding table and the interface descriptor, and finally
 * emits MEDIA_INTERFACE_DESCRIPTOR_LOAD pointing at the descriptor.
 */
static void
genX(upload_cs_state)(struct brw_context *brw)
{
   if (!brw->cs.base.prog_data)
      return;

   /* Space for the interface descriptor (64-byte aligned); it is packed
    * near the end of this function, after the binding table is allocated.
    */
   uint32_t offset;
   uint32_t *desc = (uint32_t*) brw_state_batch(
      brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
      &offset);

   struct brw_stage_state *stage_state = &brw->cs.base;
   struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* With shader-time debugging enabled, set up the shader-time BO as a raw
    * buffer surface in its binding table slot.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      brw_emit_buffer_surface_state(
         brw, &stage_state->surf_offset[
                 prog_data->binding_table.shader_time_start],
         brw->shader_time.bo, 0, ISL_FORMAT_RAW,
         brw->shader_time.bo->size, 1,
         RELOC_WRITE);
   }

   /* Binding table storage (32-byte aligned); filled from
    * stage_state->surf_offset further down.
    */
   uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
                                    32, &stage_state->bind_bo_offset);

   /* The MEDIA_VFE_STATE documentation for Gen8+ says:
    *
    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
    *  the only bits that are changed are scoreboard related: Scoreboard
    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
    *
    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
    */
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);

   brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
      /* Scratch space is only programmed when the program uses any. */
      if (prog_data->total_scratch) {
         uint32_t per_thread_scratch_value;

         if (GEN_GEN >= 8) {
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
         } else if (GEN_IS_HASWELL) {
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
         } else {
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
         }
         vfe.ScratchSpaceBasePointer = rw_bo(stage_state->scratch_bo, 0);
         vfe.PerThreadScratchSpace = per_thread_scratch_value;
      }

      /* If brw->screen->subslice_total is greater than one, then
       * devinfo->max_cs_threads stores the number of threads per sub-slice;
       * thus we need to multiply that number by the subslice count to get
       * the actual maximum number of threads; the -1 is because the HW
       * has a bias of 1 (would not make sense to say the maximum number
       * of threads is 0).
       */
      const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
      vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
      vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
#if GEN_GEN < 11
      vfe.ResetGatewayTimer =
         Resettingrelativetimerandlatchingtheglobaltimestamp;
#endif
#if GEN_GEN < 9
      vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
#endif
#if GEN_GEN == 7
      vfe.GPGPUMode = 1;
#endif

      /* We are uploading duplicated copies of push constant uniforms for each
       * thread. Although the local id data needs to vary per thread, it won't
       * change for other uniform data. Unfortunately this duplication is
       * required for gen7. As of Haswell, this duplication can be avoided,
       * but this older mechanism with duplicated data continues to work.
       *
       * FINISHME: As of Haswell, we could make use of the
       * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
       * field to only store one copy of uniform data.
       *
       * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
       * which is described in the GPGPU_WALKER command and in the Broadwell
       * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
       * Operations => GPGPU Mode => Indirect Payload Storage.
       *
       * Note: The constant data is built by brw_upload_cs_push_constants
       * (called from genX(upload_cs_push_constants)).
       */
      vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;

      /* Per-thread registers for every thread plus the cross-thread
       * registers, rounded up to a multiple of 2.
       */
      const uint32_t vfe_curbe_allocation =
         ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
               cs_prog_data->push.cross_thread.regs, 2);
      vfe.CURBEAllocationSize = vfe_curbe_allocation;
   }

   /* Only emit MEDIA_CURBE_LOAD when the program has push-constant data;
    * the length is rounded up to a multiple of 64.
    */
   if (cs_prog_data->push.total.size > 0) {
      brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength =
            ALIGN(cs_prog_data->push.total.size, 64);
         curbe.CURBEDataStartAddress = stage_state->push_const_offset;
      }
   }

   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
   memcpy(bind, stage_state->surf_offset,
          prog_data->binding_table.size_bytes);
   const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
      .KernelStartPointer = brw->cs.base.prog_offset,
      .SamplerStatePointer = stage_state->sampler_offset,
      /* Sampler count clamped to [0, 16] and expressed in groups of 4. */
      .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
      .BindingTablePointer = stage_state->bind_bo_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
      .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
                                               prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
#if GEN_GEN >= 8 || GEN_IS_HASWELL
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#endif
   };

   /* Write the descriptor into the space reserved at the top. */
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);

   brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
      load.InterfaceDescriptorTotalLength =
         GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      load.InterfaceDescriptorDataStartAddress = offset;
   }
}
4307
4308 static const struct brw_tracked_state genX(cs_state) = {
4309    .dirty = {
4310       .mesa = _NEW_PROGRAM_CONSTANTS,
4311       .brw = BRW_NEW_BATCH |
4312              BRW_NEW_BLORP |
4313              BRW_NEW_CS_PROG_DATA |
4314              BRW_NEW_SAMPLER_STATE_TABLE |
4315              BRW_NEW_SURFACES,
4316    },
4317    .emit = genX(upload_cs_state)
4318 };
4319
4320 #endif
4321
4322 /* ---------------------------------------------------------------------- */
4323
4324 #if GEN_GEN >= 8
4325 static void
4326 genX(upload_raster)(struct brw_context *brw)
4327 {
4328    const struct gl_context *ctx = &brw->ctx;
4329
4330    /* _NEW_BUFFERS */
4331    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
4332
4333    /* _NEW_POLYGON */
4334    const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4335
4336    /* _NEW_POINT */
4337    const struct gl_point_attrib *point = &ctx->Point;
4338
4339    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4340       if (brw->polygon_front_bit == render_to_fbo)
4341          raster.FrontWinding = CounterClockwise;
4342
4343       if (polygon->CullFlag) {
4344          switch (polygon->CullFaceMode) {
4345          case GL_FRONT:
4346             raster.CullMode = CULLMODE_FRONT;
4347             break;
4348          case GL_BACK:
4349             raster.CullMode = CULLMODE_BACK;
4350             break;
4351          case GL_FRONT_AND_BACK:
4352             raster.CullMode = CULLMODE_BOTH;
4353             break;
4354          default:
4355             unreachable("not reached");
4356          }
4357       } else {
4358          raster.CullMode = CULLMODE_NONE;
4359       }
4360
4361       raster.SmoothPointEnable = point->SmoothFlag;
4362
4363       raster.DXMultisampleRasterizationEnable =
4364          _mesa_is_multisample_enabled(ctx);
4365
4366       raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4367       raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4368       raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4369
4370       switch (polygon->FrontMode) {
4371       case GL_FILL:
4372          raster.FrontFaceFillMode = FILL_MODE_SOLID;
4373          break;
4374       case GL_LINE:
4375          raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4376          break;
4377       case GL_POINT:
4378          raster.FrontFaceFillMode = FILL_MODE_POINT;
4379          break;
4380       default:
4381          unreachable("not reached");
4382       }
4383
4384       switch (polygon->BackMode) {
4385       case GL_FILL:
4386          raster.BackFaceFillMode = FILL_MODE_SOLID;
4387          break;
4388       case GL_LINE:
4389          raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4390          break;
4391       case GL_POINT:
4392          raster.BackFaceFillMode = FILL_MODE_POINT;
4393          break;
4394       default:
4395          unreachable("not reached");
4396       }
4397
4398       /* _NEW_LINE */
4399       raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4400
4401 #if GEN_GEN == 10
4402       /* _NEW_BUFFERS
4403        * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4404        */
4405       const bool multisampled_fbo =
4406          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4407       if (multisampled_fbo)
4408          raster.AntialiasingEnable = false;
4409 #endif
4410
4411       /* _NEW_SCISSOR */
4412       raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4413
4414       /* _NEW_TRANSFORM */
4415       if (!ctx->Transform.DepthClamp) {
4416 #if GEN_GEN >= 9
4417          raster.ViewportZFarClipTestEnable = true;
4418          raster.ViewportZNearClipTestEnable = true;
4419 #else
4420          raster.ViewportZClipTestEnable = true;
4421 #endif
4422       }
4423
4424       /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4425 #if GEN_GEN >= 9
4426       raster.ConservativeRasterizationEnable =
4427          ctx->IntelConservativeRasterization;
4428 #endif
4429
4430       raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4431       raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4432
4433       raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4434    }
4435 }
4436
4437 static const struct brw_tracked_state genX(raster_state) = {
4438    .dirty = {
4439       .mesa  = _NEW_BUFFERS |
4440                _NEW_LINE |
4441                _NEW_MULTISAMPLE |
4442                _NEW_POINT |
4443                _NEW_POLYGON |
4444                _NEW_SCISSOR |
4445                _NEW_TRANSFORM,
4446       .brw   = BRW_NEW_BLORP |
4447                BRW_NEW_CONTEXT |
4448                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4449    },
4450    .emit = genX(upload_raster),
4451 };
4452 #endif
4453
4454 /* ---------------------------------------------------------------------- */
4455
4456 #if GEN_GEN >= 8
4457 static void
4458 genX(upload_ps_extra)(struct brw_context *brw)
4459 {
4460    UNUSED struct gl_context *ctx = &brw->ctx;
4461
4462    const struct brw_wm_prog_data *prog_data =
4463       brw_wm_prog_data(brw->wm.base.prog_data);
4464
4465    brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
4466       psx.PixelShaderValid = true;
4467       psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4468       psx.PixelShaderKillsPixel = prog_data->uses_kill;
4469       psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4470       psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4471       psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4472       psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4473
4474       /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4475       if (prog_data->uses_sample_mask) {
4476 #if GEN_GEN >= 9
4477          if (prog_data->post_depth_coverage)
4478             psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4479          else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4480             psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4481          else
4482             psx.InputCoverageMaskState = ICMS_NORMAL;
4483 #else
4484          psx.PixelShaderUsesInputCoverageMask = true;
4485 #endif
4486       }
4487
4488       psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4489 #if GEN_GEN >= 9
4490       psx.PixelShaderPullsBary = prog_data->pulls_bary;
4491       psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4492 #endif
4493
4494       /* The stricter cross-primitive coherency guarantees that the hardware
4495        * gives us with the "Accesses UAV" bit set for at least one shader stage
4496        * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4497        * are redundant within the current image, atomic counter and SSBO GL
4498        * APIs, which all have very loose ordering and coherency requirements
4499        * and generally rely on the application to insert explicit barriers when
4500        * a shader invocation is expected to see the memory writes performed by
4501        * the invocations of some previous primitive.  Regardless of the value
4502        * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4503        * cause an in most cases useless DC flush when the lowermost stage with
4504        * the bit set finishes execution.
4505        *
4506        * It would be nice to disable it, but in some cases we can't because on
4507        * Gen8+ it also has an influence on rasterization via the PS UAV-only
4508        * signal (which could be set independently from the coherency mechanism
4509        * in the 3DSTATE_WM command on Gen7), and because in some cases it will
4510        * determine whether the hardware skips execution of the fragment shader
4511        * or not via the ThreadDispatchEnable signal.  However if we know that
4512        * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4513        * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4514        * difference so we may just disable it here.
4515        *
4516        * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4517        * take into account KillPixels when no depth or stencil writes are
4518        * enabled.  In order for occlusion queries to work correctly with no
4519        * attachments, we need to force-enable here.
4520        *
4521        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4522        * _NEW_COLOR
4523        */
4524       if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4525           !brw_color_buffer_write_enabled(brw))
4526          psx.PixelShaderHasUAV = true;
4527    }
4528 }
4529
4530 const struct brw_tracked_state genX(ps_extra) = {
4531    .dirty = {
4532       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4533       .brw   = BRW_NEW_BLORP |
4534                BRW_NEW_CONTEXT |
4535                BRW_NEW_FRAGMENT_PROGRAM |
4536                BRW_NEW_FS_PROG_DATA |
4537                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4538    },
4539    .emit = genX(upload_ps_extra),
4540 };
4541 #endif
4542
4543 /* ---------------------------------------------------------------------- */
4544
4545 #if GEN_GEN >= 8
4546 static void
4547 genX(upload_ps_blend)(struct brw_context *brw)
4548 {
4549    struct gl_context *ctx = &brw->ctx;
4550
4551    /* _NEW_BUFFERS */
4552    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4553    const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4554
4555    /* _NEW_COLOR */
4556    struct gl_colorbuffer_attrib *color = &ctx->Color;
4557
4558    brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4559       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4560       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4561
4562       bool alpha_to_one = false;
4563
4564       if (!buffer0_is_integer) {
4565          /* _NEW_MULTISAMPLE */
4566
4567          if (_mesa_is_multisample_enabled(ctx)) {
4568             pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4569             alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4570          }
4571
4572          pb.AlphaTestEnable = color->AlphaEnabled;
4573       }
4574
4575       /* Used for implementing the following bit of GL_EXT_texture_integer:
4576        * "Per-fragment operations that require floating-point color
4577        *  components, including multisample alpha operations, alpha test,
4578        *  blending, and dithering, have no effect when the corresponding
4579        *  colors are written to an integer color buffer."
4580        *
4581        * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4582        * "If drawbuffer zero is not NONE and the buffer it references has an
4583        *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4584        *  operations are skipped."
4585        */
4586       if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4587          GLenum eqRGB = color->Blend[0].EquationRGB;
4588          GLenum eqA = color->Blend[0].EquationA;
4589          GLenum srcRGB = color->Blend[0].SrcRGB;
4590          GLenum dstRGB = color->Blend[0].DstRGB;
4591          GLenum srcA = color->Blend[0].SrcA;
4592          GLenum dstA = color->Blend[0].DstA;
4593
4594          if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4595             srcRGB = dstRGB = GL_ONE;
4596
4597          if (eqA == GL_MIN || eqA == GL_MAX)
4598             srcA = dstA = GL_ONE;
4599
4600          /* Due to hardware limitations, the destination may have information
4601           * in an alpha channel even when the format specifies no alpha
4602           * channel. In order to avoid getting any incorrect blending due to
4603           * that alpha channel, coerce the blend factors to values that will
4604           * not read the alpha channel, but will instead use the correct
4605           * implicit value for alpha.
4606           */
4607          if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4608                                             GL_TEXTURE_ALPHA_TYPE)) {
4609             srcRGB = brw_fix_xRGB_alpha(srcRGB);
4610             srcA = brw_fix_xRGB_alpha(srcA);
4611             dstRGB = brw_fix_xRGB_alpha(dstRGB);
4612             dstA = brw_fix_xRGB_alpha(dstA);
4613          }
4614
4615          /* Alpha to One doesn't work with Dual Color Blending.  Override
4616           * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4617           */
4618          if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
4619             srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4620             srcA = fix_dual_blend_alpha_to_one(srcA);
4621             dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4622             dstA = fix_dual_blend_alpha_to_one(dstA);
4623          }
4624
4625          pb.ColorBufferBlendEnable = true;
4626          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
4627          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
4628          pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
4629          pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
4630
4631          pb.IndependentAlphaBlendEnable =
4632             srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
4633       }
4634    }
4635 }
4636
4637 static const struct brw_tracked_state genX(ps_blend) = {
4638    .dirty = {
4639       .mesa = _NEW_BUFFERS |
4640               _NEW_COLOR |
4641               _NEW_MULTISAMPLE,
4642       .brw = BRW_NEW_BLORP |
4643              BRW_NEW_CONTEXT |
4644              BRW_NEW_FRAGMENT_PROGRAM,
4645    },
4646    .emit = genX(upload_ps_blend)
4647 };
4648 #endif
4649
4650 /* ---------------------------------------------------------------------- */
4651
4652 #if GEN_GEN >= 8
/**
 * Emit 3DSTATE_VF_TOPOLOGY, programming the primitive topology type from
 * brw->primitive.
 */
static void
genX(emit_vf_topology)(struct brw_context *brw)
{
   brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
      vftopo.PrimitiveTopologyType = brw->primitive;
   }
}
4660
4661 static const struct brw_tracked_state genX(vf_topology) = {
4662    .dirty = {
4663       .mesa = 0,
4664       .brw = BRW_NEW_BLORP |
4665              BRW_NEW_PRIMITIVE,
4666    },
4667    .emit = genX(emit_vf_topology),
4668 };
4669 #endif
4670
4671 /* ---------------------------------------------------------------------- */
4672
4673 #if GEN_GEN >= 7
4674 static void
4675 genX(emit_mi_report_perf_count)(struct brw_context *brw,
4676                                 struct brw_bo *bo,
4677                                 uint32_t offset_in_bytes,
4678                                 uint32_t report_id)
4679 {
4680    brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4681       mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
4682       mi_rpc.ReportID = report_id;
4683    }
4684 }
4685 #endif
4686
4687 /* ---------------------------------------------------------------------- */
4688
4689 /**
4690  * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
4691  */
4692 static void
4693 genX(emit_sampler_state_pointers_xs)(struct brw_context *brw,
4694                                      struct brw_stage_state *stage_state)
4695 {
4696 #if GEN_GEN >= 7
4697    static const uint16_t packet_headers[] = {
4698       [MESA_SHADER_VERTEX] = 43,
4699       [MESA_SHADER_TESS_CTRL] = 44,
4700       [MESA_SHADER_TESS_EVAL] = 45,
4701       [MESA_SHADER_GEOMETRY] = 46,
4702       [MESA_SHADER_FRAGMENT] = 47,
4703    };
4704
4705    /* Ivybridge requires a workaround flush before VS packets. */
4706    if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
4707        stage_state->stage == MESA_SHADER_VERTEX) {
4708       gen7_emit_vs_workaround_flush(brw);
4709    }
4710
4711    brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
4712       ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
4713       ptr.PointertoVSSamplerState = stage_state->sampler_offset;
4714    }
4715 #endif
4716 }
4717
4718 UNUSED static bool
4719 has_component(mesa_format format, int i)
4720 {
4721    if (_mesa_is_format_color_format(format))
4722       return _mesa_format_has_color_component(format, i);
4723
4724    /* depth and stencil have only one component */
4725    return i == 0;
4726 }
4727
4728 /**
4729  * Upload SAMPLER_BORDER_COLOR_STATE.
4730  */
4731 static void
4732 genX(upload_default_color)(struct brw_context *brw,
4733                            const struct gl_sampler_object *sampler,
4734                            mesa_format format, GLenum base_format,
4735                            bool is_integer_format, bool is_stencil_sampling,
4736                            uint32_t *sdc_offset)
4737 {
4738    union gl_color_union color;
4739
4740    switch (base_format) {
4741    case GL_DEPTH_COMPONENT:
4742       /* GL specs that border color for depth textures is taken from the
4743        * R channel, while the hardware uses A.  Spam R into all the
4744        * channels for safety.
4745        */
4746       color.ui[0] = sampler->BorderColor.ui[0];
4747       color.ui[1] = sampler->BorderColor.ui[0];
4748       color.ui[2] = sampler->BorderColor.ui[0];
4749       color.ui[3] = sampler->BorderColor.ui[0];
4750       break;
4751    case GL_ALPHA:
4752       color.ui[0] = 0u;
4753       color.ui[1] = 0u;
4754       color.ui[2] = 0u;
4755       color.ui[3] = sampler->BorderColor.ui[3];
4756       break;
4757    case GL_INTENSITY:
4758       color.ui[0] = sampler->BorderColor.ui[0];
4759       color.ui[1] = sampler->BorderColor.ui[0];
4760       color.ui[2] = sampler->BorderColor.ui[0];
4761       color.ui[3] = sampler->BorderColor.ui[0];
4762       break;
4763    case GL_LUMINANCE:
4764       color.ui[0] = sampler->BorderColor.ui[0];
4765       color.ui[1] = sampler->BorderColor.ui[0];
4766       color.ui[2] = sampler->BorderColor.ui[0];
4767       color.ui[3] = float_as_int(1.0);
4768       break;
4769    case GL_LUMINANCE_ALPHA:
4770       color.ui[0] = sampler->BorderColor.ui[0];
4771       color.ui[1] = sampler->BorderColor.ui[0];
4772       color.ui[2] = sampler->BorderColor.ui[0];
4773       color.ui[3] = sampler->BorderColor.ui[3];
4774       break;
4775    default:
4776       color.ui[0] = sampler->BorderColor.ui[0];
4777       color.ui[1] = sampler->BorderColor.ui[1];
4778       color.ui[2] = sampler->BorderColor.ui[2];
4779       color.ui[3] = sampler->BorderColor.ui[3];
4780       break;
4781    }
4782
4783    /* In some cases we use an RGBA surface format for GL RGB textures,
4784     * where we've initialized the A channel to 1.0.  We also have to set
4785     * the border color alpha to 1.0 in that case.
4786     */
4787    if (base_format == GL_RGB)
4788       color.ui[3] = float_as_int(1.0);
4789
4790    int alignment = 32;
4791    if (GEN_GEN >= 8) {
4792       alignment = 64;
4793    } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
4794       alignment = 512;
4795    }
4796
4797    uint32_t *sdc = brw_state_batch(
4798       brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
4799       alignment, sdc_offset);
4800
4801    struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
4802
4803 #define ASSIGN(dst, src) \
4804    do {                  \
4805       dst = src;         \
4806    } while (0)
4807
4808 #define ASSIGNu16(dst, src) \
4809    do {                     \
4810       dst = (uint16_t)src;  \
4811    } while (0)
4812
4813 #define ASSIGNu8(dst, src) \
4814    do {                    \
4815       dst = (uint8_t)src;  \
4816    } while (0)
4817
4818 #define BORDER_COLOR_ATTR(macro, _color_type, src)              \
4819    macro(state.BorderColor ## _color_type ## Red, src[0]);   \
4820    macro(state.BorderColor ## _color_type ## Green, src[1]);   \
4821    macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
4822    macro(state.BorderColor ## _color_type ## Alpha, src[3]);
4823
4824 #if GEN_GEN >= 8
4825    /* On Broadwell, the border color is represented as four 32-bit floats,
4826     * integers, or unsigned values, interpreted according to the surface
4827     * format.  This matches the sampler->BorderColor union exactly; just
4828     * memcpy the values.
4829     */
4830    BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
4831 #elif GEN_IS_HASWELL
4832    if (is_integer_format || is_stencil_sampling) {
4833       bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
4834       const int bits_per_channel =
4835          _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
4836
4837       /* From the Haswell PRM, "Command Reference: Structures", Page 36:
4838        * "If any color channel is missing from the surface format,
4839        *  corresponding border color should be programmed as zero and if
4840        *  alpha channel is missing, corresponding Alpha border color should
4841        *  be programmed as 1."
4842        */
4843       unsigned c[4] = { 0, 0, 0, 1 };
4844       for (int i = 0; i < 4; i++) {
4845          if (has_component(format, i))
4846             c[i] = color.ui[i];
4847       }
4848
4849       switch (bits_per_channel) {
4850       case 8:
4851          /* Copy RGBA in order. */
4852          BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
4853          break;
4854       case 10:
4855          /* R10G10B10A2_UINT is treated like a 16-bit format. */
4856       case 16:
4857          BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
4858          break;
4859       case 32:
4860          if (base_format == GL_RG) {
4861             /* Careful inspection of the tables reveals that for RG32 formats,
4862              * the green channel needs to go where blue normally belongs.
4863              */
4864             state.BorderColor32bitRed = c[0];
4865             state.BorderColor32bitBlue = c[1];
4866             state.BorderColor32bitAlpha = 1;
4867          } else {
4868             /* Copy RGBA in order. */
4869             BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
4870          }
4871          break;
4872       default:
4873          assert(!"Invalid number of bits per channel in integer format.");
4874          break;
4875       }
4876    } else {
4877       BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4878    }
4879 #elif GEN_GEN == 5 || GEN_GEN == 6
4880    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
4881    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
4882    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
4883
4884 #define MESA_FLOAT_TO_HALF(dst, src) \
4885    dst = _mesa_float_to_half(src);
4886
4887    BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
4888
4889 #undef MESA_FLOAT_TO_HALF
4890
4891    state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
4892    state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
4893    state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
4894    state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
4895
4896    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4897 #elif GEN_GEN == 4
4898    BORDER_COLOR_ATTR(ASSIGN, , color.f);
4899 #else
4900    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
4901 #endif
4902
4903 #undef ASSIGN
4904 #undef BORDER_COLOR_ATTR
4905
4906    GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
4907 }
4908
4909 static uint32_t
4910 translate_wrap_mode(struct brw_context *brw, GLenum wrap, bool using_nearest)
4911 {
4912    switch (wrap) {
4913    case GL_REPEAT:
4914       return TCM_WRAP;
4915    case GL_CLAMP:
4916 #if GEN_GEN >= 8
4917       /* GL_CLAMP is the weird mode where coordinates are clamped to
4918        * [0.0, 1.0], so linear filtering of coordinates outside of
4919        * [0.0, 1.0] give you half edge texel value and half border
4920        * color.
4921        *
4922        * Gen8+ supports this natively.
4923        */
4924       return TCM_HALF_BORDER;
4925 #else
4926       /* On Gen4-7.5, we clamp the coordinates in the fragment shader
4927        * and set clamp_border here, which gets the result desired.
4928        * We just use clamp(_to_edge) for nearest, because for nearest
4929        * clamping to 1.0 gives border color instead of the desired
4930        * edge texels.
4931        */
4932       if (using_nearest)
4933          return TCM_CLAMP;
4934       else
4935          return TCM_CLAMP_BORDER;
4936 #endif
4937    case GL_CLAMP_TO_EDGE:
4938       return TCM_CLAMP;
4939    case GL_CLAMP_TO_BORDER:
4940       return TCM_CLAMP_BORDER;
4941    case GL_MIRRORED_REPEAT:
4942       return TCM_MIRROR;
4943    case GL_MIRROR_CLAMP_TO_EDGE:
4944       return TCM_MIRROR_ONCE;
4945    default:
4946       return TCM_WRAP;
4947    }
4948 }
4949
4950 /**
4951  * Return true if the given wrap mode requires the border color to exist.
4952  */
4953 static bool
4954 wrap_mode_needs_border_color(unsigned wrap_mode)
4955 {
4956 #if GEN_GEN >= 8
4957    return wrap_mode == TCM_CLAMP_BORDER ||
4958           wrap_mode == TCM_HALF_BORDER;
4959 #else
4960    return wrap_mode == TCM_CLAMP_BORDER;
4961 #endif
4962 }
4963
4964 /**
4965  * Sets the sampler state for a single unit based off of the sampler key
4966  * entry.
4967  */
4968 static void
4969 genX(update_sampler_state)(struct brw_context *brw,
4970                            GLenum target, bool tex_cube_map_seamless,
4971                            GLfloat tex_unit_lod_bias,
4972                            mesa_format format, GLenum base_format,
4973                            const struct gl_texture_object *texObj,
4974                            const struct gl_sampler_object *sampler,
4975                            uint32_t *sampler_state,
4976                            uint32_t batch_offset_for_sampler_state)
4977 {
4978    struct GENX(SAMPLER_STATE) samp_st = { 0 };
4979
4980    /* Select min and mip filters. */
4981    switch (sampler->MinFilter) {
4982    case GL_NEAREST:
4983       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4984       samp_st.MipModeFilter = MIPFILTER_NONE;
4985       break;
4986    case GL_LINEAR:
4987       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4988       samp_st.MipModeFilter = MIPFILTER_NONE;
4989       break;
4990    case GL_NEAREST_MIPMAP_NEAREST:
4991       samp_st.MinModeFilter = MAPFILTER_NEAREST;
4992       samp_st.MipModeFilter = MIPFILTER_NEAREST;
4993       break;
4994    case GL_LINEAR_MIPMAP_NEAREST:
4995       samp_st.MinModeFilter = MAPFILTER_LINEAR;
4996       samp_st.MipModeFilter = MIPFILTER_NEAREST;
4997       break;
4998    case GL_NEAREST_MIPMAP_LINEAR:
4999       samp_st.MinModeFilter = MAPFILTER_NEAREST;
5000       samp_st.MipModeFilter = MIPFILTER_LINEAR;
5001       break;
5002    case GL_LINEAR_MIPMAP_LINEAR:
5003       samp_st.MinModeFilter = MAPFILTER_LINEAR;
5004       samp_st.MipModeFilter = MIPFILTER_LINEAR;
5005       break;
5006    default:
5007       unreachable("not reached");
5008    }
5009
5010    /* Select mag filter. */
5011    samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
5012       MAPFILTER_LINEAR : MAPFILTER_NEAREST;
5013
5014    /* Enable anisotropic filtering if desired. */
5015    samp_st.MaximumAnisotropy = RATIO21;
5016
5017    if (sampler->MaxAnisotropy > 1.0f) {
5018       if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
5019          samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
5020       if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
5021          samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
5022
5023       if (sampler->MaxAnisotropy > 2.0f) {
5024          samp_st.MaximumAnisotropy =
5025             MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
5026       }
5027    }
5028
5029    /* Set address rounding bits if not using nearest filtering. */
5030    if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
5031       samp_st.UAddressMinFilterRoundingEnable = true;
5032       samp_st.VAddressMinFilterRoundingEnable = true;
5033       samp_st.RAddressMinFilterRoundingEnable = true;
5034    }
5035
5036    if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
5037       samp_st.UAddressMagFilterRoundingEnable = true;
5038       samp_st.VAddressMagFilterRoundingEnable = true;
5039       samp_st.RAddressMagFilterRoundingEnable = true;
5040    }
5041
5042    bool either_nearest =
5043       sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
5044    unsigned wrap_s = translate_wrap_mode(brw, sampler->WrapS, either_nearest);
5045    unsigned wrap_t = translate_wrap_mode(brw, sampler->WrapT, either_nearest);
5046    unsigned wrap_r = translate_wrap_mode(brw, sampler->WrapR, either_nearest);
5047
5048    if (target == GL_TEXTURE_CUBE_MAP ||
5049        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5050       /* Cube maps must use the same wrap mode for all three coordinate
5051        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
5052        *
5053        * Ivybridge and Baytrail seem to have problems with CUBE mode and
5054        * integer formats.  Fall back to CLAMP for now.
5055        */
5056       if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
5057           !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
5058          wrap_s = TCM_CUBE;
5059          wrap_t = TCM_CUBE;
5060          wrap_r = TCM_CUBE;
5061       } else {
5062          wrap_s = TCM_CLAMP;
5063          wrap_t = TCM_CLAMP;
5064          wrap_r = TCM_CLAMP;
5065       }
5066    } else if (target == GL_TEXTURE_1D) {
5067       /* There's a bug in 1D texture sampling - it actually pays
5068        * attention to the wrap_t value, though it should not.
5069        * Override the wrap_t value here to GL_REPEAT to keep
5070        * any nonexistent border pixels from floating in.
5071        */
5072       wrap_t = TCM_WRAP;
5073    }
5074
5075    samp_st.TCXAddressControlMode = wrap_s;
5076    samp_st.TCYAddressControlMode = wrap_t;
5077    samp_st.TCZAddressControlMode = wrap_r;
5078
5079    samp_st.ShadowFunction =
5080       sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5081       intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
5082
5083 #if GEN_GEN >= 7
5084    /* Set shadow function. */
5085    samp_st.AnisotropicAlgorithm =
5086       samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5087       EWAApproximation : LEGACY;
5088 #endif
5089
5090 #if GEN_GEN >= 6
5091    samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5092 #endif
5093
5094    const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
5095    samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
5096    samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
5097    samp_st.TextureLODBias =
5098       CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
5099
5100 #if GEN_GEN == 6
5101    samp_st.BaseMipLevel =
5102       CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
5103    samp_st.MinandMagStateNotEqual =
5104       samp_st.MinModeFilter != samp_st.MagModeFilter;
5105 #endif
5106
5107    /* Upload the border color if necessary.  If not, just point it at
5108     * offset 0 (the start of the batch) - the color should be ignored,
5109     * but that address won't fault in case something reads it anyway.
5110     */
5111    uint32_t border_color_offset = 0;
5112    if (wrap_mode_needs_border_color(wrap_s) ||
5113        wrap_mode_needs_border_color(wrap_t) ||
5114        wrap_mode_needs_border_color(wrap_r)) {
5115       genX(upload_default_color)(brw, sampler, format, base_format,
5116                                  texObj->_IsIntegerFormat,
5117                                  texObj->StencilSampling,
5118                                  &border_color_offset);
5119    }
5120 #if GEN_GEN < 6
5121       samp_st.BorderColorPointer =
5122          ro_bo(brw->batch.state.bo, border_color_offset);
5123 #else
5124       samp_st.BorderColorPointer = border_color_offset;
5125 #endif
5126
5127 #if GEN_GEN >= 8
5128    samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5129 #else
5130    samp_st.LODPreClampEnable = true;
5131 #endif
5132
5133    GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5134 }
5135
5136 static void
5137 update_sampler_state(struct brw_context *brw,
5138                      int unit,
5139                      uint32_t *sampler_state,
5140                      uint32_t batch_offset_for_sampler_state)
5141 {
5142    struct gl_context *ctx = &brw->ctx;
5143    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5144    const struct gl_texture_object *texObj = texUnit->_Current;
5145    const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5146
5147    /* These don't use samplers at all. */
5148    if (texObj->Target == GL_TEXTURE_BUFFER)
5149       return;
5150
5151    struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
5152    genX(update_sampler_state)(brw, texObj->Target,
5153                               ctx->Texture.CubeMapSeamless,
5154                               texUnit->LodBias,
5155                               firstImage->TexFormat, firstImage->_BaseFormat,
5156                               texObj, sampler,
5157                               sampler_state, batch_offset_for_sampler_state);
5158 }
5159
5160 static void
5161 genX(upload_sampler_state_table)(struct brw_context *brw,
5162                                  struct gl_program *prog,
5163                                  struct brw_stage_state *stage_state)
5164 {
5165    struct gl_context *ctx = &brw->ctx;
5166    uint32_t sampler_count = stage_state->sampler_count;
5167
5168    GLbitfield SamplersUsed = prog->SamplersUsed;
5169
5170    if (sampler_count == 0)
5171       return;
5172
5173    /* SAMPLER_STATE is 4 DWords on all platforms. */
5174    const int dwords = GENX(SAMPLER_STATE_length);
5175    const int size_in_bytes = dwords * sizeof(uint32_t);
5176
5177    uint32_t *sampler_state = brw_state_batch(brw,
5178                                              sampler_count * size_in_bytes,
5179                                              32, &stage_state->sampler_offset);
5180    /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
5181
5182    uint32_t batch_offset_for_sampler_state = stage_state->sampler_offset;
5183
5184    for (unsigned s = 0; s < sampler_count; s++) {
5185       if (SamplersUsed & (1 << s)) {
5186          const unsigned unit = prog->SamplerUnits[s];
5187          if (ctx->Texture.Unit[unit]._Current) {
5188             update_sampler_state(brw, unit, sampler_state,
5189                                  batch_offset_for_sampler_state);
5190          }
5191       }
5192
5193       sampler_state += dwords;
5194       batch_offset_for_sampler_state += size_in_bytes;
5195    }
5196
5197    if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
5198       /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
5199       genX(emit_sampler_state_pointers_xs)(brw, stage_state);
5200    } else {
5201       /* Flag that the sampler state table pointer has changed; later atoms
5202        * will handle it.
5203        */
5204       brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
5205    }
5206 }
5207
5208 static void
5209 genX(upload_fs_samplers)(struct brw_context *brw)
5210 {
5211    /* BRW_NEW_FRAGMENT_PROGRAM */
5212    struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5213    genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5214 }
5215
5216 static const struct brw_tracked_state genX(fs_samplers) = {
5217    .dirty = {
5218       .mesa = _NEW_TEXTURE,
5219       .brw = BRW_NEW_BATCH |
5220              BRW_NEW_BLORP |
5221              BRW_NEW_FRAGMENT_PROGRAM,
5222    },
5223    .emit = genX(upload_fs_samplers),
5224 };
5225
5226 static void
5227 genX(upload_vs_samplers)(struct brw_context *brw)
5228 {
5229    /* BRW_NEW_VERTEX_PROGRAM */
5230    struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5231    genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5232 }
5233
5234 static const struct brw_tracked_state genX(vs_samplers) = {
5235    .dirty = {
5236       .mesa = _NEW_TEXTURE,
5237       .brw = BRW_NEW_BATCH |
5238              BRW_NEW_BLORP |
5239              BRW_NEW_VERTEX_PROGRAM,
5240    },
5241    .emit = genX(upload_vs_samplers),
5242 };
5243
#if GEN_GEN >= 6
/**
 * Atom emit hook: upload the sampler state table for the geometry stage.
 * Nothing is uploaded when no geometry program is present.
 */
static void
genX(upload_gs_samplers)(struct brw_context *brw)
{
   /* Reads the geometry program: BRW_NEW_GEOMETRY_PROGRAM */
   struct gl_program *geometry_prog = brw->programs[MESA_SHADER_GEOMETRY];

   if (geometry_prog != NULL)
      genX(upload_sampler_state_table)(brw, geometry_prog, &brw->gs.base);
}

/* Tracked-state atom for the geometry stage sampler table. */
static const struct brw_tracked_state genX(gs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH | BRW_NEW_BLORP | BRW_NEW_GEOMETRY_PROGRAM,
   },
   .emit = genX(upload_gs_samplers),
};
#endif
5267
#if GEN_GEN >= 7
/**
 * Atom emit hook: upload the sampler state table for the tessellation
 * control stage.  Nothing is uploaded when no such program is present.
 */
static void
genX(upload_tcs_samplers)(struct brw_context *brw)
{
   /* Reads the tessellation control program: BRW_NEW_TESS_PROGRAMS */
   struct gl_program *tess_ctrl_prog = brw->programs[MESA_SHADER_TESS_CTRL];

   if (tess_ctrl_prog != NULL)
      genX(upload_sampler_state_table)(brw, tess_ctrl_prog, &brw->tcs.base);
}

/* Tracked-state atom for the tessellation control stage sampler table. */
static const struct brw_tracked_state genX(tcs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH | BRW_NEW_BLORP | BRW_NEW_TESS_PROGRAMS,
   },
   .emit = genX(upload_tcs_samplers),
};
#endif
5290
#if GEN_GEN >= 7
/**
 * Atom emit hook: upload the sampler state table for the tessellation
 * evaluation stage.  Nothing is uploaded when no such program is present.
 */
static void
genX(upload_tes_samplers)(struct brw_context *brw)
{
   /* Reads the tessellation evaluation program: BRW_NEW_TESS_PROGRAMS */
   struct gl_program *tess_eval_prog = brw->programs[MESA_SHADER_TESS_EVAL];

   if (tess_eval_prog != NULL)
      genX(upload_sampler_state_table)(brw, tess_eval_prog, &brw->tes.base);
}

/* Tracked-state atom for the tessellation evaluation stage sampler table. */
static const struct brw_tracked_state genX(tes_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH | BRW_NEW_BLORP | BRW_NEW_TESS_PROGRAMS,
   },
   .emit = genX(upload_tes_samplers),
};
#endif
5313
#if GEN_GEN >= 7
/**
 * Atom emit hook: upload the sampler state table for the compute stage.
 * Nothing is uploaded when no compute program is present.
 */
static void
genX(upload_cs_samplers)(struct brw_context *brw)
{
   /* Reads the compute program: BRW_NEW_COMPUTE_PROGRAM */
   struct gl_program *compute_prog = brw->programs[MESA_SHADER_COMPUTE];

   if (compute_prog != NULL)
      genX(upload_sampler_state_table)(brw, compute_prog, &brw->cs.base);
}

/* Tracked-state atom for the compute stage sampler table (not static). */
const struct brw_tracked_state genX(cs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH | BRW_NEW_BLORP | BRW_NEW_COMPUTE_PROGRAM,
   },
   .emit = genX(upload_cs_samplers),
};
#endif
5336
5337 /* ---------------------------------------------------------------------- */
5338
5339 #if GEN_GEN <= 5
5340
5341 static void genX(upload_blend_constant_color)(struct brw_context *brw)
5342 {
5343    struct gl_context *ctx = &brw->ctx;
5344
5345    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
5346       blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
5347       blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
5348       blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
5349       blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
5350    }
5351 }
5352
5353 static const struct brw_tracked_state genX(blend_constant_color) = {
5354    .dirty = {
5355       .mesa = _NEW_COLOR,
5356       .brw = BRW_NEW_CONTEXT |
5357              BRW_NEW_BLORP,
5358    },
5359    .emit = genX(upload_blend_constant_color)
5360 };
5361 #endif
5362
5363 /* ---------------------------------------------------------------------- */
5364
5365 void
5366 genX(init_atoms)(struct brw_context *brw)
5367 {
5368 #if GEN_GEN < 6
5369    static const struct brw_tracked_state *render_atoms[] =
5370    {
5371       /* Once all the programs are done, we know how large urb entry
5372        * sizes need to be and can decide if we need to change the urb
5373        * layout.
5374        */
5375       &brw_curbe_offsets,
5376       &brw_recalculate_urb_fence,
5377
5378       &genX(cc_vp),
5379       &genX(color_calc_state),
5380
5381       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5382        * table upload must be last.
5383        */
5384       &brw_vs_pull_constants,
5385       &brw_wm_pull_constants,
5386       &brw_renderbuffer_surfaces,
5387       &brw_renderbuffer_read_surfaces,
5388       &brw_texture_surfaces,
5389       &brw_vs_binding_table,
5390       &brw_wm_binding_table,
5391
5392       &genX(fs_samplers),
5393       &genX(vs_samplers),
5394
5395       /* These set up state for brw_psp_urb_cbs */
5396       &genX(wm_state),
5397       &genX(sf_clip_viewport),
5398       &genX(sf_state),
5399       &genX(vs_state), /* always required, enabled or not */
5400       &genX(clip_state),
5401       &genX(gs_state),
5402
5403       /* Command packets:
5404        */
5405       &brw_binding_table_pointers,
5406       &genX(blend_constant_color),
5407
5408       &brw_depthbuffer,
5409
5410       &genX(polygon_stipple),
5411       &genX(polygon_stipple_offset),
5412
5413       &genX(line_stipple),
5414
5415       &brw_psp_urb_cbs,
5416
5417       &genX(drawing_rect),
5418       &brw_indices, /* must come before brw_vertices */
5419       &genX(index_buffer),
5420       &genX(vertices),
5421
5422       &brw_constant_buffer
5423    };
5424 #elif GEN_GEN == 6
5425    static const struct brw_tracked_state *render_atoms[] =
5426    {
5427       &genX(sf_clip_viewport),
5428
5429       /* Command packets: */
5430
5431       &genX(cc_vp),
5432
5433       &gen6_urb,
5434       &genX(blend_state),               /* must do before cc unit */
5435       &genX(color_calc_state),  /* must do before cc unit */
5436       &genX(depth_stencil_state),       /* must do before cc unit */
5437
5438       &genX(vs_push_constants), /* Before vs_state */
5439       &genX(gs_push_constants), /* Before gs_state */
5440       &genX(wm_push_constants), /* Before wm_state */
5441
5442       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5443        * table upload must be last.
5444        */
5445       &brw_vs_pull_constants,
5446       &brw_vs_ubo_surfaces,
5447       &brw_gs_pull_constants,
5448       &brw_gs_ubo_surfaces,
5449       &brw_wm_pull_constants,
5450       &brw_wm_ubo_surfaces,
5451       &gen6_renderbuffer_surfaces,
5452       &brw_renderbuffer_read_surfaces,
5453       &brw_texture_surfaces,
5454       &gen6_sol_surface,
5455       &brw_vs_binding_table,
5456       &gen6_gs_binding_table,
5457       &brw_wm_binding_table,
5458
5459       &genX(fs_samplers),
5460       &genX(vs_samplers),
5461       &genX(gs_samplers),
5462       &gen6_sampler_state,
5463       &genX(multisample_state),
5464
5465       &genX(vs_state),
5466       &genX(gs_state),
5467       &genX(clip_state),
5468       &genX(sf_state),
5469       &genX(wm_state),
5470
5471       &genX(scissor_state),
5472
5473       &gen6_binding_table_pointers,
5474
5475       &brw_depthbuffer,
5476
5477       &genX(polygon_stipple),
5478       &genX(polygon_stipple_offset),
5479
5480       &genX(line_stipple),
5481
5482       &genX(drawing_rect),
5483
5484       &brw_indices, /* must come before brw_vertices */
5485       &genX(index_buffer),
5486       &genX(vertices),
5487    };
5488 #elif GEN_GEN == 7
5489    static const struct brw_tracked_state *render_atoms[] =
5490    {
5491       /* Command packets: */
5492
5493       &genX(cc_vp),
5494       &genX(sf_clip_viewport),
5495
5496       &gen7_l3_state,
5497       &gen7_push_constant_space,
5498       &gen7_urb,
5499       &genX(blend_state),               /* must do before cc unit */
5500       &genX(color_calc_state),  /* must do before cc unit */
5501       &genX(depth_stencil_state),       /* must do before cc unit */
5502
5503       &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
5504       &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
5505       &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
5506       &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
5507       &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
5508
5509       &genX(vs_push_constants), /* Before vs_state */
5510       &genX(tcs_push_constants),
5511       &genX(tes_push_constants),
5512       &genX(gs_push_constants), /* Before gs_state */
5513       &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
5514
5515       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5516        * table upload must be last.
5517        */
5518       &brw_vs_pull_constants,
5519       &brw_vs_ubo_surfaces,
5520       &brw_tcs_pull_constants,
5521       &brw_tcs_ubo_surfaces,
5522       &brw_tes_pull_constants,
5523       &brw_tes_ubo_surfaces,
5524       &brw_gs_pull_constants,
5525       &brw_gs_ubo_surfaces,
5526       &brw_wm_pull_constants,
5527       &brw_wm_ubo_surfaces,
5528       &gen6_renderbuffer_surfaces,
5529       &brw_renderbuffer_read_surfaces,
5530       &brw_texture_surfaces,
5531
5532       &genX(push_constant_packets),
5533
5534       &brw_vs_binding_table,
5535       &brw_tcs_binding_table,
5536       &brw_tes_binding_table,
5537       &brw_gs_binding_table,
5538       &brw_wm_binding_table,
5539
5540       &genX(fs_samplers),
5541       &genX(vs_samplers),
5542       &genX(tcs_samplers),
5543       &genX(tes_samplers),
5544       &genX(gs_samplers),
5545       &genX(multisample_state),
5546
5547       &genX(vs_state),
5548       &genX(hs_state),
5549       &genX(te_state),
5550       &genX(ds_state),
5551       &genX(gs_state),
5552       &genX(sol_state),
5553       &genX(clip_state),
5554       &genX(sbe_state),
5555       &genX(sf_state),
5556       &genX(wm_state),
5557       &genX(ps_state),
5558
5559       &genX(scissor_state),
5560
5561       &gen7_depthbuffer,
5562
5563       &genX(polygon_stipple),
5564       &genX(polygon_stipple_offset),
5565
5566       &genX(line_stipple),
5567
5568       &genX(drawing_rect),
5569
5570       &brw_indices, /* must come before brw_vertices */
5571       &genX(index_buffer),
5572       &genX(vertices),
5573
5574 #if GEN_IS_HASWELL
5575       &genX(cut_index),
5576 #endif
5577    };
5578 #elif GEN_GEN >= 8
5579    static const struct brw_tracked_state *render_atoms[] =
5580    {
5581       &genX(cc_vp),
5582       &genX(sf_clip_viewport),
5583
5584       &gen7_l3_state,
5585       &gen7_push_constant_space,
5586       &gen7_urb,
5587       &genX(blend_state),
5588       &genX(color_calc_state),
5589
5590       &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
5591       &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
5592       &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
5593       &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
5594       &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
5595
5596       &genX(vs_push_constants), /* Before vs_state */
5597       &genX(tcs_push_constants),
5598       &genX(tes_push_constants),
5599       &genX(gs_push_constants), /* Before gs_state */
5600       &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
5601
5602       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5603        * table upload must be last.
5604        */
5605       &brw_vs_pull_constants,
5606       &brw_vs_ubo_surfaces,
5607       &brw_tcs_pull_constants,
5608       &brw_tcs_ubo_surfaces,
5609       &brw_tes_pull_constants,
5610       &brw_tes_ubo_surfaces,
5611       &brw_gs_pull_constants,
5612       &brw_gs_ubo_surfaces,
5613       &brw_wm_pull_constants,
5614       &brw_wm_ubo_surfaces,
5615       &gen6_renderbuffer_surfaces,
5616       &brw_renderbuffer_read_surfaces,
5617       &brw_texture_surfaces,
5618
5619       &genX(push_constant_packets),
5620
5621       &brw_vs_binding_table,
5622       &brw_tcs_binding_table,
5623       &brw_tes_binding_table,
5624       &brw_gs_binding_table,
5625       &brw_wm_binding_table,
5626
5627       &genX(fs_samplers),
5628       &genX(vs_samplers),
5629       &genX(tcs_samplers),
5630       &genX(tes_samplers),
5631       &genX(gs_samplers),
5632       &genX(multisample_state),
5633
5634       &genX(vs_state),
5635       &genX(hs_state),
5636       &genX(te_state),
5637       &genX(ds_state),
5638       &genX(gs_state),
5639       &genX(sol_state),
5640       &genX(clip_state),
5641       &genX(raster_state),
5642       &genX(sbe_state),
5643       &genX(sf_state),
5644       &genX(ps_blend),
5645       &genX(ps_extra),
5646       &genX(ps_state),
5647       &genX(depth_stencil_state),
5648       &genX(wm_state),
5649
5650       &genX(scissor_state),
5651
5652       &gen7_depthbuffer,
5653
5654       &genX(polygon_stipple),
5655       &genX(polygon_stipple_offset),
5656
5657       &genX(line_stipple),
5658
5659       &genX(drawing_rect),
5660
5661       &genX(vf_topology),
5662
5663       &brw_indices,
5664       &genX(index_buffer),
5665       &genX(vertices),
5666
5667       &genX(cut_index),
5668       &gen8_pma_fix,
5669    };
5670 #endif
5671
5672    STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
5673    brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
5674                            render_atoms, ARRAY_SIZE(render_atoms));
5675
5676 #if GEN_GEN >= 7
5677    static const struct brw_tracked_state *compute_atoms[] =
5678    {
5679       &gen7_l3_state,
5680       &brw_cs_image_surfaces,
5681       &genX(cs_push_constants),
5682       &genX(cs_pull_constants),
5683       &brw_cs_ubo_surfaces,
5684       &brw_cs_texture_surfaces,
5685       &brw_cs_work_groups_surface,
5686       &genX(cs_samplers),
5687       &genX(cs_state),
5688    };
5689
5690    STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
5691    brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
5692                            compute_atoms, ARRAY_SIZE(compute_atoms));
5693
5694    brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
5695 #endif
5696 }