src/mesa/drivers/dri/i965/genX_state_upload.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25
  26 #include "common/gen_device_info.h"
  27 #include "common/gen_sample_positions.h"
  28 #include "genxml/gen_macros.h"
  29
  30 #include "main/bufferobj.h"
  31 #include "main/context.h"
  32 #include "main/enums.h"
  33 #include "main/macros.h"
  34 #include "main/state.h"
  35
  36 #include "brw_context.h"
  37 #if GEN_GEN == 6
  38 #include "brw_defines.h"
  39 #endif
  40 #include "brw_draw.h"
  41 #include "brw_multisample_state.h"
  42 #include "brw_state.h"
  43 #include "brw_wm.h"
  44 #include "brw_util.h"
  45
  46 #include "intel_batchbuffer.h"
  47 #include "intel_buffer_objects.h"
  48 #include "intel_fbo.h"
  49
  50 #include "main/enums.h"
  51 #include "main/fbobject.h"
  52 #include "main/framebuffer.h"
  53 #include "main/glformats.h"
  54 #include "main/shaderapi.h"
  55 #include "main/stencil.h"
  56 #include "main/transformfeedback.h"
  57 #include "main/varray.h"
  58 #include "main/viewport.h"
  59
  60 UNUSED static void *
  61 emit_dwords(struct brw_context *brw, unsigned n)
  62 {
  63    intel_batchbuffer_begin(brw, n, RENDER_RING);
  64    uint32_t *map = brw->batch.map_next;
  65    brw->batch.map_next += n;
  66    intel_batchbuffer_advance(brw);
  67    return map;
  68 }
  69
  70 struct brw_address {
  71    struct brw_bo *bo;
  72    uint32_t read_domains;
  73    uint32_t write_domain;
  74    uint32_t offset;
  75 };
  76
  77 static uint64_t
  78 emit_reloc(struct brw_context *brw,
  79            void *location, struct brw_address address, uint32_t delta)
  80 {
  81    uint32_t offset = (char *) location - (char *) brw->batch.map;
  82
  83    return brw_emit_reloc(&brw->batch, offset, address.bo,
  84                          address.offset + delta,
  85                          address.read_domains,
  86                          address.write_domain);
  87 }
  88
  89 #define __gen_address_type struct brw_address
  90 #define __gen_user_data struct brw_context
  91
  92 static uint64_t
  93 __gen_combine_address(struct brw_context *brw, void *location,
  94                       struct brw_address address, uint32_t delta)
  95 {
  96    if (address.bo == NULL) {
  97       return address.offset + delta;
  98    } else {
  99       return emit_reloc(brw, location, address, delta);
 100    }
 101 }
 102
 103 static inline struct brw_address
 104 render_bo(struct brw_bo *bo, uint32_t offset)
 105 {
 106    return (struct brw_address) {
 107             .bo = bo,
 108             .offset = offset,
 109             .read_domains = I915_GEM_DOMAIN_RENDER,
 110             .write_domain = I915_GEM_DOMAIN_RENDER,
 111    };
 112 }
 113
 114 static inline struct brw_address
 115 render_ro_bo(struct brw_bo *bo, uint32_t offset)
 116 {
 117    return (struct brw_address) {
 118             .bo = bo,
 119             .offset = offset,
 120             .read_domains = I915_GEM_DOMAIN_RENDER,
 121             .write_domain = 0,
 122    };
 123 }
 124
 125 static inline struct brw_address
 126 instruction_bo(struct brw_bo *bo, uint32_t offset)
 127 {
 128    return (struct brw_address) {
 129             .bo = bo,
 130             .offset = offset,
 131             .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
 132             .write_domain = I915_GEM_DOMAIN_INSTRUCTION,
 133    };
 134 }
 135
 136 static inline struct brw_address
 137 instruction_ro_bo(struct brw_bo *bo, uint32_t offset)
 138 {
 139    return (struct brw_address) {
 140             .bo = bo,
 141             .offset = offset,
 142             .read_domains = I915_GEM_DOMAIN_INSTRUCTION,
 143             .write_domain = 0,
 144    };
 145 }
 146
 147 static inline struct brw_address
 148 vertex_bo(struct brw_bo *bo, uint32_t offset)
 149 {
 150    return (struct brw_address) {
 151             .bo = bo,
 152             .offset = offset,
 153             .read_domains = I915_GEM_DOMAIN_VERTEX,
 154             .write_domain = 0,
 155    };
 156 }
 157
 158 #if GEN_GEN == 4
 159 static inline struct brw_address
 160 KSP(struct brw_context *brw, uint32_t offset)
 161 {
 162    return instruction_bo(brw->cache.bo, offset);
 163 }
 164
 165 static inline struct brw_address
 166 KSP_ro(struct brw_context *brw, uint32_t offset)
 167 {
 168    return instruction_ro_bo(brw->cache.bo, offset);
 169 }
 170 #else
 171 static inline uint32_t
 172 KSP(struct brw_context *brw, uint32_t offset)
 173 {
 174    return offset;
 175 }
 176
 177 #define KSP_ro KSP
 178
 179 #endif
 180
 181 #include "genxml/genX_pack.h"
 182
 183 #define _brw_cmd_length(cmd) cmd ## _length
 184 #define _brw_cmd_length_bias(cmd) cmd ## _length_bias
 185 #define _brw_cmd_header(cmd) cmd ## _header
 186 #define _brw_cmd_pack(cmd) cmd ## _pack
 187
 188 #define brw_batch_emit(brw, cmd, name)                  \
 189    for (struct cmd name = { _brw_cmd_header(cmd) },     \
 190         *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
 191         __builtin_expect(_dst != NULL, 1);              \
 192         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
 193         _dst = NULL)
 194
 195 #define brw_batch_emitn(brw, cmd, n, ...) ({           \
 196       uint32_t *_dw = emit_dwords(brw, n);             \
 197       struct cmd template = {                          \
 198          _brw_cmd_header(cmd),                         \
 199          .DWordLength = n - _brw_cmd_length_bias(cmd), \
 200          __VA_ARGS__                                   \
 201       };                                               \
 202       _brw_cmd_pack(cmd)(brw, _dw, &template);         \
 203       _dw + 1; /* Array starts at dw[1] */             \
 204    })
 205
 206 #define brw_state_emit(brw, cmd, align, offset, name)              \
 207    for (struct cmd name = { 0, },                                  \
 208         *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
 209                                 align, offset);                    \
 210         __builtin_expect(_dst != NULL, 1);                         \
 211         _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
 212         _dst = NULL)
 213
 214 /**
 215  * Polygon stipple packet
 216  */
 217 static void
 218 genX(upload_polygon_stipple)(struct brw_context *brw)
 219 {
 220    struct gl_context *ctx = &brw->ctx;
 221
 222    /* _NEW_POLYGON */
 223    if (!ctx->Polygon.StippleFlag)
 224       return;
 225
 226    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
 227       /* Polygon stipple is provided in OpenGL order, i.e. bottom
 228        * row first.  If we're rendering to a window (i.e. the
 229        * default frame buffer object, 0), then we need to invert
 230        * it to match our pixel layout.  But if we're rendering
 231        * to a FBO (i.e. any named frame buffer object), we *don't*
 232        * need to invert - we already match the layout.
 233        */
 234       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 235          for (unsigned i = 0; i < 32; i++)
 236             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
 237       } else {
 238          for (unsigned i = 0; i < 32; i++)
 239             poly.PatternRow[i] = ctx->PolygonStipple[i];
 240       }
 241    }
 242 }
 243
 244 static const struct brw_tracked_state genX(polygon_stipple) = {
 245    .dirty = {
 246       .mesa = _NEW_POLYGON |
 247               _NEW_POLYGONSTIPPLE,
 248       .brw = BRW_NEW_CONTEXT,
 249    },
 250    .emit = genX(upload_polygon_stipple),
 251 };
 252
 253 /**
 254  * Polygon stipple offset packet
 255  */
 256 static void
 257 genX(upload_polygon_stipple_offset)(struct brw_context *brw)
 258 {
 259    struct gl_context *ctx = &brw->ctx;
 260
 261    /* _NEW_POLYGON */
 262    if (!ctx->Polygon.StippleFlag)
 263       return;
 264
 265    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
 266       /* _NEW_BUFFERS
 267        *
 268        * If we're drawing to a system window we have to invert the Y axis
 269        * in order to match the OpenGL pixel coordinate system, and our
 270        * offset must be matched to the window position.  If we're drawing
 271        * to a user-created FBO then our native pixel coordinate system
 272        * works just fine, and there's no window system to worry about.
 273        */
 274       if (_mesa_is_winsys_fbo(ctx->DrawBuffer)) {
 275          poly.PolygonStippleYOffset =
 276             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
 277       }
 278    }
 279 }
 280
 281 static const struct brw_tracked_state genX(polygon_stipple_offset) = {
 282    .dirty = {
 283       .mesa = _NEW_BUFFERS |
 284               _NEW_POLYGON,
 285       .brw = BRW_NEW_CONTEXT,
 286    },
 287    .emit = genX(upload_polygon_stipple_offset),
 288 };
 289
 290 /**
 291  * Line stipple packet
 292  */
 293 static void
 294 genX(upload_line_stipple)(struct brw_context *brw)
 295 {
 296    struct gl_context *ctx = &brw->ctx;
 297
 298    if (!ctx->Line.StippleFlag)
 299       return;
 300
 301    brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
 302       line.LineStipplePattern = ctx->Line.StipplePattern;
 303
 304       line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
 305       line.LineStippleRepeatCount = ctx->Line.StippleFactor;
 306    }
 307 }
 308
 309 static const struct brw_tracked_state genX(line_stipple) = {
 310    .dirty = {
 311       .mesa = _NEW_LINE,
 312       .brw = BRW_NEW_CONTEXT,
 313    },
 314    .emit = genX(upload_line_stipple),
 315 };
 316
 317 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 318 static void
 319 genX(upload_drawing_rect)(struct brw_context *brw)
 320 {
 321    struct gl_context *ctx = &brw->ctx;
 322    const struct gl_framebuffer *fb = ctx->DrawBuffer;
 323    const unsigned int fb_width = _mesa_geometric_width(fb);
 324    const unsigned int fb_height = _mesa_geometric_height(fb);
 325
 326    brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
 327       rect.ClippedDrawingRectangleXMax = fb_width - 1;
 328       rect.ClippedDrawingRectangleYMax = fb_height - 1;
 329    }
 330 }
 331
 332 static const struct brw_tracked_state genX(drawing_rect) = {
 333    .dirty = {
 334       .mesa = _NEW_BUFFERS,
 335       .brw = BRW_NEW_BLORP |
 336              BRW_NEW_CONTEXT,
 337    },
 338    .emit = genX(upload_drawing_rect),
 339 };
 340
 341 static uint32_t *
 342 genX(emit_vertex_buffer_state)(struct brw_context *brw,
 343                                uint32_t *dw,
 344                                unsigned buffer_nr,
 345                                struct brw_bo *bo,
 346                                unsigned start_offset,
 347                                unsigned end_offset,
 348                                unsigned stride,
 349                                unsigned step_rate)
 350 {
 351    struct GENX(VERTEX_BUFFER_STATE) buf_state = {
 352       .VertexBufferIndex = buffer_nr,
 353       .BufferPitch = stride,
 354       .BufferStartingAddress = vertex_bo(bo, start_offset),
 355 #if GEN_GEN >= 8
 356       .BufferSize = end_offset - start_offset,
 357 #endif
 358
 359 #if GEN_GEN >= 7
 360       .AddressModifyEnable = true,
 361 #endif
 362
 363 #if GEN_GEN < 8
 364       .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
 365       .InstanceDataStepRate = step_rate,
 366 #if GEN_GEN >= 5
 367       .EndAddress = vertex_bo(bo, end_offset - 1),
 368 #endif
 369 #endif
 370
 371 #if GEN_GEN == 10
 372       .VertexBufferMOCS = CNL_MOCS_WB,
 373 #elif GEN_GEN == 9
 374       .VertexBufferMOCS = SKL_MOCS_WB,
 375 #elif GEN_GEN == 8
 376       .VertexBufferMOCS = BDW_MOCS_WB,
 377 #elif GEN_GEN == 7
 378       .VertexBufferMOCS = GEN7_MOCS_L3,
 379 #endif
 380    };
 381
 382    GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
 383    return dw + GENX(VERTEX_BUFFER_STATE_length);
 384 }
 385
 386 UNUSED static bool
 387 is_passthru_format(uint32_t format)
 388 {
 389    switch (format) {
 390    case ISL_FORMAT_R64_PASSTHRU:
 391    case ISL_FORMAT_R64G64_PASSTHRU:
 392    case ISL_FORMAT_R64G64B64_PASSTHRU:
 393    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 394       return true;
 395    default:
 396       return false;
 397    }
 398 }
 399
 400 UNUSED static int
 401 uploads_needed(uint32_t format)
 402 {
 403    if (!is_passthru_format(format))
 404       return 1;
 405
 406    switch (format) {
 407    case ISL_FORMAT_R64_PASSTHRU:
 408    case ISL_FORMAT_R64G64_PASSTHRU:
 409       return 1;
 410    case ISL_FORMAT_R64G64B64_PASSTHRU:
 411    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 412       return 2;
 413    default:
 414       unreachable("not reached");
 415    }
 416 }
 417
 418 /*
 419  * Returns the format that we are finally going to use when upload a vertex
 420  * element. It will only change if we are using *64*PASSTHRU formats, as for
 421  * gen < 8 they need to be splitted on two *32*FLOAT formats.
 422  *
 423  * @upload points in which upload we are. Valid values are [0,1]
 424  */
 425 static uint32_t
 426 downsize_format_if_needed(uint32_t format,
 427                           int upload)
 428 {
 429    assert(upload == 0 || upload == 1);
 430
 431    if (!is_passthru_format(format))
 432       return format;
 433
 434    switch (format) {
 435    case ISL_FORMAT_R64_PASSTHRU:
 436       return ISL_FORMAT_R32G32_FLOAT;
 437    case ISL_FORMAT_R64G64_PASSTHRU:
 438       return ISL_FORMAT_R32G32B32A32_FLOAT;
 439    case ISL_FORMAT_R64G64B64_PASSTHRU:
 440       return !upload ? ISL_FORMAT_R32G32B32A32_FLOAT
 441                      : ISL_FORMAT_R32G32_FLOAT;
 442    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 443       return ISL_FORMAT_R32G32B32A32_FLOAT;
 444    default:
 445       unreachable("not reached");
 446    }
 447 }
 448
 449 /*
 450  * Returns the number of componentes associated with a format that is used on
 451  * a 64 to 32 format split. See downsize_format()
 452  */
 453 static int
 454 upload_format_size(uint32_t upload_format)
 455 {
 456    switch (upload_format) {
 457    case ISL_FORMAT_R32G32_FLOAT:
 458       return 2;
 459    case ISL_FORMAT_R32G32B32A32_FLOAT:
 460       return 4;
 461    default:
 462       unreachable("not reached");
 463    }
 464 }
 465
 466 static void
 467 genX(emit_vertices)(struct brw_context *brw)
 468 {
 469    uint32_t *dw;
 470
 471    brw_prepare_vertices(brw);
 472    brw_prepare_shader_draw_parameters(brw);
 473
 474 #if GEN_GEN < 6
 475    brw_emit_query_begin(brw);
 476 #endif
 477
 478    const struct brw_vs_prog_data *vs_prog_data =
 479       brw_vs_prog_data(brw->vs.base.prog_data);
 480
 481 #if GEN_GEN >= 8
 482    struct gl_context *ctx = &brw->ctx;
 483    const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
 484                                 ctx->Polygon.BackMode != GL_FILL);
 485
 486    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
 487       unsigned vue = brw->vb.nr_enabled;
 488
 489       /* The element for the edge flags must always be last, so we have to
 490        * insert the SGVS before it in that case.
 491        */
 492       if (uses_edge_flag) {
 493          assert(vue > 0);
 494          vue--;
 495       }
 496
 497       WARN_ONCE(vue >= 33,
 498                 "Trying to insert VID/IID past 33rd vertex element, "
 499                 "need to reorder the vertex attrbutes.");
 500
 501       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
 502          if (vs_prog_data->uses_vertexid) {
 503             vfs.VertexIDEnable = true;
 504             vfs.VertexIDComponentNumber = 2;
 505             vfs.VertexIDElementOffset = vue;
 506          }
 507
 508          if (vs_prog_data->uses_instanceid) {
 509             vfs.InstanceIDEnable = true;
 510             vfs.InstanceIDComponentNumber = 3;
 511             vfs.InstanceIDElementOffset = vue;
 512          }
 513       }
 514
 515       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 516          vfi.InstancingEnable = true;
 517          vfi.VertexElementIndex = vue;
 518       }
 519    } else {
 520       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
 521    }
 522
 523    /* Normally we don't need an element for the SGVS attribute because the
 524     * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an
 525     * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. However if
 526     * we're using draw parameters then we need an element for the those
 527     * values.  Additionally if there is an edge flag element then the SGVS
 528     * can't be inserted past that so we need a dummy element to ensure that
 529     * the edge flag is the last one.
 530     */
 531    const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
 532                                     vs_prog_data->uses_baseinstance ||
 533                                     ((vs_prog_data->uses_instanceid ||
 534                                       vs_prog_data->uses_vertexid)
 535                                      && uses_edge_flag));
 536 #else
 537    const bool needs_sgvs_element = (vs_prog_data->uses_basevertex ||
 538                                     vs_prog_data->uses_baseinstance ||
 539                                     vs_prog_data->uses_instanceid ||
 540                                     vs_prog_data->uses_vertexid);
 541 #endif
 542    unsigned nr_elements =
 543       brw->vb.nr_enabled + needs_sgvs_element + vs_prog_data->uses_drawid;
 544
 545 #if GEN_GEN < 8
 546    /* If any of the formats of vb.enabled needs more that one upload, we need
 547     * to add it to nr_elements
 548     */
 549    for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
 550       struct brw_vertex_element *input = brw->vb.enabled[i];
 551       uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
 552
 553       if (uploads_needed(format) > 1)
 554          nr_elements++;
 555    }
 556 #endif
 557
 558    /* If the VS doesn't read any inputs (calculating vertex position from
 559     * a state variable for some reason, for example), emit a single pad
 560     * VERTEX_ELEMENT struct and bail.
 561     *
 562     * The stale VB state stays in place, but they don't do anything unless
 563     * a VE loads from them.
 564     */
 565    if (nr_elements == 0) {
 566       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 567                            1 + GENX(VERTEX_ELEMENT_STATE_length));
 568       struct GENX(VERTEX_ELEMENT_STATE) elem = {
 569          .Valid = true,
 570          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
 571          .Component0Control = VFCOMP_STORE_0,
 572          .Component1Control = VFCOMP_STORE_0,
 573          .Component2Control = VFCOMP_STORE_0,
 574          .Component3Control = VFCOMP_STORE_1_FP,
 575       };
 576       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
 577       return;
 578    }
 579
 580    /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
 581    const bool uses_draw_params =
 582       vs_prog_data->uses_basevertex ||
 583       vs_prog_data->uses_baseinstance;
 584    const unsigned nr_buffers = brw->vb.nr_buffers +
 585       uses_draw_params + vs_prog_data->uses_drawid;
 586
 587    if (nr_buffers) {
 588       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
 589
 590       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
 591                            1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);
 592
 593       for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
 594          const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
 595          /* Prior to Haswell and Bay Trail we have to use 4-component formats
 596           * to fake 3-component ones.  In particular, we do this for
 597           * half-float and 8 and 16-bit integer formats.  This means that the
 598           * vertex element may poke over the end of the buffer by 2 bytes.
 599           */
 600          const unsigned padding =
 601             (GEN_GEN <= 7 && !brw->is_baytrail && !brw->is_haswell) * 2;
 602          const unsigned end = buffer->offset + buffer->size + padding;
 603          dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
 604                                              buffer->offset,
 605                                              end,
 606                                              buffer->stride,
 607                                              buffer->step_rate);
 608       }
 609
 610       if (uses_draw_params) {
 611          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
 612                                              brw->draw.draw_params_bo,
 613                                              brw->draw.draw_params_offset,
 614                                              brw->draw.draw_params_bo->size,
 615                                              0 /* stride */,
 616                                              0 /* step rate */);
 617       }
 618
 619       if (vs_prog_data->uses_drawid) {
 620          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
 621                                              brw->draw.draw_id_bo,
 622                                              brw->draw.draw_id_offset,
 623                                              brw->draw.draw_id_bo->size,
 624                                              0 /* stride */,
 625                                              0 /* step rate */);
 626       }
 627    }
 628
 629    /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
 630     * presumably for VertexID/InstanceID.
 631     */
 632 #if GEN_GEN >= 6
 633    assert(nr_elements <= 34);
 634    const struct brw_vertex_element *gen6_edgeflag_input = NULL;
 635 #else
 636    assert(nr_elements <= 18);
 637 #endif
 638
 639    dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 640                         1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
 641    unsigned i;
 642    for (i = 0; i < brw->vb.nr_enabled; i++) {
 643       const struct brw_vertex_element *input = brw->vb.enabled[i];
 644       uint32_t format = brw_get_vertex_surface_type(brw, input->glarray);
 645       uint32_t comp0 = VFCOMP_STORE_SRC;
 646       uint32_t comp1 = VFCOMP_STORE_SRC;
 647       uint32_t comp2 = VFCOMP_STORE_SRC;
 648       uint32_t comp3 = VFCOMP_STORE_SRC;
 649       const unsigned num_uploads = GEN_GEN < 8 ? uploads_needed(format) : 1;
 650
 651 #if GEN_GEN >= 8
 652       /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
 653        * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
 654        * element which has edge flag enabled."
 655        */
 656       assert(!(is_passthru_format(format) && uses_edge_flag));
 657 #endif
 658
 659       /* The gen4 driver expects edgeflag to come in as a float, and passes
 660        * that float on to the tests in the clipper.  Mesa's current vertex
 661        * attribute value for EdgeFlag is stored as a float, which works out.
 662        * glEdgeFlagPointer, on the other hand, gives us an unnormalized
 663        * integer ubyte.  Just rewrite that to convert to a float.
 664        *
 665        * Gen6+ passes edgeflag as sideband along with the vertex, instead
 666        * of in the VUE.  We have to upload it sideband as the last vertex
 667        * element according to the B-Spec.
 668        */
 669 #if GEN_GEN >= 6
 670       if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
 671          gen6_edgeflag_input = input;
 672          continue;
 673       }
 674 #endif
 675
 676       for (unsigned c = 0; c < num_uploads; c++) {
 677          const uint32_t upload_format = GEN_GEN >= 8 ? format :
 678             downsize_format_if_needed(format, c);
 679          /* If we need more that one upload, the offset stride would be 128
 680           * bits (16 bytes), as for previous uploads we are using the full
 681           * entry. */
 682          const unsigned offset = input->offset + c * 16;
 683
 684          const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
 685             upload_format_size(upload_format) : input->glarray->Size;
 686
 687          switch (size) {
 688             case 0: comp0 = VFCOMP_STORE_0;
 689             case 1: comp1 = VFCOMP_STORE_0;
 690             case 2: comp2 = VFCOMP_STORE_0;
 691             case 3:
 692                if (GEN_GEN >= 8 && input->glarray->Doubles) {
 693                   comp3 = VFCOMP_STORE_0;
 694                } else if (input->glarray->Integer) {
 695                   comp3 = VFCOMP_STORE_1_INT;
 696                } else {
 697                   comp3 = VFCOMP_STORE_1_FP;
 698                }
 699
 700                break;
 701          }
 702
 703 #if GEN_GEN >= 8
 704          /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
 705           *
 706           *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
 707           *     formats, 64-bit components are stored in the URB without any
 708           *     conversion. In this case, vertex elements must be written as 128
 709           *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
 710           *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
 711           *     component into the URB, Component 1 must be specified as
 712           *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
 713           *     order to output a 128-bit vertex element, or Components 1-3 must
 714           *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
 715           *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
 716           *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
 717           *     vertex element."
 718           */
 719          if (input->glarray->Doubles && !input->is_dual_slot) {
 720             /* Store vertex elements which correspond to double and dvec2 vertex
 721              * shader inputs as 128-bit vertex elements, instead of 256-bits.
 722              */
 723             comp2 = VFCOMP_NOSTORE;
 724             comp3 = VFCOMP_NOSTORE;
 725          }
 726 #endif
 727
 728          struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 729             .VertexBufferIndex = input->buffer,
 730             .Valid = true,
 731             .SourceElementFormat = upload_format,
 732             .SourceElementOffset = offset,
 733             .Component0Control = comp0,
 734             .Component1Control = comp1,
 735             .Component2Control = comp2,
 736             .Component3Control = comp3,
 737 #if GEN_GEN < 5
 738             .DestinationElementOffset = i * 4,
 739 #endif
 740          };
 741
 742          GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 743          dw += GENX(VERTEX_ELEMENT_STATE_length);
 744       }
 745    }
 746
 747    if (needs_sgvs_element) {
 748       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 749          .Valid = true,
 750          .Component0Control = VFCOMP_STORE_0,
 751          .Component1Control = VFCOMP_STORE_0,
 752          .Component2Control = VFCOMP_STORE_0,
 753          .Component3Control = VFCOMP_STORE_0,
 754 #if GEN_GEN < 5
 755          .DestinationElementOffset = i * 4,
 756 #endif
 757       };
 758
 759 #if GEN_GEN >= 8
 760       if (vs_prog_data->uses_basevertex ||
 761           vs_prog_data->uses_baseinstance) {
 762          elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 763          elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
 764          elem_state.Component0Control = VFCOMP_STORE_SRC;
 765          elem_state.Component1Control = VFCOMP_STORE_SRC;
 766       }
 767 #else
 768       elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 769       elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
 770       if (vs_prog_data->uses_basevertex)
 771          elem_state.Component0Control = VFCOMP_STORE_SRC;
 772
 773       if (vs_prog_data->uses_baseinstance)
 774          elem_state.Component1Control = VFCOMP_STORE_SRC;
 775
 776       if (vs_prog_data->uses_vertexid)
 777          elem_state.Component2Control = VFCOMP_STORE_VID;
 778
 779       if (vs_prog_data->uses_instanceid)
 780          elem_state.Component3Control = VFCOMP_STORE_IID;
 781 #endif
 782
 783       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 784       dw += GENX(VERTEX_ELEMENT_STATE_length);
 785    }
 786
 787    if (vs_prog_data->uses_drawid) {
 788       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 789          .Valid = true,
 790          .VertexBufferIndex = brw->vb.nr_buffers + 1,
 791          .SourceElementFormat = ISL_FORMAT_R32_UINT,
 792          .Component0Control = VFCOMP_STORE_SRC,
 793          .Component1Control = VFCOMP_STORE_0,
 794          .Component2Control = VFCOMP_STORE_0,
 795          .Component3Control = VFCOMP_STORE_0,
 796 #if GEN_GEN < 5
 797          .DestinationElementOffset = i * 4,
 798 #endif
 799       };
 800
 801       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 802       dw += GENX(VERTEX_ELEMENT_STATE_length);
 803    }
 804
 805 #if GEN_GEN >= 6
 806    if (gen6_edgeflag_input) {
 807       const uint32_t format =
 808          brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray);
 809
 810       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 811          .Valid = true,
 812          .VertexBufferIndex = gen6_edgeflag_input->buffer,
 813          .EdgeFlagEnable = true,
 814          .SourceElementFormat = format,
 815          .SourceElementOffset = gen6_edgeflag_input->offset,
 816          .Component0Control = VFCOMP_STORE_SRC,
 817          .Component1Control = VFCOMP_STORE_0,
 818          .Component2Control = VFCOMP_STORE_0,
 819          .Component3Control = VFCOMP_STORE_0,
 820       };
 821
 822       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 823       dw += GENX(VERTEX_ELEMENT_STATE_length);
 824    }
 825 #endif
 826
 827 #if GEN_GEN >= 8
 828    for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
 829       const struct brw_vertex_element *input = brw->vb.enabled[i];
 830       const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
 831       unsigned element_index;
 832
 833       /* The edge flag element is reordered to be the last one in the code
 834        * above so we need to compensate for that in the element indices used
 835        * below.
 836        */
 837       if (input == gen6_edgeflag_input)
 838          element_index = nr_elements - 1;
 839       else
 840          element_index = j++;
 841
 842       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 843          vfi.VertexElementIndex = element_index;
 844          vfi.InstancingEnable = buffer->step_rate != 0;
 845          vfi.InstanceDataStepRate = buffer->step_rate;
 846       }
 847    }
 848
 849    if (vs_prog_data->uses_drawid) {
 850       const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
 851
 852       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 853          vfi.VertexElementIndex = element;
 854       }
 855    }
 856 #endif
 857 }
 858
 859 static const struct brw_tracked_state genX(vertices) = {
 860    .dirty = {
 861       .mesa = _NEW_POLYGON,
 862       .brw = BRW_NEW_BATCH |
 863              BRW_NEW_BLORP |
 864              BRW_NEW_VERTICES |
 865              BRW_NEW_VS_PROG_DATA,
 866    },
 867    .emit = genX(emit_vertices),
 868 };
 869
 870 static void
 871 genX(emit_index_buffer)(struct brw_context *brw)
 872 {
 873    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
 874
 875    if (index_buffer == NULL)
 876       return;
 877
 878    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 879 #if GEN_GEN < 8 && !GEN_IS_HASWELL
 880       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 881 #endif
 882       ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
 883       ib.BufferStartingAddress = vertex_bo(brw->ib.bo, 0);
 884 #if GEN_GEN >= 8
 885       ib.IndexBufferMOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
 886       ib.BufferSize = brw->ib.size;
 887 #else
 888       ib.BufferEndingAddress = vertex_bo(brw->ib.bo, brw->ib.size - 1);
 889 #endif
 890    }
 891 }
 892
 893 static const struct brw_tracked_state genX(index_buffer) = {
 894    .dirty = {
 895       .mesa = 0,
 896       .brw = BRW_NEW_BATCH |
 897              BRW_NEW_BLORP |
 898              BRW_NEW_INDEX_BUFFER,
 899    },
 900    .emit = genX(emit_index_buffer),
 901 };
 902
 903 #if GEN_IS_HASWELL || GEN_GEN >= 8
 904 static void
 905 genX(upload_cut_index)(struct brw_context *brw)
 906 {
 907    const struct gl_context *ctx = &brw->ctx;
 908
 909    brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
 910       if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
 911          vf.IndexedDrawCutIndexEnable = true;
 912          vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
 913       }
 914    }
 915 }
 916
 917 const struct brw_tracked_state genX(cut_index) = {
 918    .dirty = {
 919       .mesa  = _NEW_TRANSFORM,
 920       .brw   = BRW_NEW_INDEX_BUFFER,
 921    },
 922    .emit = genX(upload_cut_index),
 923 };
 924 #endif
 925
 926 #if GEN_GEN >= 6
 927 /**
 928  * Determine the appropriate attribute override value to store into the
 929  * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
 930  * override value contains two pieces of information: the location of the
 931  * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
 932  * flag indicating whether to "swizzle" the attribute based on the direction
 933  * the triangle is facing.
 934  *
 935  * If an attribute is "swizzled", then the given VUE location is used for
 936  * front-facing triangles, and the VUE location that immediately follows is
 937  * used for back-facing triangles.  We use this to implement the mapping from
 938  * gl_FrontColor/gl_BackColor to gl_Color.
 939  *
 940  * urb_entry_read_offset is the offset into the VUE at which the SF unit is
 941  * being instructed to begin reading attribute data.  It can be set to a
 942  * nonzero value to prevent the SF unit from wasting time reading elements of
 943  * the VUE that are not needed by the fragment shader.  It is measured in
 944  * 256-bit increments.
 945  */
 946 static void
 947 genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
 948                         const struct brw_vue_map *vue_map,
 949                         int urb_entry_read_offset, int fs_attr,
 950                         bool two_side_color, uint32_t *max_source_attr)
 951 {
 952    /* Find the VUE slot for this attribute. */
 953    int slot = vue_map->varying_to_slot[fs_attr];
 954
 955    /* Viewport and Layer are stored in the VUE header.  We need to override
 956     * them to zero if earlier stages didn't write them, as GL requires that
 957     * they read back as zero when not explicitly set.
 958     */
 959    if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
 960       attr->ComponentOverrideX = true;
 961       attr->ComponentOverrideW = true;
 962       attr->ConstantSource = CONST_0000;
 963
 964       if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
 965          attr->ComponentOverrideY = true;
 966       if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
 967          attr->ComponentOverrideZ = true;
 968
 969       return;
 970    }
 971
 972    /* If there was only a back color written but not front, use back
 973     * as the color instead of undefined
 974     */
 975    if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
 976       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
 977    if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
 978       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
 979
 980    if (slot == -1) {
 981       /* This attribute does not exist in the VUE--that means that the vertex
 982        * shader did not write to it.  This means that either:
 983        *
 984        * (a) This attribute is a texture coordinate, and it is going to be
 985        * replaced with point coordinates (as a consequence of a call to
 986        * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
 987        * hardware will ignore whatever attribute override we supply.
 988        *
 989        * (b) This attribute is read by the fragment shader but not written by
 990        * the vertex shader, so its value is undefined.  Therefore the
 991        * attribute override we supply doesn't matter.
 992        *
 993        * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
 994        * previous shader stage.
 995        *
 996        * Note that we don't have to worry about the cases where the attribute
 997        * is gl_PointCoord or is undergoing point sprite coordinate
 998        * replacement, because in those cases, this function isn't called.
 999        *
1000        * In case (c), we need to program the attribute overrides so that the
1001        * primitive ID will be stored in this slot.  In every other case, the
1002        * attribute override we supply doesn't matter.  So just go ahead and
1003        * program primitive ID in every case.
1004        */
1005       attr->ComponentOverrideW = true;
1006       attr->ComponentOverrideX = true;
1007       attr->ComponentOverrideY = true;
1008       attr->ComponentOverrideZ = true;
1009       attr->ConstantSource = PRIM_ID;
1010       return;
1011    }
1012
1013    /* Compute the location of the attribute relative to urb_entry_read_offset.
1014     * Each increment of urb_entry_read_offset represents a 256-bit value, so
1015     * it counts for two 128-bit VUE slots.
1016     */
1017    int source_attr = slot - 2 * urb_entry_read_offset;
1018    assert(source_attr >= 0 && source_attr < 32);
1019
1020    /* If we are doing two-sided color, and the VUE slot following this one
1021     * represents a back-facing color, then we need to instruct the SF unit to
1022     * do back-facing swizzling.
1023     */
1024    bool swizzling = two_side_color &&
1025       ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
1026         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
1027        (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
1028         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
1029
1030    /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
1031    if (*max_source_attr < source_attr + swizzling)
1032       *max_source_attr = source_attr + swizzling;
1033
1034    attr->SourceAttribute = source_attr;
1035    if (swizzling)
1036       attr->SwizzleSelect = INPUTATTR_FACING;
1037 }
1038
1039
1040 static void
1041 genX(calculate_attr_overrides)(const struct brw_context *brw,
1042                                struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1043                                uint32_t *point_sprite_enables,
1044                                uint32_t *urb_entry_read_length,
1045                                uint32_t *urb_entry_read_offset)
1046 {
1047    const struct gl_context *ctx = &brw->ctx;
1048
1049    /* _NEW_POINT */
1050    const struct gl_point_attrib *point = &ctx->Point;
1051
1052    /* BRW_NEW_FS_PROG_DATA */
1053    const struct brw_wm_prog_data *wm_prog_data =
1054       brw_wm_prog_data(brw->wm.base.prog_data);
1055    uint32_t max_source_attr = 0;
1056
1057    *point_sprite_enables = 0;
1058
1059    /* BRW_NEW_FRAGMENT_PROGRAM
1060     *
1061     * If the fragment shader reads VARYING_SLOT_LAYER, then we need to pass in
1062     * the full vertex header.  Otherwise, we can program the SF to start
1063     * reading at an offset of 1 (2 varying slots) to skip unnecessary data:
1064     * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
1065     * - VARYING_SLOT_{PSIZ,LAYER} and VARYING_SLOT_POS on gen6+
1066     */
1067
1068    bool fs_needs_vue_header = brw->fragment_program->info.inputs_read &
1069       (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
1070
1071    *urb_entry_read_offset = fs_needs_vue_header ? 0 : 1;
1072
1073    /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1074     * description of dw10 Point Sprite Texture Coordinate Enable:
1075     *
1076     * "This field must be programmed to zero when non-point primitives
1077     * are rendered."
1078     *
1079     * The SandyBridge PRM doesn't explicitly say that point sprite enables
1080     * must be programmed to zero when rendering non-point primitives, but
1081     * the IvyBridge PRM does, and if we don't, we get garbage.
1082     *
1083     * This is not required on Haswell, as the hardware ignores this state
1084     * when drawing non-points -- although we do still need to be careful to
1085     * correctly set the attr overrides.
1086     *
1087     * _NEW_POLYGON
1088     * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1089     */
1090    bool drawing_points = brw_is_drawing_points(brw);
1091
1092    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1093       int input_index = wm_prog_data->urb_setup[attr];
1094
1095       if (input_index < 0)
1096          continue;
1097
1098       /* _NEW_POINT */
1099       bool point_sprite = false;
1100       if (drawing_points) {
1101          if (point->PointSprite &&
1102              (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1103              (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1104             point_sprite = true;
1105          }
1106
1107          if (attr == VARYING_SLOT_PNTC)
1108             point_sprite = true;
1109
1110          if (point_sprite)
1111             *point_sprite_enables |= (1 << input_index);
1112       }
1113
1114       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1115       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1116
1117       if (!point_sprite) {
1118          genX(get_attr_override)(&attribute,
1119                                  &brw->vue_map_geom_out,
1120                                  *urb_entry_read_offset, attr,
1121                                  _mesa_vertex_program_two_side_enabled(ctx),
1122                                  &max_source_attr);
1123       }
1124
1125       /* The hardware can only do the overrides on 16 overrides at a
1126        * time, and the other up to 16 have to be lined up so that the
1127        * input index = the output index.  We'll need to do some
1128        * tweaking to make sure that's the case.
1129        */
1130       if (input_index < 16)
1131          attr_overrides[input_index] = attribute;
1132       else
1133          assert(attribute.SourceAttribute == input_index);
1134    }
1135
1136    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1137     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1138     *
1139     * "This field should be set to the minimum length required to read the
1140     *  maximum source attribute.  The maximum source attribute is indicated
1141     *  by the maximum value of the enabled Attribute # Source Attribute if
1142     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1143     *  enable is not set.
1144     *  read_length = ceiling((max_source_attr + 1) / 2)
1145     *
1146     *  [errata] Corruption/Hang possible if length programmed larger than
1147     *  recommended"
1148     *
1149     * Similar text exists for Ivy Bridge.
1150     */
1151    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1152 }
1153 #endif
1154
1155 /* ---------------------------------------------------------------------- */
1156
1157 #if GEN_GEN >= 6
1158 static void
1159 genX(upload_depth_stencil_state)(struct brw_context *brw)
1160 {
1161    struct gl_context *ctx = &brw->ctx;
1162
1163    /* _NEW_BUFFERS */
1164    struct intel_renderbuffer *depth_irb =
1165       intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1166
1167    /* _NEW_DEPTH */
1168    struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1169
1170    /* _NEW_STENCIL */
1171    struct gl_stencil_attrib *stencil = &ctx->Stencil;
1172    const int b = stencil->_BackFace;
1173
1174 #if GEN_GEN >= 8
1175    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
1176 #else
1177    uint32_t ds_offset;
1178    brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, wmds) {
1179 #endif
1180       if (depth->Test && depth_irb) {
1181          wmds.DepthTestEnable = true;
1182          wmds.DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1183          wmds.DepthTestFunction = intel_translate_compare_func(depth->Func);
1184       }
1185
1186       if (brw->stencil_enabled) {
1187          wmds.StencilTestEnable = true;
1188          wmds.StencilWriteMask = stencil->WriteMask[0] & 0xff;
1189          wmds.StencilTestMask = stencil->ValueMask[0] & 0xff;
1190
1191          wmds.StencilTestFunction =
1192             intel_translate_compare_func(stencil->Function[0]);
1193          wmds.StencilFailOp =
1194             intel_translate_stencil_op(stencil->FailFunc[0]);
1195          wmds.StencilPassDepthPassOp =
1196             intel_translate_stencil_op(stencil->ZPassFunc[0]);
1197          wmds.StencilPassDepthFailOp =
1198             intel_translate_stencil_op(stencil->ZFailFunc[0]);
1199
1200          wmds.StencilBufferWriteEnable = brw->stencil_write_enabled;
1201
1202          if (brw->stencil_two_sided) {
1203             wmds.DoubleSidedStencilEnable = true;
1204             wmds.BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1205             wmds.BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1206
1207             wmds.BackfaceStencilTestFunction =
1208                intel_translate_compare_func(stencil->Function[b]);
1209             wmds.BackfaceStencilFailOp =
1210                intel_translate_stencil_op(stencil->FailFunc[b]);
1211             wmds.BackfaceStencilPassDepthPassOp =
1212                intel_translate_stencil_op(stencil->ZPassFunc[b]);
1213             wmds.BackfaceStencilPassDepthFailOp =
1214                intel_translate_stencil_op(stencil->ZFailFunc[b]);
1215          }
1216
1217 #if GEN_GEN >= 9
1218          wmds.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1219          wmds.BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1220 #endif
1221       }
1222    }
1223
1224 #if GEN_GEN == 6
1225    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
1226       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1227       ptr.DEPTH_STENCIL_STATEChange = true;
1228    }
1229 #elif GEN_GEN == 7
1230    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
1231       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1232    }
1233 #endif
1234 }
1235
1236 static const struct brw_tracked_state genX(depth_stencil_state) = {
1237    .dirty = {
1238       .mesa = _NEW_BUFFERS |
1239               _NEW_DEPTH |
1240               _NEW_STENCIL,
1241       .brw  = BRW_NEW_BLORP |
1242               (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
1243                             : BRW_NEW_BATCH |
1244                               BRW_NEW_STATE_BASE_ADDRESS),
1245    },
1246    .emit = genX(upload_depth_stencil_state),
1247 };
1248 #endif
1249
1250 /* ---------------------------------------------------------------------- */
1251
1252 #if GEN_GEN >= 6
1253 static void
1254 genX(upload_clip_state)(struct brw_context *brw)
1255 {
1256    struct gl_context *ctx = &brw->ctx;
1257
1258    /* _NEW_BUFFERS */
1259    struct gl_framebuffer *fb = ctx->DrawBuffer;
1260
1261    /* BRW_NEW_FS_PROG_DATA */
1262    struct brw_wm_prog_data *wm_prog_data =
1263       brw_wm_prog_data(brw->wm.base.prog_data);
1264
1265    brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
1266       clip.StatisticsEnable = !brw->meta_in_progress;
1267
1268       if (wm_prog_data->barycentric_interp_modes &
1269           BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
1270          clip.NonPerspectiveBarycentricEnable = true;
1271
1272 #if GEN_GEN >= 7
1273       clip.EarlyCullEnable = true;
1274 #endif
1275
1276 #if GEN_GEN == 7
1277       clip.FrontWinding = brw->polygon_front_bit == _mesa_is_user_fbo(fb);
1278
1279       if (ctx->Polygon.CullFlag) {
1280          switch (ctx->Polygon.CullFaceMode) {
1281          case GL_FRONT:
1282             clip.CullMode = CULLMODE_FRONT;
1283             break;
1284          case GL_BACK:
1285             clip.CullMode = CULLMODE_BACK;
1286             break;
1287          case GL_FRONT_AND_BACK:
1288             clip.CullMode = CULLMODE_BOTH;
1289             break;
1290          default:
1291             unreachable("Should not get here: invalid CullFlag");
1292          }
1293       } else {
1294          clip.CullMode = CULLMODE_NONE;
1295       }
1296 #endif
1297
1298 #if GEN_GEN < 8
1299       clip.UserClipDistanceCullTestEnableBitmask =
1300          brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
1301
1302       clip.ViewportZClipTestEnable = !ctx->Transform.DepthClamp;
1303 #endif
1304
1305       /* _NEW_LIGHT */
1306       if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
1307          clip.TriangleStripListProvokingVertexSelect = 0;
1308          clip.TriangleFanProvokingVertexSelect = 1;
1309          clip.LineStripListProvokingVertexSelect = 0;
1310       } else {
1311          clip.TriangleStripListProvokingVertexSelect = 2;
1312          clip.TriangleFanProvokingVertexSelect = 2;
1313          clip.LineStripListProvokingVertexSelect = 1;
1314       }
1315
1316       /* _NEW_TRANSFORM */
1317       clip.UserClipDistanceClipTestEnableBitmask =
1318          ctx->Transform.ClipPlanesEnabled;
1319
1320 #if GEN_GEN >= 8
1321       clip.ForceUserClipDistanceClipTestEnableBitmask = true;
1322 #endif
1323
1324       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1325          clip.APIMode = APIMODE_D3D;
1326       else
1327          clip.APIMode = APIMODE_OGL;
1328
1329       clip.GuardbandClipTestEnable = true;
1330
1331       /* BRW_NEW_VIEWPORT_COUNT */
1332       const unsigned viewport_count = brw->clip.viewport_count;
1333
1334       if (ctx->RasterDiscard) {
1335          clip.ClipMode = CLIPMODE_REJECT_ALL;
1336 #if GEN_GEN == 6
1337          perf_debug("Rasterizer discard is currently implemented via the "
1338                     "clipper; having the GS not write primitives would "
1339                     "likely be faster.\n");
1340 #endif
1341       } else {
1342          clip.ClipMode = CLIPMODE_NORMAL;
1343       }
1344
1345       clip.ClipEnable = true;
1346
1347       /* _NEW_POLYGON,
1348        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
1349        */
1350       if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
1351          clip.ViewportXYClipTestEnable = true;
1352
1353       clip.MinimumPointWidth = 0.125;
1354       clip.MaximumPointWidth = 255.875;
1355       clip.MaximumVPIndex = viewport_count - 1;
1356       if (_mesa_geometric_layers(fb) == 0)
1357          clip.ForceZeroRTAIndexEnable = true;
1358    }
1359 }
1360
1361 static const struct brw_tracked_state genX(clip_state) = {
1362    .dirty = {
1363       .mesa  = _NEW_BUFFERS |
1364                _NEW_LIGHT |
1365                _NEW_POLYGON |
1366                _NEW_TRANSFORM,
1367       .brw   = BRW_NEW_BLORP |
1368                BRW_NEW_CONTEXT |
1369                BRW_NEW_FS_PROG_DATA |
1370                BRW_NEW_GS_PROG_DATA |
1371                BRW_NEW_VS_PROG_DATA |
1372                BRW_NEW_META_IN_PROGRESS |
1373                BRW_NEW_PRIMITIVE |
1374                BRW_NEW_RASTERIZER_DISCARD |
1375                BRW_NEW_TES_PROG_DATA |
1376                BRW_NEW_VIEWPORT_COUNT,
1377    },
1378    .emit = genX(upload_clip_state),
1379 };
1380 #endif
1381
1382 /* ---------------------------------------------------------------------- */
1383
1384 static void
1385 genX(upload_sf)(struct brw_context *brw)
1386 {
1387    struct gl_context *ctx = &brw->ctx;
1388    float point_size;
1389
1390 #if GEN_GEN <= 7
1391    /* _NEW_BUFFERS */
1392    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
1393    UNUSED const bool multisampled_fbo =
1394       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1395 #endif
1396
1397 #if GEN_GEN < 6
1398    const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
1399
1400    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1401
1402    brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
1403       sf.KernelStartPointer = KSP_ro(brw, brw->sf.prog_offset);
1404       sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1405       sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
1406       sf.DispatchGRFStartRegisterForURBData = 3;
1407       sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
1408       sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
1409       sf.NumberofURBEntries = brw->urb.nr_sf_entries;
1410       sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
1411
1412       /* STATE_PREFETCH command description describes this state as being
1413        * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
1414        * domain.
1415        */
1416       sf.SetupViewportStateOffset =
1417          instruction_ro_bo(brw->batch.bo, brw->sf.vp_offset);
1418
1419       sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1420
1421       /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
1422       /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
1423
1424       sf.MaximumNumberofThreads =
1425          MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
1426
1427       sf.SpritePointEnable = ctx->Point.PointSprite;
1428
1429       sf.DestinationOriginHorizontalBias = 0.5;
1430       sf.DestinationOriginVerticalBias = 0.5;
1431 #else
1432    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
1433       sf.StatisticsEnable = true;
1434 #endif
1435       sf.ViewportTransformEnable = true;
1436
1437 #if GEN_GEN == 7
1438       /* _NEW_BUFFERS */
1439       sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
1440 #endif
1441
1442 #if GEN_GEN <= 7
1443       /* _NEW_POLYGON */
1444       sf.FrontWinding = brw->polygon_front_bit == render_to_fbo;
1445 #if GEN_GEN >= 6
1446       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
1447       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
1448       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
1449
1450       switch (ctx->Polygon.FrontMode) {
1451          case GL_FILL:
1452             sf.FrontFaceFillMode = FILL_MODE_SOLID;
1453             break;
1454          case GL_LINE:
1455             sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
1456             break;
1457          case GL_POINT:
1458             sf.FrontFaceFillMode = FILL_MODE_POINT;
1459             break;
1460          default:
1461             unreachable("not reached");
1462       }
1463
1464       switch (ctx->Polygon.BackMode) {
1465          case GL_FILL:
1466             sf.BackFaceFillMode = FILL_MODE_SOLID;
1467             break;
1468          case GL_LINE:
1469             sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
1470             break;
1471          case GL_POINT:
1472             sf.BackFaceFillMode = FILL_MODE_POINT;
1473             break;
1474          default:
1475             unreachable("not reached");
1476       }
1477
1478       if (multisampled_fbo && ctx->Multisample.Enabled)
1479          sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1480
1481       sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1482       sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1483       sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1484 #endif
1485
1486       sf.ScissorRectangleEnable = true;
1487
1488       if (ctx->Polygon.CullFlag) {
1489          switch (ctx->Polygon.CullFaceMode) {
1490             case GL_FRONT:
1491                sf.CullMode = CULLMODE_FRONT;
1492                break;
1493             case GL_BACK:
1494                sf.CullMode = CULLMODE_BACK;
1495                break;
1496             case GL_FRONT_AND_BACK:
1497                sf.CullMode = CULLMODE_BOTH;
1498                break;
1499             default:
1500                unreachable("not reached");
1501          }
1502       } else {
1503          sf.CullMode = CULLMODE_NONE;
1504       }
1505
1506 #if GEN_IS_HASWELL
1507       sf.LineStippleEnable = ctx->Line.StippleFlag;
1508 #endif
1509
1510 #endif
1511
1512       /* _NEW_LINE */
1513 #if GEN_GEN == 8
1514       if (brw->is_cherryview)
1515          sf.CHVLineWidth = brw_get_line_width(brw);
1516       else
1517          sf.LineWidth = brw_get_line_width(brw);
1518 #else
1519       sf.LineWidth = brw_get_line_width(brw);
1520 #endif
1521
1522       if (ctx->Line.SmoothFlag) {
1523          sf.LineEndCapAntialiasingRegionWidth = _10pixels;
1524 #if GEN_GEN <= 7
1525          sf.AntiAliasingEnable = true;
1526 #endif
1527       }
1528
1529       /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
1530       point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
1531       /* Clamp to the hardware limits */
1532       sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
1533
1534       /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
1535       if (use_state_point_size(brw))
1536          sf.PointWidthSource = State;
1537
1538 #if GEN_GEN >= 8
1539       /* _NEW_POINT | _NEW_MULTISAMPLE */
1540       if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
1541           !ctx->Point.PointSprite)
1542          sf.SmoothPointEnable = true;
1543 #endif
1544
1545 #if GEN_IS_G4X || GEN_GEN >= 5
1546       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
1547 #endif
1548
1549       /* _NEW_LIGHT */
1550       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
1551          sf.TriangleStripListProvokingVertexSelect = 2;
1552          sf.TriangleFanProvokingVertexSelect = 2;
1553          sf.LineStripListProvokingVertexSelect = 1;
1554       } else {
1555          sf.TriangleFanProvokingVertexSelect = 1;
1556       }
1557
1558 #if GEN_GEN == 6
1559       /* BRW_NEW_FS_PROG_DATA */
1560       const struct brw_wm_prog_data *wm_prog_data =
1561          brw_wm_prog_data(brw->wm.base.prog_data);
1562
1563       sf.AttributeSwizzleEnable = true;
1564       sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1565
1566       /*
1567        * Window coordinates in an FBO are inverted, which means point
1568        * sprite origin must be inverted, too.
1569        */
1570       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) {
1571          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
1572       } else {
1573          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
1574       }
1575
1576       /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
1577        * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
1578        */
1579       uint32_t urb_entry_read_length;
1580       uint32_t urb_entry_read_offset;
1581       uint32_t point_sprite_enables;
1582       genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
1583                                      &urb_entry_read_length,
1584                                      &urb_entry_read_offset);
1585       sf.VertexURBEntryReadLength = urb_entry_read_length;
1586       sf.VertexURBEntryReadOffset = urb_entry_read_offset;
1587       sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
1588       sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
1589 #endif
1590    }
1591 }
1592
1593 static const struct brw_tracked_state genX(sf_state) = {
1594    .dirty = {
1595       .mesa  = _NEW_LIGHT |
1596                _NEW_LINE |
1597                _NEW_POINT |
1598                _NEW_PROGRAM |
1599                (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
1600                (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0),
1601       .brw   = BRW_NEW_BLORP |
1602                BRW_NEW_VUE_MAP_GEOM_OUT |
1603                (GEN_GEN <= 5 ? BRW_NEW_BATCH |
1604                                BRW_NEW_PROGRAM_CACHE |
1605                                BRW_NEW_SF_PROG_DATA |
1606                                BRW_NEW_SF_VP |
1607                                BRW_NEW_URB_FENCE
1608                              : 0) |
1609                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
1610                (GEN_GEN >= 6 && GEN_GEN <= 7 ?
1611                                BRW_NEW_GS_PROG_DATA |
1612                                BRW_NEW_PRIMITIVE |
1613                                BRW_NEW_TES_PROG_DATA
1614                              : 0) |
1615                (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
1616                                BRW_NEW_FRAGMENT_PROGRAM
1617                              : 0),
1618    },
1619    .emit = genX(upload_sf),
1620 };
1621
1622 /* ---------------------------------------------------------------------- */
1623
1624 #if GEN_GEN >= 6
1625 static void
1626 genX(upload_wm)(struct brw_context *brw)
1627 {
1628    struct gl_context *ctx = &brw->ctx;
1629
1630    /* BRW_NEW_FS_PROG_DATA */
1631    const struct brw_wm_prog_data *wm_prog_data =
1632       brw_wm_prog_data(brw->wm.base.prog_data);
1633
1634    UNUSED bool writes_depth =
1635       wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
1636
1637 #if GEN_GEN < 7
1638    const struct brw_stage_state *stage_state = &brw->wm.base;
1639    const struct gen_device_info *devinfo = &brw->screen->devinfo;
1640
1641    /* We can't fold this into gen6_upload_wm_push_constants(), because
1642     * according to the SNB PRM, vol 2 part 1 section 7.2.2
1643     * (3DSTATE_CONSTANT_PS [DevSNB]):
1644     *
1645     *     "[DevSNB]: This packet must be followed by WM_STATE."
1646     */
1647    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
1648       if (wm_prog_data->base.nr_params != 0) {
1649          wmcp.Buffer0Valid = true;
1650          /* Pointer to the WM constant buffer.  Covered by the set of
1651           * state flags from gen6_upload_wm_push_constants.
1652           */
1653          wmcp.PointertoPSConstantBuffer0 = stage_state->push_const_offset;
1654          wmcp.PSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
1655       }
1656    }
1657 #endif
1658
1659    brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
1660       wm.StatisticsEnable = true;
1661       wm.LineAntialiasingRegionWidth = _10pixels;
1662       wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1663
1664 #if GEN_GEN < 7
1665       if (wm_prog_data->base.use_alt_mode)
1666          wm.FloatingPointMode = Alternate;
1667
1668       wm.SamplerCount = DIV_ROUND_UP(stage_state->sampler_count, 4);
1669       wm.BindingTableEntryCount = wm_prog_data->base.binding_table.size_bytes / 4;
1670       wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
1671       wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
1672       wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
1673       wm.DispatchGRFStartRegisterForConstantSetupData0 =
1674          wm_prog_data->base.dispatch_grf_start_reg;
1675       wm.DispatchGRFStartRegisterForConstantSetupData2 =
1676          wm_prog_data->dispatch_grf_start_reg_2;
1677       wm.KernelStartPointer0 = stage_state->prog_offset;
1678       wm.KernelStartPointer2 = stage_state->prog_offset +
1679          wm_prog_data->prog_offset_2;
1680       wm.DualSourceBlendEnable =
1681          wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
1682          ctx->Color.Blend[0]._UsesDualSrc;
1683       wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1684       wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1685
1686       /* From the SNB PRM, volume 2 part 1, page 281:
1687        * "If the PS kernel does not need the Position XY Offsets
1688        * to compute a Position XY value, then this field should be
1689        * programmed to POSOFFSET_NONE."
1690        *
1691        * "SW Recommendation: If the PS kernel needs the Position Offsets
1692        * to compute a Position XY value, this field should match Position
1693        * ZW Interpolation Mode to ensure a consistent position.xyzw
1694        * computation."
1695        * We only require XY sample offsets. So, this recommendation doesn't
1696        * look useful at the moment. We might need this in future.
1697        */
1698       if (wm_prog_data->uses_pos_offset)
1699          wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
1700       else
1701          wm.PositionXYOffsetSelect = POSOFFSET_NONE;
1702
1703       if (wm_prog_data->base.total_scratch) {
1704          wm.ScratchSpaceBasePointer =
1705             render_bo(stage_state->scratch_bo,
1706                       ffs(stage_state->per_thread_scratch) - 11);
1707       }
1708
1709       wm.PixelShaderComputedDepth = writes_depth;
1710 #endif
1711
1712       wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1713
1714       /* _NEW_LINE */
1715       wm.LineStippleEnable = ctx->Line.StippleFlag;
1716
1717       /* _NEW_POLYGON */
1718       wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
1719       wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
1720
1721 #if GEN_GEN < 8
1722       /* _NEW_BUFFERS */
1723       const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1724
1725       wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
1726       wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
1727       if (wm_prog_data->uses_kill ||
1728           _mesa_is_alpha_test_enabled(ctx) ||
1729           _mesa_is_alpha_to_coverage_enabled(ctx) ||
1730           wm_prog_data->uses_omask) {
1731          wm.PixelShaderKillsPixel = true;
1732       }
1733
1734       /* _NEW_BUFFERS | _NEW_COLOR */
1735       if (brw_color_buffer_write_enabled(brw) || writes_depth ||
1736           wm_prog_data->has_side_effects || wm.PixelShaderKillsPixel) {
1737          wm.ThreadDispatchEnable = true;
1738       }
1739       if (multisampled_fbo) {
1740          /* _NEW_MULTISAMPLE */
1741          if (ctx->Multisample.Enabled)
1742             wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1743          else
1744             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1745
1746          if (wm_prog_data->persample_dispatch)
1747             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1748          else
1749             wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
1750       } else {
1751          wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1752          wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1753       }
1754
1755 #if GEN_GEN >= 7
1756       wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
1757       wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
1758 #endif
1759
1760       /* The "UAV access enable" bits are unnecessary on HSW because they only
1761        * seem to have an effect on the HW-assisted coherency mechanism which we
1762        * don't need, and the rasterization-related UAV_ONLY flag and the
1763        * DISPATCH_ENABLE bit can be set independently from it.
1764        * C.f. gen8_upload_ps_extra().
1765        *
1766        * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
1767        * _NEW_COLOR
1768        */
1769 #if GEN_IS_HASWELL
1770       if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
1771           wm_prog_data->has_side_effects)
1772          wm.PSUAVonly = ON;
1773 #endif
1774 #endif
1775
1776 #if GEN_GEN >= 7
1777       /* BRW_NEW_FS_PROG_DATA */
1778       if (wm_prog_data->early_fragment_tests)
1779          wm.EarlyDepthStencilControl = EDSC_PREPS;
1780       else if (wm_prog_data->has_side_effects)
1781          wm.EarlyDepthStencilControl = EDSC_PSEXEC;
1782 #endif
1783    }
1784 }
1785
1786 static const struct brw_tracked_state genX(wm_state) = {
1787    .dirty = {
1788       .mesa  = _NEW_LINE |
1789                _NEW_POLYGON |
1790                (GEN_GEN < 8 ? _NEW_BUFFERS |
1791                               _NEW_COLOR |
1792                               _NEW_MULTISAMPLE :
1793                               0) |
1794                (GEN_GEN < 7 ? _NEW_PROGRAM_CONSTANTS : 0),
1795       .brw   = BRW_NEW_BLORP |
1796                BRW_NEW_FS_PROG_DATA |
1797                (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
1798    },
1799    .emit = genX(upload_wm),
1800 };
1801 #endif
1802
1803 /* ---------------------------------------------------------------------- */
1804
1805 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
1806    pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
1807    pkt.SamplerCount       =                                               \
1808       DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
1809    pkt.BindingTableEntryCount =                                           \
1810       stage_prog_data->binding_table.size_bytes / 4;                      \
1811    pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
1812                                                                           \
1813    if (stage_prog_data->total_scratch) {                                  \
1814       pkt.ScratchSpaceBasePointer =                                       \
1815          render_bo(stage_state->scratch_bo, 0);                           \
1816       pkt.PerThreadScratchSpace =                                         \
1817          ffs(stage_state->per_thread_scratch) - 11;                       \
1818    }                                                                      \
1819                                                                           \
1820    pkt.DispatchGRFStartRegisterForURBData =                               \
1821       stage_prog_data->dispatch_grf_start_reg;                            \
1822    pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
1823    pkt.prefix##URBEntryReadOffset = 0;                                    \
1824                                                                           \
1825    pkt.StatisticsEnable = true;                                           \
1826    pkt.Enable           = true;
1827
1828 static void
1829 genX(upload_vs_state)(struct brw_context *brw)
1830 {
1831    UNUSED struct gl_context *ctx = &brw->ctx;
1832    const struct gen_device_info *devinfo = &brw->screen->devinfo;
1833    struct brw_stage_state *stage_state = &brw->vs.base;
1834
1835    /* BRW_NEW_VS_PROG_DATA */
1836    const struct brw_vue_prog_data *vue_prog_data =
1837       brw_vue_prog_data(brw->vs.base.prog_data);
1838    const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
1839
1840    assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
1841           vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
1842
1843 #if GEN_GEN == 6
1844    /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
1845     * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
1846     *
1847     *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
1848     *   command that causes the VS Function Enable to toggle. Pipeline
1849     *   flush can be executed by sending a PIPE_CONTROL command with CS
1850     *   stall bit set and a post sync operation.
1851     *
1852     * We've already done such a flush at the start of state upload, so we
1853     * don't need to do another one here.
1854     */
1855    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
1856       if (stage_state->push_const_size != 0) {
1857          cvs.Buffer0Valid = true;
1858          cvs.PointertoVSConstantBuffer0 = stage_state->push_const_offset;
1859          cvs.VSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
1860       }
1861    }
1862 #endif
1863
1864    if (GEN_GEN == 7 && devinfo->is_ivybridge)
1865       gen7_emit_vs_workaround_flush(brw);
1866
1867 #if GEN_GEN >= 6
1868    brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
1869 #else
1870    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1871    brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
1872 #endif
1873       INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
1874
1875       vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
1876
1877 #if GEN_GEN < 6
1878       vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
1879       vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
1880       vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;
1881
1882       vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
1883       vs.URBEntryAllocationSize = brw->urb.vsize - 1;
1884
1885       vs.MaximumNumberofThreads =
1886          CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
1887
1888       vs.StatisticsEnable = false;
1889       vs.SamplerStatePointer =
1890          instruction_ro_bo(brw->batch.bo, stage_state->sampler_offset);
1891 #endif
1892
1893 #if GEN_GEN == 5
1894       /* Force single program flow on Ironlake.  We cannot reliably get
1895        * all applications working without it.  See:
1896        * https://bugs.freedesktop.org/show_bug.cgi?id=29172
1897        *
1898        * The most notable and reliably failing application is the Humus
1899        * demo "CelShading"
1900        */
1901       vs.SingleProgramFlow = true;
1902       vs.SamplerCount = 0; /* hardware requirement */
1903 #endif
1904
1905 #if GEN_GEN >= 8
1906       vs.SIMD8DispatchEnable =
1907          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
1908
1909       vs.UserClipDistanceCullTestEnableBitmask =
1910          vue_prog_data->cull_distance_mask;
1911 #endif
1912    }
1913
1914 #if GEN_GEN == 6
1915    /* Based on my reading of the simulator, the VS constants don't get
1916     * pulled into the VS FF unit until an appropriate pipeline flush
1917     * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
1918     * references to them into a little FIFO.  The flushes are common,
1919     * but don't reliably happen between this and a 3DPRIMITIVE, causing
1920     * the primitive to use the wrong constants.  Then the FIFO
1921     * containing the constant setup gets added to again on the next
1922     * constants change, and eventually when a flush does happen the
1923     * unit is overwhelmed by constant changes and dies.
1924     *
1925     * To avoid this, send a PIPE_CONTROL down the line that will
1926     * update the unit immediately loading the constants.  The flush
1927     * type bits here were those set by the STATE_BASE_ADDRESS whose
1928     * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
1929     * bug reports that led to this workaround, and may be more than
1930     * what is strictly required to avoid the issue.
1931     */
1932    brw_emit_pipe_control_flush(brw,
1933                                PIPE_CONTROL_DEPTH_STALL |
1934                                PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1935                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
1936 #endif
1937 }
1938
1939 static const struct brw_tracked_state genX(vs_state) = {
1940    .dirty = {
1941       .mesa  = (GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
1942       .brw   = BRW_NEW_BATCH |
1943                BRW_NEW_BLORP |
1944                BRW_NEW_CONTEXT |
1945                BRW_NEW_VS_PROG_DATA |
1946                (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
1947                (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1948                                BRW_NEW_PROGRAM_CACHE |
1949                                BRW_NEW_SAMPLER_STATE_TABLE |
1950                                BRW_NEW_URB_FENCE
1951                              : 0),
1952    },
1953    .emit = genX(upload_vs_state),
1954 };
1955
1956 /* ---------------------------------------------------------------------- */
1957
1958 static void
1959 genX(upload_cc_viewport)(struct brw_context *brw)
1960 {
1961    struct gl_context *ctx = &brw->ctx;
1962
1963    /* BRW_NEW_VIEWPORT_COUNT */
1964    const unsigned viewport_count = brw->clip.viewport_count;
1965
1966    struct GENX(CC_VIEWPORT) ccv;
1967    uint32_t cc_vp_offset;
1968    uint32_t *cc_map =
1969       brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
1970                       32, &cc_vp_offset);
1971
1972    for (unsigned i = 0; i < viewport_count; i++) {
1973       /* _NEW_VIEWPORT | _NEW_TRANSFORM */
1974       const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
1975       if (ctx->Transform.DepthClamp) {
1976          ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
1977          ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
1978       } else {
1979          ccv.MinimumDepth = 0.0;
1980          ccv.MaximumDepth = 1.0;
1981       }
1982       GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
1983       cc_map += GENX(CC_VIEWPORT_length);
1984    }
1985
1986 #if GEN_GEN >= 7
1987    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
1988       ptr.CCViewportPointer = cc_vp_offset;
1989    }
1990 #elif GEN_GEN == 6
1991    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
1992       vp.CCViewportStateChange = 1;
1993       vp.PointertoCC_VIEWPORT = cc_vp_offset;
1994    }
1995 #else
1996    brw->cc.vp_offset = cc_vp_offset;
1997    ctx->NewDriverState |= BRW_NEW_CC_VP;
1998 #endif
1999 }
2000
2001 const struct brw_tracked_state genX(cc_vp) = {
2002    .dirty = {
2003       .mesa = _NEW_TRANSFORM |
2004               _NEW_VIEWPORT,
2005       .brw = BRW_NEW_BATCH |
2006              BRW_NEW_BLORP |
2007              BRW_NEW_VIEWPORT_COUNT,
2008    },
2009    .emit = genX(upload_cc_viewport)
2010 };
2011
2012 /* ---------------------------------------------------------------------- */
2013
2014 static inline void
2015 set_scissor_bits(const struct gl_context *ctx, int i,
2016                  bool render_to_fbo, unsigned fb_width, unsigned fb_height,
2017                  struct GENX(SCISSOR_RECT) *sc)
2018 {
2019    int bbox[4];
2020
2021    bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2022    bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2023    bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
2024    bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2025    _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2026
2027    if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2028       /* If the scissor was out of bounds and got clamped to 0 width/height
2029        * at the bounds, the subtraction of 1 from maximums could produce a
2030        * negative number and thus not clip anything.  Instead, just provide
2031        * a min > max scissor inside the bounds, which produces the expected
2032        * no rendering.
2033        */
2034       sc->ScissorRectangleXMin = 1;
2035       sc->ScissorRectangleXMax = 0;
2036       sc->ScissorRectangleYMin = 1;
2037       sc->ScissorRectangleYMax = 0;
2038    } else if (render_to_fbo) {
2039       /* texmemory: Y=0=bottom */
2040       sc->ScissorRectangleXMin = bbox[0];
2041       sc->ScissorRectangleXMax = bbox[1] - 1;
2042       sc->ScissorRectangleYMin = bbox[2];
2043       sc->ScissorRectangleYMax = bbox[3] - 1;
2044    } else {
2045       /* memory: Y=0=top */
2046       sc->ScissorRectangleXMin = bbox[0];
2047       sc->ScissorRectangleXMax = bbox[1] - 1;
2048       sc->ScissorRectangleYMin = fb_height - bbox[3];
2049       sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2050    }
2051 }
2052
2053 #if GEN_GEN >= 6
2054 static void
2055 genX(upload_scissor_state)(struct brw_context *brw)
2056 {
2057    struct gl_context *ctx = &brw->ctx;
2058    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2059    struct GENX(SCISSOR_RECT) scissor;
2060    uint32_t scissor_state_offset;
2061    const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2062    const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2063    uint32_t *scissor_map;
2064
2065    /* BRW_NEW_VIEWPORT_COUNT */
2066    const unsigned viewport_count = brw->clip.viewport_count;
2067
2068    scissor_map = brw_state_batch(
2069       brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
2070       32, &scissor_state_offset);
2071
2072    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
2073
2074    /* The scissor only needs to handle the intersection of drawable and
2075     * scissor rect.  Clipping to the boundaries of static shared buffers
2076     * for front/back/depth is covered by looping over cliprects in brw_draw.c.
2077     *
2078     * Note that the hardware's coordinates are inclusive, while Mesa's min is
2079     * inclusive but max is exclusive.
2080     */
2081    for (unsigned i = 0; i < viewport_count; i++) {
2082       set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height, &scissor);
2083       GENX(SCISSOR_RECT_pack)(
2084          NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
2085    }
2086
2087    brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2088       ptr.ScissorRectPointer = scissor_state_offset;
2089    }
2090 }
2091
2092 static const struct brw_tracked_state genX(scissor_state) = {
2093    .dirty = {
2094       .mesa = _NEW_BUFFERS |
2095               _NEW_SCISSOR |
2096               _NEW_VIEWPORT,
2097       .brw = BRW_NEW_BATCH |
2098              BRW_NEW_BLORP |
2099              BRW_NEW_VIEWPORT_COUNT,
2100    },
2101    .emit = genX(upload_scissor_state),
2102 };
2103 #endif
2104
2105 /* ---------------------------------------------------------------------- */
2106
2107 static void
2108 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2109                              float m00, float m11, float m30, float m31,
2110                              float *xmin, float *xmax,
2111                              float *ymin, float *ymax)
2112 {
2113    /* According to the "Vertex X,Y Clamping and Quantization" section of the
2114     * Strips and Fans documentation:
2115     *
2116     * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2117     *  fixed-point "guardband" range supported by the rasterization hardware"
2118     *
2119     * and
2120     *
2121     * "In almost all circumstances, if an object’s vertices are actually
2122     *  modified by this clamping (i.e., had X or Y coordinates outside of
2123     *  the guardband extent the rendered object will not match the intended
2124     *  result.  Therefore software should take steps to ensure that this does
2125     *  not happen - e.g., by clipping objects such that they do not exceed
2126     *  these limits after the Drawing Rectangle is applied."
2127     *
2128     * I believe the fundamental restriction is that the rasterizer (in
2129     * the SF/WM stages) have a limit on the number of pixels that can be
2130     * rasterized.  We need to ensure any coordinates beyond the rasterizer
2131     * limit are handled by the clipper.  So effectively that limit becomes
2132     * the clipper's guardband size.
2133     *
2134     * It goes on to say:
2135     *
2136     * "In addition, in order to be correctly rendered, objects must have a
2137     *  screenspace bounding box not exceeding 8K in the X or Y direction.
2138     *  This additional restriction must also be comprehended by software,
2139     *  i.e., enforced by use of clipping."
2140     *
2141     * This makes no sense.  Gen7+ hardware supports 16K render targets,
2142     * and you definitely need to be able to draw polygons that fill the
2143     * surface.  Our assumption is that the rasterizer was limited to 8K
2144     * on Sandybridge, which only supports 8K surfaces, and it was actually
2145     * increased to 16K on Ivybridge and later.
2146     *
2147     * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
2148     */
2149    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2150
2151    if (m00 != 0 && m11 != 0) {
2152       /* First, we compute the screen-space render area */
2153       const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
2154       const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
2155       const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
2156       const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
2157
2158       /* We want the guardband to be centered on that */
2159       const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
2160       const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
2161       const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
2162       const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
2163
2164       /* Now we need it in native device coordinates */
2165       const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
2166       const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
2167       const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
2168       const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
2169
2170       /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
2171        * flipped upside-down.  X should be fine though.
2172        */
2173       assert(ndc_gb_xmin <= ndc_gb_xmax);
2174       *xmin = ndc_gb_xmin;
2175       *xmax = ndc_gb_xmax;
2176       *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
2177       *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
2178    } else {
2179       /* The viewport scales to 0, so nothing will be rendered. */
2180       *xmin = 0.0f;
2181       *xmax = 0.0f;
2182       *ymin = 0.0f;
2183       *ymax = 0.0f;
2184    }
2185 }
2186
2187 static void
2188 genX(upload_sf_clip_viewport)(struct brw_context *brw)
2189 {
2190    struct gl_context *ctx = &brw->ctx;
2191    float y_scale, y_bias;
2192
2193    /* BRW_NEW_VIEWPORT_COUNT */
2194    const unsigned viewport_count = brw->clip.viewport_count;
2195
2196    /* _NEW_BUFFERS */
2197    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2198    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2199    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2200
2201 #if GEN_GEN >= 7
2202 #define clv sfv
2203    struct GENX(SF_CLIP_VIEWPORT) sfv;
2204    uint32_t sf_clip_vp_offset;
2205    uint32_t *sf_clip_map =
2206       brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2207                       64, &sf_clip_vp_offset);
2208 #else
2209    struct GENX(SF_VIEWPORT) sfv;
2210    struct GENX(CLIP_VIEWPORT) clv;
2211    uint32_t sf_vp_offset, clip_vp_offset;
2212    uint32_t *sf_map =
2213       brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2214                       32, &sf_vp_offset);
2215    uint32_t *clip_map =
2216       brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2217                       32, &clip_vp_offset);
2218 #endif
2219
2220    /* _NEW_BUFFERS */
2221    if (render_to_fbo) {
2222       y_scale = 1.0;
2223       y_bias = 0;
2224    } else {
2225       y_scale = -1.0;
2226       y_bias = (float)fb_height;
2227    }
2228
2229    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2230       /* _NEW_VIEWPORT: Guardband Clipping */
2231       float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2232       _mesa_get_viewport_xform(ctx, i, scale, translate);
2233
2234       sfv.ViewportMatrixElementm00 = scale[0];
2235       sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2236       sfv.ViewportMatrixElementm22 = scale[2],
2237       sfv.ViewportMatrixElementm30 = translate[0],
2238       sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2239       sfv.ViewportMatrixElementm32 = translate[2],
2240       brw_calculate_guardband_size(fb_width, fb_height,
2241                                    sfv.ViewportMatrixElementm00,
2242                                    sfv.ViewportMatrixElementm11,
2243                                    sfv.ViewportMatrixElementm30,
2244                                    sfv.ViewportMatrixElementm31,
2245                                    &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2246
2247
2248       clv.XMinClipGuardband = gb_xmin;
2249       clv.XMaxClipGuardband = gb_xmax;
2250       clv.YMinClipGuardband = gb_ymin;
2251       clv.YMaxClipGuardband = gb_ymax;
2252
2253 #if GEN_GEN < 6
2254       set_scissor_bits(ctx, i, render_to_fbo, fb_width, fb_height,
2255                        &sfv.ScissorRectangle);
2256 #elif GEN_GEN >= 8
2257       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2258        * The hardware will take the intersection of the drawing rectangle,
2259        * scissor rectangle, and the viewport extents. We don't need to be
2260        * smart, and can therefore just program the viewport extents.
2261        */
2262       const float viewport_Xmax =
2263          ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width;
2264       const float viewport_Ymax =
2265          ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height;
2266
2267       if (render_to_fbo) {
2268          sfv.XMinViewPort = ctx->ViewportArray[i].X;
2269          sfv.XMaxViewPort = viewport_Xmax - 1;
2270          sfv.YMinViewPort = ctx->ViewportArray[i].Y;
2271          sfv.YMaxViewPort = viewport_Ymax - 1;
2272       } else {
2273          sfv.XMinViewPort = ctx->ViewportArray[i].X;
2274          sfv.XMaxViewPort = viewport_Xmax - 1;
2275          sfv.YMinViewPort = fb_height - viewport_Ymax;
2276          sfv.YMaxViewPort = fb_height - ctx->ViewportArray[i].Y - 1;
2277       }
2278 #endif
2279
2280 #if GEN_GEN >= 7
2281       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2282       sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2283 #else
2284       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2285       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2286       sf_map += GENX(SF_VIEWPORT_length);
2287       clip_map += GENX(CLIP_VIEWPORT_length);
2288 #endif
2289    }
2290
2291 #if GEN_GEN >= 7
2292    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2293       ptr.SFClipViewportPointer = sf_clip_vp_offset;
2294    }
2295 #elif GEN_GEN == 6
2296    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2297       vp.SFViewportStateChange = 1;
2298       vp.CLIPViewportStateChange = 1;
2299       vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2300       vp.PointertoSF_VIEWPORT = sf_vp_offset;
2301    }
2302 #else
2303    brw->sf.vp_offset = sf_vp_offset;
2304    brw->clip.vp_offset = clip_vp_offset;
2305    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2306 #endif
2307 }
2308
2309 static const struct brw_tracked_state genX(sf_clip_viewport) = {
2310    .dirty = {
2311       .mesa = _NEW_BUFFERS |
2312               _NEW_VIEWPORT |
2313               (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
2314       .brw = BRW_NEW_BATCH |
2315              BRW_NEW_BLORP |
2316              BRW_NEW_VIEWPORT_COUNT,
2317    },
2318    .emit = genX(upload_sf_clip_viewport),
2319 };
2320
2321 /* ---------------------------------------------------------------------- */
2322
2323 #if GEN_GEN >= 6
2324 static void
2325 genX(upload_gs_state)(struct brw_context *brw)
2326 {
2327    const struct gen_device_info *devinfo = &brw->screen->devinfo;
2328    const struct brw_stage_state *stage_state = &brw->gs.base;
2329    /* BRW_NEW_GEOMETRY_PROGRAM */
2330    bool active = brw->geometry_program;
2331
2332    /* BRW_NEW_GS_PROG_DATA */
2333    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
2334    const struct brw_vue_prog_data *vue_prog_data =
2335       brw_vue_prog_data(stage_prog_data);
2336 #if GEN_GEN >= 7
2337    const struct brw_gs_prog_data *gs_prog_data =
2338       brw_gs_prog_data(stage_prog_data);
2339 #endif
2340
2341 #if GEN_GEN < 7
2342    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
2343       if (active && stage_state->push_const_size != 0) {
2344          cgs.Buffer0Valid = true;
2345          cgs.PointertoGSConstantBuffer0 = stage_state->push_const_offset;
2346          cgs.GSConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2347       }
2348    }
2349 #endif
2350
2351 #if GEN_GEN == 7 && !GEN_IS_HASWELL
2352    /**
2353     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2354     * Geometry > Geometry Shader > State:
2355     *
2356     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
2357     *     whole fixed function pipeline when the GS enable changes value in
2358     *     the 3DSTATE_GS."
2359     *
2360     * The hardware architects have clarified that in this context "flush the
2361     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2362     * Stall" bit set.
2363     */
2364    if (brw->gt == 2 && brw->gs.enabled != active)
2365       gen7_emit_cs_stall_flush(brw);
2366 #endif
2367
2368    if (active) {
2369       brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
2370          INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);
2371
2372 #if GEN_GEN >= 7
2373          gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2374          gs.OutputTopology = gs_prog_data->output_topology;
2375          gs.ControlDataHeaderSize =
2376             gs_prog_data->control_data_header_size_hwords;
2377
2378          gs.InstanceControl = gs_prog_data->invocations - 1;
2379          gs.DispatchMode = vue_prog_data->dispatch_mode;
2380
2381          gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
2382
2383          gs.ControlDataFormat = gs_prog_data->control_data_format;
2384 #endif
2385
2386          /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
2387           * Ivy Bridge and Haswell.
2388           *
2389           * On Ivy Bridge, setting this bit causes the vertices of a triangle
2390           * strip to be delivered to the geometry shader in an order that does
2391           * not strictly follow the OpenGL spec, but preserves triangle
2392           * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
2393           * the geometry shader sees triangles:
2394           *
2395           * (1, 2, 3), (2, 4, 3), (3, 4, 5)
2396           *
2397           * (Clearing the bit is even worse, because it fails to preserve
2398           * orientation).
2399           *
2400           * Triangle strips with adjacency always ordered in a way that preserves
2401           * triangle orientation but does not strictly follow the OpenGL spec,
2402           * regardless of the setting of this bit.
2403           *
2404           * On Haswell, both triangle strips and triangle strips with adjacency
2405           * are always ordered in a way that preserves triangle orientation.
2406           * Setting this bit causes the ordering to strictly follow the OpenGL
2407           * spec.
2408           *
2409           * So in either case we want to set the bit.  Unfortunately on Ivy
2410           * Bridge this will get the order close to correct but not perfect.
2411           */
2412          gs.ReorderMode = TRAILING;
2413          gs.MaximumNumberofThreads =
2414             GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
2415                          : (devinfo->max_gs_threads - 1);
2416
2417 #if GEN_GEN < 7
2418          gs.SOStatisticsEnable = true;
2419          gs.RenderingEnabled = 1;
2420          if (brw->geometry_program->info.has_transform_feedback_varyings)
2421             gs.SVBIPayloadEnable = true;
2422
2423          /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
2424           * was previously done for gen6.
2425           *
2426           * TODO: test with both disabled to see if the HW is behaving
2427           * as expected, like in gen7.
2428           */
2429          gs.SingleProgramFlow = true;
2430          gs.VectorMaskEnable = true;
2431 #endif
2432
2433 #if GEN_GEN >= 8
2434          gs.ExpectedVertexCount = gs_prog_data->vertices_in;
2435
2436          if (gs_prog_data->static_vertex_count != -1) {
2437             gs.StaticOutput = true;
2438             gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
2439          }
2440          gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
2441
2442          gs.UserClipDistanceCullTestEnableBitmask =
2443             vue_prog_data->cull_distance_mask;
2444
2445          const int urb_entry_write_offset = 1;
2446          const uint32_t urb_entry_output_length =
2447             DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
2448             urb_entry_write_offset;
2449
2450          gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
2451          gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
2452 #endif
2453       }
2454 #if GEN_GEN < 7
2455    } else if (brw->ff_gs.prog_active)  {
2456       /* In gen6, transform feedback for the VS stage is done with an ad-hoc GS
2457        * program. This function provides the needed 3DSTATE_GS for this.
2458        */
2459       upload_gs_state_for_tf(brw);
2460 #endif
2461    } else {
2462       brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
2463          gs.StatisticsEnable = true;
2464 #if GEN_GEN < 7
2465          gs.RenderingEnabled = true;
2466 #endif
2467
2468 #if GEN_GEN < 8
2469          gs.DispatchGRFStartRegisterForURBData = 1;
2470 #if GEN_GEN >= 7
2471          gs.IncludeVertexHandles = true;
2472 #endif
2473 #endif
2474       }
2475    }
2476 #if GEN_GEN < 7
2477    brw->gs.enabled = active;
2478 #endif
2479 }
2480
2481 static const struct brw_tracked_state genX(gs_state) = {
2482    .dirty = {
2483       .mesa  = (GEN_GEN < 7 ? _NEW_PROGRAM_CONSTANTS : 0),
2484       .brw   = BRW_NEW_BATCH |
2485                BRW_NEW_BLORP |
2486                BRW_NEW_CONTEXT |
2487                BRW_NEW_GEOMETRY_PROGRAM |
2488                BRW_NEW_GS_PROG_DATA |
2489                (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
2490    },
2491    .emit = genX(upload_gs_state),
2492 };
2493 #endif
2494
2495 /* ---------------------------------------------------------------------- */
2496
2497 UNUSED static GLenum
2498 fix_dual_blend_alpha_to_one(GLenum function)
2499 {
2500    switch (function) {
2501    case GL_SRC1_ALPHA:
2502       return GL_ONE;
2503
2504    case GL_ONE_MINUS_SRC1_ALPHA:
2505       return GL_ZERO;
2506    }
2507
2508    return function;
2509 }
2510
2511 #define blend_factor(x) brw_translate_blend_factor(x)
2512 #define blend_eqn(x) brw_translate_blend_equation(x)
2513
2514 #if GEN_GEN >= 6
2515 static void
2516 genX(upload_blend_state)(struct brw_context *brw)
2517 {
2518    struct gl_context *ctx = &brw->ctx;
2519    int size;
2520
2521    /* We need at least one BLEND_STATE written, because we might do
2522     * thread dispatch even if _NumColorDrawBuffers is 0 (for example
2523     * for computed depth or alpha test), which will do an FB write
2524     * with render target 0, which will reference BLEND_STATE[0] for
2525     * alpha test enable.
2526     */
2527    int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
2528    if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
2529       nr_draw_buffers = 1;
2530
2531    size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
2532 #if GEN_GEN >= 8
2533    size += GENX(BLEND_STATE_length) * 4;
2534 #endif
2535
2536    uint32_t *blend_map;
2537    blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
2538
2539 #if GEN_GEN >= 8
2540    struct GENX(BLEND_STATE) blend = { 0 };
2541    {
2542 #else
2543    for (int i = 0; i < nr_draw_buffers; i++) {
2544       struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
2545 #define blend entry
2546 #endif
2547       /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
2548        * "If drawbuffer zero is not NONE and the buffer it references has an
2549        * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
2550        * operations are skipped."
2551        */
2552       if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
2553          /* _NEW_MULTISAMPLE */
2554          if (_mesa_is_multisample_enabled(ctx)) {
2555             if (ctx->Multisample.SampleAlphaToCoverage) {
2556                blend.AlphaToCoverageEnable = true;
2557                blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
2558             }
2559             if (ctx->Multisample.SampleAlphaToOne)
2560                blend.AlphaToOneEnable = true;
2561          }
2562
2563          /* _NEW_COLOR */
2564          if (ctx->Color.AlphaEnabled) {
2565             blend.AlphaTestEnable = true;
2566             blend.AlphaTestFunction =
2567                intel_translate_compare_func(ctx->Color.AlphaFunc);
2568          }
2569
2570          if (ctx->Color.DitherFlag) {
2571             blend.ColorDitherEnable = true;
2572          }
2573       }
2574
2575 #if GEN_GEN >= 8
2576       for (int i = 0; i < nr_draw_buffers; i++) {
2577          struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
2578 #else
2579       {
2580 #endif
2581
2582          /* _NEW_BUFFERS */
2583          struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
2584
2585          /* Used for implementing the following bit of GL_EXT_texture_integer:
2586           * "Per-fragment operations that require floating-point color
2587           *  components, including multisample alpha operations, alpha test,
2588           *  blending, and dithering, have no effect when the corresponding
2589           *  colors are written to an integer color buffer."
2590           */
2591          bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
2592
2593          /* _NEW_COLOR */
2594          if (ctx->Color.ColorLogicOpEnabled) {
2595             GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
2596                                 : GL_UNSIGNED_NORMALIZED;
2597             WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
2598                       rb_type != GL_UNSIGNED_NORMALIZED &&
2599                       rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
2600                       "renderbuffer\n",
2601                       _mesa_enum_to_string(ctx->Color.LogicOp),
2602                       _mesa_enum_to_string(rb_type));
2603             if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
2604                entry.LogicOpEnable = true;
2605                entry.LogicOpFunction =
2606                   intel_translate_logic_op(ctx->Color.LogicOp);
2607             }
2608          } else if (ctx->Color.BlendEnabled & (1 << i) && !integer &&
2609                     !ctx->Color._AdvancedBlendMode) {
2610             GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
2611             GLenum eqA = ctx->Color.Blend[i].EquationA;
2612             GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
2613             GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
2614             GLenum srcA = ctx->Color.Blend[i].SrcA;
2615             GLenum dstA = ctx->Color.Blend[i].DstA;
2616
2617             if (eqRGB == GL_MIN || eqRGB == GL_MAX)
2618                srcRGB = dstRGB = GL_ONE;
2619
2620             if (eqA == GL_MIN || eqA == GL_MAX)
2621                srcA = dstA = GL_ONE;
2622
2623             /* Due to hardware limitations, the destination may have information
2624              * in an alpha channel even when the format specifies no alpha
2625              * channel. In order to avoid getting any incorrect blending due to
2626              * that alpha channel, coerce the blend factors to values that will
2627              * not read the alpha channel, but will instead use the correct
2628              * implicit value for alpha.
2629              */
2630             if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
2631                                                      GL_TEXTURE_ALPHA_TYPE)) {
2632                srcRGB = brw_fix_xRGB_alpha(srcRGB);
2633                srcA = brw_fix_xRGB_alpha(srcA);
2634                dstRGB = brw_fix_xRGB_alpha(dstRGB);
2635                dstA = brw_fix_xRGB_alpha(dstA);
2636             }
2637
2638             /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
2639              * "If Dual Source Blending is enabled, this bit must be disabled."
2640              *
2641              * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
2642              * and leave it enabled anyway.
2643              */
2644             if (ctx->Color.Blend[i]._UsesDualSrc && blend.AlphaToOneEnable) {
2645                srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
2646                srcA = fix_dual_blend_alpha_to_one(srcA);
2647                dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
2648                dstA = fix_dual_blend_alpha_to_one(dstA);
2649             }
2650
2651             entry.ColorBufferBlendEnable = true;
2652             entry.DestinationBlendFactor = blend_factor(dstRGB);
2653             entry.SourceBlendFactor = blend_factor(srcRGB);
2654             entry.DestinationAlphaBlendFactor = blend_factor(dstA);
2655             entry.SourceAlphaBlendFactor = blend_factor(srcA);
2656             entry.ColorBlendFunction = blend_eqn(eqRGB);
2657             entry.AlphaBlendFunction = blend_eqn(eqA);
2658
2659             if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
2660                blend.IndependentAlphaBlendEnable = true;
2661          }
2662
2663          /* See section 8.1.6 "Pre-Blend Color Clamping" of the
2664           * SandyBridge PRM Volume 2 Part 1 for HW requirements.
2665           *
2666           * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
2667           * clamping in the fragment shader.  For its clamping of
2668           * blending, the spec says:
2669           *
2670           *     "RESOLVED: For fixed-point color buffers, the inputs and
2671           *      the result of the blending equation are clamped.  For
2672           *      floating-point color buffers, no clamping occurs."
2673           *
2674           * So, generally, we want clamping to the render target's range.
2675           * And, good news, the hardware tables for both pre- and
2676           * post-blend color clamping are either ignored, or any are
2677           * allowed, or clamping is required but RT range clamping is a
2678           * valid option.
2679           */
2680          entry.PreBlendColorClampEnable = true;
2681          entry.PostBlendColorClampEnable = true;
2682          entry.ColorClampRange = COLORCLAMP_RTFORMAT;
2683
2684          entry.WriteDisableRed   = !ctx->Color.ColorMask[i][0];
2685          entry.WriteDisableGreen = !ctx->Color.ColorMask[i][1];
2686          entry.WriteDisableBlue  = !ctx->Color.ColorMask[i][2];
2687          entry.WriteDisableAlpha = !ctx->Color.ColorMask[i][3];
2688
2689 #if GEN_GEN >= 8
2690          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
2691 #else
2692          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
2693 #endif
2694       }
2695    }
2696
2697 #if GEN_GEN >= 8
2698    GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
2699 #endif
2700
2701 #if GEN_GEN < 7
2702    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
2703       ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
2704       ptr.BLEND_STATEChange = true;
2705    }
2706 #else
2707    brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
2708       ptr.BlendStatePointer = brw->cc.blend_state_offset;
2709 #if GEN_GEN >= 8
2710       ptr.BlendStatePointerValid = true;
2711 #endif
2712    }
2713 #endif
2714 }
2715
2716 static const struct brw_tracked_state genX(blend_state) = {
2717    .dirty = {
2718       .mesa = _NEW_BUFFERS |
2719               _NEW_COLOR |
2720               _NEW_MULTISAMPLE,
2721       .brw = BRW_NEW_BATCH |
2722              BRW_NEW_BLORP |
2723              BRW_NEW_STATE_BASE_ADDRESS,
2724    },
2725    .emit = genX(upload_blend_state),
2726 };
2727 #endif
2728
2729 /* ---------------------------------------------------------------------- */
2730
2731 #if GEN_GEN >= 7
2732 UNUSED static const uint32_t push_constant_opcodes[] = {
2733    [MESA_SHADER_VERTEX]                      = 21,
2734    [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
2735    [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
2736    [MESA_SHADER_GEOMETRY]                    = 22,
2737    [MESA_SHADER_FRAGMENT]                    = 23,
2738    [MESA_SHADER_COMPUTE]                     = 0,
2739 };
2740
2741 static void
2742 upload_constant_state(struct brw_context *brw,
2743                       struct brw_stage_state *stage_state,
2744                       bool active, uint32_t stage)
2745 {
2746    UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
2747    active = active && stage_state->push_const_size != 0;
2748
2749    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
2750       pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
2751       if (active) {
2752 #if GEN_GEN >= 8 || GEN_IS_HASWELL
2753          pkt.ConstantBody.ReadLength[2] = stage_state->push_const_size;
2754          pkt.ConstantBody.Buffer[2] =
2755             render_ro_bo(brw->curbe.curbe_bo, stage_state->push_const_offset);
2756 #else
2757          pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
2758          pkt.ConstantBody.Buffer[0].offset =
2759             stage_state->push_const_offset | mocs;
2760 #endif
2761       }
2762    }
2763
2764    brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
2765 }
2766 #endif
2767
2768 #if GEN_GEN >= 6
2769 static void
2770 genX(upload_vs_push_constants)(struct brw_context *brw)
2771 {
2772    struct brw_stage_state *stage_state = &brw->vs.base;
2773
2774    /* _BRW_NEW_VERTEX_PROGRAM */
2775    const struct brw_program *vp = brw_program_const(brw->vertex_program);
2776    /* BRW_NEW_VS_PROG_DATA */
2777    const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
2778
2779    _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_VERTEX);
2780    gen6_upload_push_constants(brw, &vp->program, prog_data, stage_state);
2781
2782 #if GEN_GEN >= 7
2783    if (GEN_GEN == 7 && !GEN_IS_HASWELL && !brw->is_baytrail)
2784       gen7_emit_vs_workaround_flush(brw);
2785
2786    upload_constant_state(brw, stage_state, true /* active */,
2787                          MESA_SHADER_VERTEX);
2788 #endif
2789 }
2790
2791 static const struct brw_tracked_state genX(vs_push_constants) = {
2792    .dirty = {
2793       .mesa  = _NEW_PROGRAM_CONSTANTS |
2794                _NEW_TRANSFORM,
2795       .brw   = BRW_NEW_BATCH |
2796                BRW_NEW_BLORP |
2797                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2798                BRW_NEW_VERTEX_PROGRAM |
2799                BRW_NEW_VS_PROG_DATA,
2800    },
2801    .emit = genX(upload_vs_push_constants),
2802 };
2803
2804 static void
2805 genX(upload_gs_push_constants)(struct brw_context *brw)
2806 {
2807    struct brw_stage_state *stage_state = &brw->gs.base;
2808
2809    /* BRW_NEW_GEOMETRY_PROGRAM */
2810    const struct brw_program *gp = brw_program_const(brw->geometry_program);
2811
2812    if (gp) {
2813       /* BRW_NEW_GS_PROG_DATA */
2814       struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
2815
2816       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_GEOMETRY);
2817       gen6_upload_push_constants(brw, &gp->program, prog_data, stage_state);
2818    }
2819
2820 #if GEN_GEN >= 7
2821    upload_constant_state(brw, stage_state, gp, MESA_SHADER_GEOMETRY);
2822 #endif
2823 }
2824
2825 static const struct brw_tracked_state genX(gs_push_constants) = {
2826    .dirty = {
2827       .mesa  = _NEW_PROGRAM_CONSTANTS |
2828                _NEW_TRANSFORM,
2829       .brw   = BRW_NEW_BATCH |
2830                BRW_NEW_BLORP |
2831                BRW_NEW_GEOMETRY_PROGRAM |
2832                BRW_NEW_GS_PROG_DATA |
2833                BRW_NEW_PUSH_CONSTANT_ALLOCATION,
2834    },
2835    .emit = genX(upload_gs_push_constants),
2836 };
2837
2838 static void
2839 genX(upload_wm_push_constants)(struct brw_context *brw)
2840 {
2841    struct brw_stage_state *stage_state = &brw->wm.base;
2842    /* BRW_NEW_FRAGMENT_PROGRAM */
2843    const struct brw_program *fp = brw_program_const(brw->fragment_program);
2844    /* BRW_NEW_FS_PROG_DATA */
2845    const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
2846
2847    _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_FRAGMENT);
2848
2849    gen6_upload_push_constants(brw, &fp->program, prog_data, stage_state);
2850
2851 #if GEN_GEN >= 7
2852    upload_constant_state(brw, stage_state, true, MESA_SHADER_FRAGMENT);
2853 #endif
2854 }
2855
2856 static const struct brw_tracked_state genX(wm_push_constants) = {
2857    .dirty = {
2858       .mesa  = _NEW_PROGRAM_CONSTANTS,
2859       .brw   = BRW_NEW_BATCH |
2860                BRW_NEW_BLORP |
2861                BRW_NEW_FRAGMENT_PROGRAM |
2862                BRW_NEW_FS_PROG_DATA |
2863                BRW_NEW_PUSH_CONSTANT_ALLOCATION,
2864    },
2865    .emit = genX(upload_wm_push_constants),
2866 };
2867 #endif
2868
2869 /* ---------------------------------------------------------------------- */
2870
2871 #if GEN_GEN >= 6
2872 static unsigned
2873 genX(determine_sample_mask)(struct brw_context *brw)
2874 {
2875    struct gl_context *ctx = &brw->ctx;
2876    float coverage = 1.0f;
2877    float coverage_invert = false;
2878    unsigned sample_mask = ~0u;
2879
2880    /* BRW_NEW_NUM_SAMPLES */
2881    unsigned num_samples = brw->num_samples;
2882
2883    if (_mesa_is_multisample_enabled(ctx)) {
2884       if (ctx->Multisample.SampleCoverage) {
2885          coverage = ctx->Multisample.SampleCoverageValue;
2886          coverage_invert = ctx->Multisample.SampleCoverageInvert;
2887       }
2888       if (ctx->Multisample.SampleMask) {
2889          sample_mask = ctx->Multisample.SampleMaskValue;
2890       }
2891    }
2892
2893    if (num_samples > 1) {
2894       int coverage_int = (int) (num_samples * coverage + 0.5f);
2895       uint32_t coverage_bits = (1 << coverage_int) - 1;
2896       if (coverage_invert)
2897          coverage_bits ^= (1 << num_samples) - 1;
2898       return coverage_bits & sample_mask;
2899    } else {
2900       return 1;
2901    }
2902 }
2903
2904 static void
2905 genX(emit_3dstate_multisample2)(struct brw_context *brw,
2906                                 unsigned num_samples)
2907 {
2908    assert(brw->num_samples <= 16);
2909
2910    unsigned log2_samples = ffs(MAX2(num_samples, 1)) - 1;
2911
2912    brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
2913       multi.PixelLocation = CENTER;
2914       multi.NumberofMultisamples = log2_samples;
2915 #if GEN_GEN == 6
2916       GEN_SAMPLE_POS_4X(multi.Sample);
2917 #elif GEN_GEN == 7
2918       switch (num_samples) {
2919       case 1:
2920          GEN_SAMPLE_POS_1X(multi.Sample);
2921          break;
2922       case 2:
2923          GEN_SAMPLE_POS_2X(multi.Sample);
2924          break;
2925       case 4:
2926          GEN_SAMPLE_POS_4X(multi.Sample);
2927          break;
2928       case 8:
2929          GEN_SAMPLE_POS_8X(multi.Sample);
2930          break;
2931       default:
2932          break;
2933       }
2934 #endif
2935    }
2936 }
2937
2938 static void
2939 genX(upload_multisample_state)(struct brw_context *brw)
2940 {
2941    genX(emit_3dstate_multisample2)(brw, brw->num_samples);
2942
2943    brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
2944       sm.SampleMask = genX(determine_sample_mask)(brw);
2945    }
2946 }
2947
2948 static const struct brw_tracked_state genX(multisample_state) = {
2949    .dirty = {
2950       .mesa = _NEW_MULTISAMPLE,
2951       .brw = BRW_NEW_BLORP |
2952              BRW_NEW_CONTEXT |
2953              BRW_NEW_NUM_SAMPLES,
2954    },
2955    .emit = genX(upload_multisample_state)
2956 };
2957 #endif
2958
2959 /* ---------------------------------------------------------------------- */
2960
2961 #if GEN_GEN >= 6
2962 static void
2963 genX(upload_color_calc_state)(struct brw_context *brw)
2964 {
2965    struct gl_context *ctx = &brw->ctx;
2966
2967    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
2968       /* _NEW_COLOR */
2969       cc.AlphaTestFormat = ALPHATEST_UNORM8;
2970       UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
2971                                ctx->Color.AlphaRef);
2972
2973 #if GEN_GEN < 9
2974       /* _NEW_STENCIL */
2975       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
2976       cc.BackfaceStencilReferenceValue =
2977          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
2978 #endif
2979
2980       /* _NEW_COLOR */
2981       cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
2982       cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
2983       cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
2984       cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
2985    }
2986
2987    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
2988       ptr.ColorCalcStatePointer = brw->cc.state_offset;
2989 #if GEN_GEN != 7
2990       ptr.ColorCalcStatePointerValid = true;
2991 #endif
2992    }
2993 }
2994
2995 static const struct brw_tracked_state genX(color_calc_state) = {
2996    .dirty = {
2997       .mesa = _NEW_COLOR |
2998               _NEW_STENCIL,
2999       .brw = BRW_NEW_BATCH |
3000              BRW_NEW_BLORP |
3001              BRW_NEW_CC_STATE |
3002              BRW_NEW_STATE_BASE_ADDRESS,
3003    },
3004    .emit = genX(upload_color_calc_state),
3005 };
3006
3007 #endif
3008
3009 /* ---------------------------------------------------------------------- */
3010
3011 #if GEN_GEN >= 7
3012 static void
3013 genX(upload_sbe)(struct brw_context *brw)
3014 {
3015    struct gl_context *ctx = &brw->ctx;
3016    /* BRW_NEW_FS_PROG_DATA */
3017    const struct brw_wm_prog_data *wm_prog_data =
3018       brw_wm_prog_data(brw->wm.base.prog_data);
3019 #if GEN_GEN >= 8
3020    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3021 #else
3022 #define attr_overrides sbe.Attribute
3023 #endif
3024    uint32_t urb_entry_read_length;
3025    uint32_t urb_entry_read_offset;
3026    uint32_t point_sprite_enables;
3027
3028    brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3029       sbe.AttributeSwizzleEnable = true;
3030       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3031
3032       /* _NEW_BUFFERS */
3033       bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3034
3035       /* _NEW_POINT
3036        *
3037        * Window coordinates in an FBO are inverted, which means point
3038        * sprite origin must be inverted.
3039        */
3040       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo)
3041          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3042       else
3043          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3044
3045       /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3046        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3047        * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3048        * BRW_NEW_VUE_MAP_GEOM_OUT
3049        */
3050       genX(calculate_attr_overrides)(brw,
3051                                      attr_overrides,
3052                                      &point_sprite_enables,
3053                                      &urb_entry_read_length,
3054                                      &urb_entry_read_offset);
3055
3056       /* Typically, the URB entry read length and offset should be programmed
3057        * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3058        * stage which produces geometry.  However, we don't know the proper
3059        * value until we call calculate_attr_overrides().
3060        *
3061        * To fit with our existing code, we override the inherited values and
3062        * specify it here directly, as we did on previous generations.
3063        */
3064       sbe.VertexURBEntryReadLength = urb_entry_read_length;
3065       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3066       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3067       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3068
3069 #if GEN_GEN >= 8
3070       sbe.ForceVertexURBEntryReadLength = true;
3071       sbe.ForceVertexURBEntryReadOffset = true;
3072 #endif
3073
3074 #if GEN_GEN >= 9
3075       /* prepare the active component dwords */
3076       int input_index = 0;
3077       for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
3078          if (!(brw->fragment_program->info.inputs_read &
3079                BITFIELD64_BIT(attr))) {
3080             continue;
3081          }
3082
3083          assert(input_index < 32);
3084
3085          sbe.AttributeActiveComponentFormat[input_index] = ACTIVE_COMPONENT_XYZW;
3086          ++input_index;
3087       }
3088 #endif
3089    }
3090
3091 #if GEN_GEN >= 8
3092    brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3093       for (int i = 0; i < 16; i++)
3094          sbes.Attribute[i] = attr_overrides[i];
3095    }
3096 #endif
3097
3098 #undef attr_overrides
3099 }
3100
3101 static const struct brw_tracked_state genX(sbe_state) = {
3102    .dirty = {
3103       .mesa  = _NEW_BUFFERS |
3104                _NEW_LIGHT |
3105                _NEW_POINT |
3106                _NEW_POLYGON |
3107                _NEW_PROGRAM,
3108       .brw   = BRW_NEW_BLORP |
3109                BRW_NEW_CONTEXT |
3110                BRW_NEW_FRAGMENT_PROGRAM |
3111                BRW_NEW_FS_PROG_DATA |
3112                BRW_NEW_GS_PROG_DATA |
3113                BRW_NEW_TES_PROG_DATA |
3114                BRW_NEW_VUE_MAP_GEOM_OUT |
3115                (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
3116                              : 0),
3117    },
3118    .emit = genX(upload_sbe),
3119 };
3120 #endif
3121
3122 /* ---------------------------------------------------------------------- */
3123
3124 #if GEN_GEN >= 7
3125 /**
3126  * Outputs the 3DSTATE_SO_DECL_LIST command.
3127  *
3128  * The data output is a series of 64-bit entries containing a SO_DECL per
3129  * stream.  We only have one stream of rendering coming out of the GS unit, so
3130  * we only emit stream 0 (low 16 bits) SO_DECLs.
3131  */
3132 static void
3133 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3134                                   const struct brw_vue_map *vue_map)
3135 {
3136    struct gl_context *ctx = &brw->ctx;
3137    /* BRW_NEW_TRANSFORM_FEEDBACK */
3138    struct gl_transform_feedback_object *xfb_obj =
3139       ctx->TransformFeedback.CurrentObject;
3140    const struct gl_transform_feedback_info *linked_xfb_info =
3141       xfb_obj->program->sh.LinkedTransformFeedback;
3142    struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3143    int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3144    int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3145    int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3146    int max_decls = 0;
3147    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3148
3149    memset(so_decl, 0, sizeof(so_decl));
3150
3151    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3152     * command feels strange -- each dword pair contains a SO_DECL per stream.
3153     */
3154    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3155       const struct gl_transform_feedback_output *output =
3156          &linked_xfb_info->Outputs[i];
3157       const int buffer = output->OutputBuffer;
3158       const int varying = output->OutputRegister;
3159       const unsigned stream_id = output->StreamId;
3160       assert(stream_id < MAX_VERTEX_STREAMS);
3161
3162       buffer_mask[stream_id] |= 1 << buffer;
3163
3164       assert(vue_map->varying_to_slot[varying] >= 0);
3165
3166       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3167        * array.  Instead, it simply increments DstOffset for the following
3168        * input by the number of components that should be skipped.
3169        *
3170        * Our hardware is unusual in that it requires us to program SO_DECLs
3171        * for fake "hole" components, rather than simply taking the offset
3172        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3173        * program as many size = 4 holes as we can, then a final hole to
3174        * accommodate the final 1, 2, or 3 remaining.
3175        */
3176       int skip_components = output->DstOffset - next_offset[buffer];
3177
3178       while (skip_components > 0) {
3179          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3180             .HoleFlag = 1,
3181             .OutputBufferSlot = output->OutputBuffer,
3182             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3183          };
3184          skip_components -= 4;
3185       }
3186
3187       next_offset[buffer] = output->DstOffset + output->NumComponents;
3188
3189       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3190          .OutputBufferSlot = output->OutputBuffer,
3191          .RegisterIndex = vue_map->varying_to_slot[varying],
3192          .ComponentMask =
3193             ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3194       };
3195
3196       if (decls[stream_id] > max_decls)
3197          max_decls = decls[stream_id];
3198    }
3199
3200    uint32_t *dw;
3201    dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3202                         .StreamtoBufferSelects0 = buffer_mask[0],
3203                         .StreamtoBufferSelects1 = buffer_mask[1],
3204                         .StreamtoBufferSelects2 = buffer_mask[2],
3205                         .StreamtoBufferSelects3 = buffer_mask[3],
3206                         .NumEntries0 = decls[0],
3207                         .NumEntries1 = decls[1],
3208                         .NumEntries2 = decls[2],
3209                         .NumEntries3 = decls[3]);
3210
3211    for (int i = 0; i < max_decls; i++) {
3212       GENX(SO_DECL_ENTRY_pack)(
3213          brw, dw + 2 + i * 2,
3214          &(struct GENX(SO_DECL_ENTRY)) {
3215             .Stream0Decl = so_decl[0][i],
3216             .Stream1Decl = so_decl[1][i],
3217             .Stream2Decl = so_decl[2][i],
3218             .Stream3Decl = so_decl[3][i],
3219          });
3220    }
3221 }
3222
3223 static void
3224 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3225 {
3226    struct gl_context *ctx = &brw->ctx;
3227    /* BRW_NEW_TRANSFORM_FEEDBACK */
3228    struct gl_transform_feedback_object *xfb_obj =
3229       ctx->TransformFeedback.CurrentObject;
3230 #if GEN_GEN < 8
3231    const struct gl_transform_feedback_info *linked_xfb_info =
3232       xfb_obj->program->sh.LinkedTransformFeedback;
3233 #else
3234    struct brw_transform_feedback_object *brw_obj =
3235       (struct brw_transform_feedback_object *) xfb_obj;
3236    uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3237 #endif
3238
3239    /* Set up the up to 4 output buffers.  These are the ranges defined in the
3240     * gl_transform_feedback_object.
3241     */
3242    for (int i = 0; i < 4; i++) {
3243       struct intel_buffer_object *bufferobj =
3244          intel_buffer_object(xfb_obj->Buffers[i]);
3245
3246       if (!bufferobj) {
3247          brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3248             sob.SOBufferIndex = i;
3249          }
3250          continue;
3251       }
3252
3253       uint32_t start = xfb_obj->Offset[i];
3254       assert(start % 4 == 0);
3255       uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3256       struct brw_bo *bo =
3257          intel_bufferobj_buffer(brw, bufferobj, start, end - start);
3258       assert(end <= bo->size);
3259
3260       brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3261          sob.SOBufferIndex = i;
3262
3263          sob.SurfaceBaseAddress = render_bo(bo, start);
3264 #if GEN_GEN < 8
3265          sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3266          sob.SurfaceEndAddress = render_bo(bo, end);
3267 #else
3268          sob.SOBufferEnable = true;
3269          sob.StreamOffsetWriteEnable = true;
3270          sob.StreamOutputBufferOffsetAddressEnable = true;
3271          sob.SOBufferMOCS = mocs_wb;
3272
3273          sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
3274          sob.StreamOutputBufferOffsetAddress =
3275             instruction_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3276
3277          if (brw_obj->zero_offsets) {
3278             /* Zero out the offset and write that to offset_bo */
3279             sob.StreamOffset = 0;
3280          } else {
3281             /* Use offset_bo as the "Stream Offset." */
3282             sob.StreamOffset = 0xFFFFFFFF;
3283          }
3284 #endif
3285       }
3286    }
3287
3288 #if GEN_GEN >= 8
3289    brw_obj->zero_offsets = false;
3290 #endif
3291 }
3292
3293 static inline bool
3294 query_active(struct gl_query_object *q)
3295 {
3296    return q && q->Active;
3297 }
3298
3299 static void
3300 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3301                                const struct brw_vue_map *vue_map)
3302 {
3303    struct gl_context *ctx = &brw->ctx;
3304    /* BRW_NEW_TRANSFORM_FEEDBACK */
3305    struct gl_transform_feedback_object *xfb_obj =
3306       ctx->TransformFeedback.CurrentObject;
3307
3308    brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3309       if (active) {
3310          int urb_entry_read_offset = 0;
3311          int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3312             urb_entry_read_offset;
3313
3314          sos.SOFunctionEnable = true;
3315          sos.SOStatisticsEnable = true;
3316
3317          /* BRW_NEW_RASTERIZER_DISCARD */
3318          if (ctx->RasterDiscard) {
3319             if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3320                sos.RenderingDisable = true;
3321             } else {
3322                perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3323                           "query active relies on the clipper.");
3324             }
3325          }
3326
3327          /* _NEW_LIGHT */
3328          if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3329             sos.ReorderMode = TRAILING;
3330
3331 #if GEN_GEN < 8
3332          sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3333          sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3334          sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3335          sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3336 #else
3337          const struct gl_transform_feedback_info *linked_xfb_info =
3338             xfb_obj->program->sh.LinkedTransformFeedback;
3339          /* Set buffer pitches; 0 means unbound. */
3340          if (xfb_obj->Buffers[0])
3341             sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3342          if (xfb_obj->Buffers[1])
3343             sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3344          if (xfb_obj->Buffers[2])
3345             sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3346          if (xfb_obj->Buffers[3])
3347             sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3348 #endif
3349
3350          /* We always read the whole vertex.  This could be reduced at some
3351           * point by reading less and offsetting the register index in the
3352           * SO_DECLs.
3353           */
3354          sos.Stream0VertexReadOffset = urb_entry_read_offset;
3355          sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3356          sos.Stream1VertexReadOffset = urb_entry_read_offset;
3357          sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3358          sos.Stream2VertexReadOffset = urb_entry_read_offset;
3359          sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3360          sos.Stream3VertexReadOffset = urb_entry_read_offset;
3361          sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3362       }
3363    }
3364 }
3365
3366 static void
3367 genX(upload_sol)(struct brw_context *brw)
3368 {
3369    struct gl_context *ctx = &brw->ctx;
3370    /* BRW_NEW_TRANSFORM_FEEDBACK */
3371    bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3372
3373    if (active) {
3374       genX(upload_3dstate_so_buffers)(brw);
3375
3376       /* BRW_NEW_VUE_MAP_GEOM_OUT */
3377       genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3378    }
3379
3380    /* Finally, set up the SOL stage.  This command must always follow updates to
3381     * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3382     * MMIO register updates (current performed by the kernel at each batch
3383     * emit).
3384     */
3385    genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3386 }
3387
3388 static const struct brw_tracked_state genX(sol_state) = {
3389    .dirty = {
3390       .mesa  = _NEW_LIGHT,
3391       .brw   = BRW_NEW_BATCH |
3392                BRW_NEW_BLORP |
3393                BRW_NEW_RASTERIZER_DISCARD |
3394                BRW_NEW_VUE_MAP_GEOM_OUT |
3395                BRW_NEW_TRANSFORM_FEEDBACK,
3396    },
3397    .emit = genX(upload_sol),
3398 };
3399 #endif
3400
3401 /* ---------------------------------------------------------------------- */
3402
3403 #if GEN_GEN >= 7
3404 static void
3405 genX(upload_ps)(struct brw_context *brw)
3406 {
3407    UNUSED const struct gl_context *ctx = &brw->ctx;
3408    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
3409
3410    /* BRW_NEW_FS_PROG_DATA */
3411    const struct brw_wm_prog_data *prog_data =
3412       brw_wm_prog_data(brw->wm.base.prog_data);
3413    const struct brw_stage_state *stage_state = &brw->wm.base;
3414
3415 #if GEN_GEN < 8
3416 #endif
3417
3418    brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
3419       /* Initialize the execution mask with VMask.  Otherwise, derivatives are
3420        * incorrect for subspans where some of the pixels are unlit.  We believe
3421        * the bit just didn't take effect in previous generations.
3422        */
3423       ps.VectorMaskEnable = GEN_GEN >= 8;
3424
3425       ps.SamplerCount =
3426          DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
3427
3428       /* BRW_NEW_FS_PROG_DATA */
3429       ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
3430
3431       if (prog_data->base.use_alt_mode)
3432          ps.FloatingPointMode = Alternate;
3433
3434       /* Haswell requires the sample mask to be set in this packet as well as
3435        * in 3DSTATE_SAMPLE_MASK; the values should match.
3436        */
3437
3438       /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
3439 #if GEN_IS_HASWELL
3440       ps.SampleMask = genX(determine_sample_mask(brw));
3441 #endif
3442
3443       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64;
3444        * it implicitly scales for different GT levels (which have some # of
3445        * PSDs).
3446        *
3447        * In Gen8 the format is U8-2 whereas in Gen9 it is U8-1.
3448        */
3449 #if GEN_GEN >= 9
3450       ps.MaximumNumberofThreadsPerPSD = 64 - 1;
3451 #elif GEN_GEN >= 8
3452       ps.MaximumNumberofThreadsPerPSD = 64 - 2;
3453 #else
3454       ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
3455 #endif
3456
3457       if (prog_data->base.nr_params > 0)
3458          ps.PushConstantEnable = true;
3459
3460 #if GEN_GEN < 8
3461       /* From the IVB PRM, volume 2 part 1, page 287:
3462        * "This bit is inserted in the PS payload header and made available to
3463        * the DataPort (either via the message header or via header bypass) to
3464        * indicate that oMask data (one or two phases) is included in Render
3465        * Target Write messages. If present, the oMask data is used to mask off
3466        * samples."
3467        */
3468       ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
3469
3470       /* The hardware wedges if you have this bit set but don't turn on any
3471        * dual source blend factors.
3472        *
3473        * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
3474        */
3475       ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
3476                                  (ctx->Color.BlendEnabled & 1) &&
3477                                  ctx->Color.Blend[0]._UsesDualSrc;
3478
3479       /* BRW_NEW_FS_PROG_DATA */
3480       ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
3481 #endif
3482
3483       /* From the documentation for this packet:
3484        * "If the PS kernel does not need the Position XY Offsets to
3485        *  compute a Position Value, then this field should be programmed
3486        *  to POSOFFSET_NONE."
3487        *
3488        * "SW Recommendation: If the PS kernel needs the Position Offsets
3489        *  to compute a Position XY value, this field should match Position
3490        *  ZW Interpolation Mode to ensure a consistent position.xyzw
3491        *  computation."
3492        *
3493        * We only require XY sample offsets. So, this recommendation doesn't
3494        * look useful at the moment. We might need this in future.
3495        */
3496       if (prog_data->uses_pos_offset)
3497          ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
3498       else
3499          ps.PositionXYOffsetSelect = POSOFFSET_NONE;
3500
3501       ps.RenderTargetFastClearEnable = brw->wm.fast_clear_op;
3502       ps._8PixelDispatchEnable = prog_data->dispatch_8;
3503       ps._16PixelDispatchEnable = prog_data->dispatch_16;
3504       ps.DispatchGRFStartRegisterForConstantSetupData0 =
3505          prog_data->base.dispatch_grf_start_reg;
3506       ps.DispatchGRFStartRegisterForConstantSetupData2 =
3507          prog_data->dispatch_grf_start_reg_2;
3508
3509       ps.KernelStartPointer0 = stage_state->prog_offset;
3510       ps.KernelStartPointer2 = stage_state->prog_offset +
3511          prog_data->prog_offset_2;
3512
3513       if (prog_data->base.total_scratch) {
3514          ps.ScratchSpaceBasePointer =
3515             render_bo(stage_state->scratch_bo,
3516                       ffs(stage_state->per_thread_scratch) - 11);
3517       }
3518    }
3519 }
3520
3521 static const struct brw_tracked_state genX(ps_state) = {
3522    .dirty = {
3523       .mesa  = _NEW_MULTISAMPLE |
3524                (GEN_GEN < 8 ? _NEW_BUFFERS |
3525                               _NEW_COLOR
3526                             : 0),
3527       .brw   = BRW_NEW_BATCH |
3528                BRW_NEW_BLORP |
3529                BRW_NEW_FS_PROG_DATA,
3530    },
3531    .emit = genX(upload_ps),
3532 };
3533 #endif
3534
3535 /* ---------------------------------------------------------------------- */
3536
3537 #if GEN_GEN >= 7
3538 static void
3539 genX(upload_hs_state)(struct brw_context *brw)
3540 {
3541    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3542    struct brw_stage_state *stage_state = &brw->tcs.base;
3543    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
3544    const struct brw_vue_prog_data *vue_prog_data =
3545       brw_vue_prog_data(stage_prog_data);
3546
3547    /* BRW_NEW_TES_PROG_DATA */
3548    struct brw_tcs_prog_data *tcs_prog_data =
3549       brw_tcs_prog_data(stage_prog_data);
3550
3551    if (!tcs_prog_data) {
3552       brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
3553    } else {
3554       brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
3555          INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
3556
3557          hs.InstanceCount = tcs_prog_data->instances - 1;
3558          hs.IncludeVertexHandles = true;
3559
3560          hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
3561       }
3562    }
3563 }
3564
3565 static const struct brw_tracked_state genX(hs_state) = {
3566    .dirty = {
3567       .mesa  = 0,
3568       .brw   = BRW_NEW_BATCH |
3569                BRW_NEW_BLORP |
3570                BRW_NEW_TCS_PROG_DATA |
3571                BRW_NEW_TESS_PROGRAMS,
3572    },
3573    .emit = genX(upload_hs_state),
3574 };
3575
3576 static void
3577 genX(upload_ds_state)(struct brw_context *brw)
3578 {
3579    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3580    const struct brw_stage_state *stage_state = &brw->tes.base;
3581    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
3582
3583    /* BRW_NEW_TES_PROG_DATA */
3584    const struct brw_tes_prog_data *tes_prog_data =
3585       brw_tes_prog_data(stage_prog_data);
3586    const struct brw_vue_prog_data *vue_prog_data =
3587       brw_vue_prog_data(stage_prog_data);
3588
3589    if (!tes_prog_data) {
3590       brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
3591    } else {
3592       brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
3593          INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
3594
3595         ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
3596         ds.ComputeWCoordinateEnable =
3597            tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
3598
3599 #if GEN_GEN >= 8
3600         if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
3601            ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
3602         ds.UserClipDistanceCullTestEnableBitmask =
3603             vue_prog_data->cull_distance_mask;
3604 #endif
3605       }
3606    }
3607 }
3608
3609 static const struct brw_tracked_state genX(ds_state) = {
3610    .dirty = {
3611       .mesa  = 0,
3612       .brw   = BRW_NEW_BATCH |
3613                BRW_NEW_BLORP |
3614                BRW_NEW_TESS_PROGRAMS |
3615                BRW_NEW_TES_PROG_DATA,
3616    },
3617    .emit = genX(upload_ds_state),
3618 };
3619
3620 /* ---------------------------------------------------------------------- */
3621
3622 static void
3623 upload_te_state(struct brw_context *brw)
3624 {
3625    /* BRW_NEW_TESS_PROGRAMS */
3626    bool active = brw->tess_eval_program;
3627
3628    /* BRW_NEW_TES_PROG_DATA */
3629    const struct brw_tes_prog_data *tes_prog_data =
3630       brw_tes_prog_data(brw->tes.base.prog_data);
3631
3632    if (active) {
3633       brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
3634          te.Partitioning = tes_prog_data->partitioning;
3635          te.OutputTopology = tes_prog_data->output_topology;
3636          te.TEDomain = tes_prog_data->domain;
3637          te.TEEnable = true;
3638          te.MaximumTessellationFactorOdd = 63.0;
3639          te.MaximumTessellationFactorNotOdd = 64.0;
3640       }
3641    } else {
3642       brw_batch_emit(brw, GENX(3DSTATE_TE), te);
3643    }
3644 }
3645
3646 static const struct brw_tracked_state genX(te_state) = {
3647    .dirty = {
3648       .mesa  = 0,
3649       .brw   = BRW_NEW_BLORP |
3650                BRW_NEW_CONTEXT |
3651                BRW_NEW_TES_PROG_DATA |
3652                BRW_NEW_TESS_PROGRAMS,
3653    },
3654    .emit = upload_te_state,
3655 };
3656
3657 /* ---------------------------------------------------------------------- */
3658
3659 static void
3660 genX(upload_tes_push_constants)(struct brw_context *brw)
3661 {
3662    struct brw_stage_state *stage_state = &brw->tes.base;
3663    /* BRW_NEW_TESS_PROGRAMS */
3664    const struct brw_program *tep = brw_program_const(brw->tess_eval_program);
3665
3666    if (tep) {
3667       /* BRW_NEW_TES_PROG_DATA */
3668       const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
3669       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_EVAL);
3670       gen6_upload_push_constants(brw, &tep->program, prog_data, stage_state);
3671    }
3672
3673    upload_constant_state(brw, stage_state, tep, MESA_SHADER_TESS_EVAL);
3674 }
3675
3676 static const struct brw_tracked_state genX(tes_push_constants) = {
3677    .dirty = {
3678       .mesa  = _NEW_PROGRAM_CONSTANTS,
3679       .brw   = BRW_NEW_BATCH |
3680                BRW_NEW_BLORP |
3681                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
3682                BRW_NEW_TESS_PROGRAMS |
3683                BRW_NEW_TES_PROG_DATA,
3684    },
3685    .emit = genX(upload_tes_push_constants),
3686 };
3687
3688 static void
3689 genX(upload_tcs_push_constants)(struct brw_context *brw)
3690 {
3691    struct brw_stage_state *stage_state = &brw->tcs.base;
3692    /* BRW_NEW_TESS_PROGRAMS */
3693    const struct brw_program *tcp = brw_program_const(brw->tess_ctrl_program);
3694    bool active = brw->tess_eval_program;
3695
3696    if (active) {
3697       /* BRW_NEW_TCS_PROG_DATA */
3698       const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
3699
3700       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_TESS_CTRL);
3701       gen6_upload_push_constants(brw, &tcp->program, prog_data, stage_state);
3702    }
3703
3704    upload_constant_state(brw, stage_state, active, MESA_SHADER_TESS_CTRL);
3705 }
3706
3707 static const struct brw_tracked_state genX(tcs_push_constants) = {
3708    .dirty = {
3709       .mesa  = _NEW_PROGRAM_CONSTANTS,
3710       .brw   = BRW_NEW_BATCH |
3711                BRW_NEW_BLORP |
3712                BRW_NEW_DEFAULT_TESS_LEVELS |
3713                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
3714                BRW_NEW_TESS_PROGRAMS |
3715                BRW_NEW_TCS_PROG_DATA,
3716    },
3717    .emit = genX(upload_tcs_push_constants),
3718 };
3719
3720 #endif
3721
3722 /* ---------------------------------------------------------------------- */
3723
3724 #if GEN_GEN >= 7
3725 static void
3726 genX(upload_cs_state)(struct brw_context *brw)
3727 {
3728    if (!brw->cs.base.prog_data)
3729       return;
3730
3731    uint32_t offset;
3732    uint32_t *desc = (uint32_t*) brw_state_batch(
3733       brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
3734       &offset);
3735
3736    struct brw_stage_state *stage_state = &brw->cs.base;
3737    struct brw_stage_prog_data *prog_data = stage_state->prog_data;
3738    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3739    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3740
3741    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
3742       brw_emit_buffer_surface_state(
3743          brw, &stage_state->surf_offset[
3744                  prog_data->binding_table.shader_time_start],
3745          brw->shader_time.bo, 0, ISL_FORMAT_RAW,
3746          brw->shader_time.bo->size, 1, true);
3747    }
3748
3749    uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
3750                                     32, &stage_state->bind_bo_offset);
3751
3752    brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
3753       if (prog_data->total_scratch) {
3754          uint32_t bo_offset;
3755
3756          if (GEN_GEN >= 8) {
3757             /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
3758              * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
3759              */
3760             bo_offset = ffs(stage_state->per_thread_scratch) - 11;
3761          } else if (GEN_IS_HASWELL) {
3762             /* Haswell's Per Thread Scratch Space is in the range [0, 10]
3763              * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
3764              */
3765             bo_offset = ffs(stage_state->per_thread_scratch) - 12;
3766          } else {
3767             /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
3768              * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
3769              */
3770             bo_offset = stage_state->per_thread_scratch / 1024 - 1;
3771          }
3772          vfe.ScratchSpaceBasePointer =
3773             render_bo(stage_state->scratch_bo, bo_offset);
3774       }
3775
3776       const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
3777       vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
3778       vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
3779       vfe.ResetGatewayTimer =
3780          Resettingrelativetimerandlatchingtheglobaltimestamp;
3781 #if GEN_GEN < 9
3782       vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
3783 #endif
3784 #if GEN_GEN == 7
3785       vfe.GPGPUMode = 1;
3786 #endif
3787
3788       /* We are uploading duplicated copies of push constant uniforms for each
3789        * thread. Although the local id data needs to vary per thread, it won't
3790        * change for other uniform data. Unfortunately this duplication is
3791        * required for gen7. As of Haswell, this duplication can be avoided,
3792        * but this older mechanism with duplicated data continues to work.
3793        *
3794        * FINISHME: As of Haswell, we could make use of the
3795        * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
3796        * field to only store one copy of uniform data.
3797        *
3798        * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
3799        * which is described in the GPGPU_WALKER command and in the Broadwell
3800        * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
3801        * Operations => GPGPU Mode => Indirect Payload Storage.
3802        *
3803        * Note: The constant data is built in brw_upload_cs_push_constants
3804        * below.
3805        */
3806       vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
3807
3808       const uint32_t vfe_curbe_allocation =
3809          ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
3810                cs_prog_data->push.cross_thread.regs, 2);
3811       vfe.CURBEAllocationSize = vfe_curbe_allocation;
3812    }
3813
3814    if (cs_prog_data->push.total.size > 0) {
3815       brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
3816          curbe.CURBETotalDataLength =
3817             ALIGN(cs_prog_data->push.total.size, 64);
3818          curbe.CURBEDataStartAddress = stage_state->push_const_offset;
3819       }
3820    }
3821
3822    /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
3823    memcpy(bind, stage_state->surf_offset,
3824           prog_data->binding_table.size_bytes);
3825    const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
3826       .KernelStartPointer = brw->cs.base.prog_offset,
3827       .SamplerStatePointer = stage_state->sampler_offset,
3828       .SamplerCount = DIV_ROUND_UP(stage_state->sampler_count, 4) >> 2,
3829       .BindingTablePointer = stage_state->bind_bo_offset,
3830       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
3831       .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
3832       .SharedLocalMemorySize = encode_slm_size(devinfo->gen,
3833                                                prog_data->total_shared),
3834       .BarrierEnable = cs_prog_data->uses_barrier,
3835 #if GEN_GEN >= 8 || GEN_IS_HASWELL
3836       .CrossThreadConstantDataReadLength =
3837          cs_prog_data->push.cross_thread.regs,
3838 #endif
3839    };
3840
3841    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
3842
3843    brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
3844       load.InterfaceDescriptorTotalLength =
3845          GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
3846       load.InterfaceDescriptorDataStartAddress = offset;
3847    }
3848 }
3849
3850 static const struct brw_tracked_state genX(cs_state) = {
3851    .dirty = {
3852       .mesa = _NEW_PROGRAM_CONSTANTS,
3853       .brw = BRW_NEW_BATCH |
3854              BRW_NEW_BLORP |
3855              BRW_NEW_CS_PROG_DATA |
3856              BRW_NEW_SAMPLER_STATE_TABLE |
3857              BRW_NEW_SURFACES,
3858    },
3859    .emit = genX(upload_cs_state)
3860 };
3861
3862 #endif
3863
3864 /* ---------------------------------------------------------------------- */
3865
3866 #if GEN_GEN >= 8
3867 static void
3868 genX(upload_raster)(struct brw_context *brw)
3869 {
3870    struct gl_context *ctx = &brw->ctx;
3871
3872    /* _NEW_BUFFERS */
3873    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3874
3875    /* _NEW_POLYGON */
3876    struct gl_polygon_attrib *polygon = &ctx->Polygon;
3877
3878    /* _NEW_POINT */
3879    struct gl_point_attrib *point = &ctx->Point;
3880
3881    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
3882       if (brw->polygon_front_bit == render_to_fbo)
3883          raster.FrontWinding = CounterClockwise;
3884
3885       if (polygon->CullFlag) {
3886          switch (polygon->CullFaceMode) {
3887          case GL_FRONT:
3888             raster.CullMode = CULLMODE_FRONT;
3889             break;
3890          case GL_BACK:
3891             raster.CullMode = CULLMODE_BACK;
3892             break;
3893          case GL_FRONT_AND_BACK:
3894             raster.CullMode = CULLMODE_BOTH;
3895             break;
3896          default:
3897             unreachable("not reached");
3898          }
3899       } else {
3900          raster.CullMode = CULLMODE_NONE;
3901       }
3902
3903       point->SmoothFlag = raster.SmoothPointEnable;
3904
3905       raster.DXMultisampleRasterizationEnable =
3906          _mesa_is_multisample_enabled(ctx);
3907
3908       raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
3909       raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
3910       raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
3911
3912       switch (polygon->FrontMode) {
3913       case GL_FILL:
3914          raster.FrontFaceFillMode = FILL_MODE_SOLID;
3915          break;
3916       case GL_LINE:
3917          raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
3918          break;
3919       case GL_POINT:
3920          raster.FrontFaceFillMode = FILL_MODE_POINT;
3921          break;
3922       default:
3923          unreachable("not reached");
3924       }
3925
3926       switch (polygon->BackMode) {
3927       case GL_FILL:
3928          raster.BackFaceFillMode = FILL_MODE_SOLID;
3929          break;
3930       case GL_LINE:
3931          raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
3932          break;
3933       case GL_POINT:
3934          raster.BackFaceFillMode = FILL_MODE_POINT;
3935          break;
3936       default:
3937          unreachable("not reached");
3938       }
3939
3940       /* _NEW_LINE */
3941       raster.AntialiasingEnable = ctx->Line.SmoothFlag;
3942
3943       /* _NEW_SCISSOR */
3944       raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
3945
3946       /* _NEW_TRANSFORM */
3947       if (!ctx->Transform.DepthClamp) {
3948 #if GEN_GEN >= 9
3949          raster.ViewportZFarClipTestEnable = true;
3950          raster.ViewportZNearClipTestEnable = true;
3951 #else
3952          raster.ViewportZClipTestEnable = true;
3953 #endif
3954       }
3955
3956       /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
3957 #if GEN_GEN >= 9
3958       raster.ConservativeRasterizationEnable =
3959          ctx->IntelConservativeRasterization;
3960 #endif
3961
3962       raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
3963       raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
3964
3965       raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
3966    }
3967 }
3968
3969 static const struct brw_tracked_state genX(raster_state) = {
3970    .dirty = {
3971       .mesa  = _NEW_BUFFERS |
3972                _NEW_LINE |
3973                _NEW_MULTISAMPLE |
3974                _NEW_POINT |
3975                _NEW_POLYGON |
3976                _NEW_SCISSOR |
3977                _NEW_TRANSFORM,
3978       .brw   = BRW_NEW_BLORP |
3979                BRW_NEW_CONTEXT |
3980                BRW_NEW_CONSERVATIVE_RASTERIZATION,
3981    },
3982    .emit = genX(upload_raster),
3983 };
3984 #endif
3985
3986 /* ---------------------------------------------------------------------- */
3987
3988 #if GEN_GEN >= 8
3989 static void
3990 genX(upload_ps_extra)(struct brw_context *brw)
3991 {
3992    UNUSED struct gl_context *ctx = &brw->ctx;
3993
3994    const struct brw_wm_prog_data *prog_data =
3995       brw_wm_prog_data(brw->wm.base.prog_data);
3996
3997    brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
3998       psx.PixelShaderValid = true;
3999       psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4000       psx.PixelShaderKillsPixel = prog_data->uses_kill;
4001       psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4002       psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4003       psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4004       psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4005
4006       /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4007       if (prog_data->uses_sample_mask) {
4008 #if GEN_GEN >= 9
4009          if (prog_data->post_depth_coverage)
4010             psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4011          else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4012             psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4013          else
4014             psx.InputCoverageMaskState = ICMS_NORMAL;
4015 #else
4016          psx.PixelShaderUsesInputCoverageMask = true;
4017 #endif
4018       }
4019
4020       psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4021 #if GEN_GEN >= 9
4022       psx.PixelShaderPullsBary = prog_data->pulls_bary;
4023       psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4024 #endif
4025
4026       /* The stricter cross-primitive coherency guarantees that the hardware
4027        * gives us with the "Accesses UAV" bit set for at least one shader stage
4028        * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4029        * are redundant within the current image, atomic counter and SSBO GL
4030        * APIs, which all have very loose ordering and coherency requirements
4031        * and generally rely on the application to insert explicit barriers when
4032        * a shader invocation is expected to see the memory writes performed by
4033        * the invocations of some previous primitive.  Regardless of the value
4034        * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4035        * cause an in most cases useless DC flush when the lowermost stage with
4036        * the bit set finishes execution.
4037        *
4038        * It would be nice to disable it, but in some cases we can't because on
4039        * Gen8+ it also has an influence on rasterization via the PS UAV-only
4040        * signal (which could be set independently from the coherency mechanism
4041        * in the 3DSTATE_WM command on Gen7), and because in some cases it will
4042        * determine whether the hardware skips execution of the fragment shader
4043        * or not via the ThreadDispatchEnable signal.  However if we know that
4044        * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4045        * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4046        * difference so we may just disable it here.
4047        *
4048        * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4049        * take into account KillPixels when no depth or stencil writes are
4050        * enabled.  In order for occlusion queries to work correctly with no
4051        * attachments, we need to force-enable here.
4052        *
4053        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4054        * _NEW_COLOR
4055        */
4056       if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4057           !brw_color_buffer_write_enabled(brw))
4058          psx.PixelShaderHasUAV = true;
4059    }
4060 }
4061
4062 const struct brw_tracked_state genX(ps_extra) = {
4063    .dirty = {
4064       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4065       .brw   = BRW_NEW_BLORP |
4066                BRW_NEW_CONTEXT |
4067                BRW_NEW_FRAGMENT_PROGRAM |
4068                BRW_NEW_FS_PROG_DATA |
4069                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4070    },
4071    .emit = genX(upload_ps_extra),
4072 };
4073 #endif
4074
4075 /* ---------------------------------------------------------------------- */
4076
4077 #if GEN_GEN >= 8
4078 static void
4079 genX(upload_ps_blend)(struct brw_context *brw)
4080 {
4081    struct gl_context *ctx = &brw->ctx;
4082
4083    /* _NEW_BUFFERS */
4084    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4085    const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4086
4087    /* _NEW_COLOR */
4088    struct gl_colorbuffer_attrib *color = &ctx->Color;
4089
4090    brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4091       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4092       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4093
4094       bool alpha_to_one = false;
4095
4096       if (!buffer0_is_integer) {
4097          /* _NEW_MULTISAMPLE */
4098
4099          if (_mesa_is_multisample_enabled(ctx)) {
4100             pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4101             alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4102          }
4103
4104          pb.AlphaTestEnable = color->AlphaEnabled;
4105       }
4106
4107       /* Used for implementing the following bit of GL_EXT_texture_integer:
4108        * "Per-fragment operations that require floating-point color
4109        *  components, including multisample alpha operations, alpha test,
4110        *  blending, and dithering, have no effect when the corresponding
4111        *  colors are written to an integer color buffer."
4112        *
4113        * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4114        * "If drawbuffer zero is not NONE and the buffer it references has an
4115        *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4116        *  operations are skipped."
4117        */
4118       if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4119          GLenum eqRGB = color->Blend[0].EquationRGB;
4120          GLenum eqA = color->Blend[0].EquationA;
4121          GLenum srcRGB = color->Blend[0].SrcRGB;
4122          GLenum dstRGB = color->Blend[0].DstRGB;
4123          GLenum srcA = color->Blend[0].SrcA;
4124          GLenum dstA = color->Blend[0].DstA;
4125
4126          if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4127             srcRGB = dstRGB = GL_ONE;
4128
4129          if (eqA == GL_MIN || eqA == GL_MAX)
4130             srcA = dstA = GL_ONE;
4131
4132          /* Due to hardware limitations, the destination may have information
4133           * in an alpha channel even when the format specifies no alpha
4134           * channel. In order to avoid getting any incorrect blending due to
4135           * that alpha channel, coerce the blend factors to values that will
4136           * not read the alpha channel, but will instead use the correct
4137           * implicit value for alpha.
4138           */
4139          if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4140                                             GL_TEXTURE_ALPHA_TYPE)) {
4141             srcRGB = brw_fix_xRGB_alpha(srcRGB);
4142             srcA = brw_fix_xRGB_alpha(srcA);
4143             dstRGB = brw_fix_xRGB_alpha(dstRGB);
4144             dstA = brw_fix_xRGB_alpha(dstA);
4145          }
4146
4147          /* Alpha to One doesn't work with Dual Color Blending.  Override
4148           * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4149           */
4150          if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
4151             srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4152             srcA = fix_dual_blend_alpha_to_one(srcA);
4153             dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4154             dstA = fix_dual_blend_alpha_to_one(dstA);
4155          }
4156
4157          pb.ColorBufferBlendEnable = true;
4158          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
4159          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
4160          pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
4161          pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
4162
4163          pb.IndependentAlphaBlendEnable =
4164             srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
4165       }
4166    }
4167 }
4168
4169 static const struct brw_tracked_state genX(ps_blend) = {
4170    .dirty = {
4171       .mesa = _NEW_BUFFERS |
4172               _NEW_COLOR |
4173               _NEW_MULTISAMPLE,
4174       .brw = BRW_NEW_BLORP |
4175              BRW_NEW_CONTEXT |
4176              BRW_NEW_FRAGMENT_PROGRAM,
4177    },
4178    .emit = genX(upload_ps_blend)
4179 };
4180 #endif
4181
4182 /* ---------------------------------------------------------------------- */
4183
4184 #if GEN_GEN >= 8
4185 static void
4186 genX(emit_vf_topology)(struct brw_context *brw)
4187 {
4188    brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
4189       vftopo.PrimitiveTopologyType = brw->primitive;
4190    }
4191 }
4192
4193 static const struct brw_tracked_state genX(vf_topology) = {
4194    .dirty = {
4195       .mesa = 0,
4196       .brw = BRW_NEW_BLORP |
4197              BRW_NEW_PRIMITIVE,
4198    },
4199    .emit = genX(emit_vf_topology),
4200 };
4201 #endif
4202
4203 /* ---------------------------------------------------------------------- */
4204
4205 #if GEN_GEN >= 7
4206 static void
4207 genX(emit_mi_report_perf_count)(struct brw_context *brw,
4208                                 struct brw_bo *bo,
4209                                 uint32_t offset_in_bytes,
4210                                 uint32_t report_id)
4211 {
4212    brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4213       mi_rpc.MemoryAddress = instruction_bo(bo, offset_in_bytes);
4214       mi_rpc.ReportID = report_id;
4215    }
4216 }
4217 #endif
4218
4219 /* ---------------------------------------------------------------------- */
4220
4221 void
4222 genX(init_atoms)(struct brw_context *brw)
4223 {
4224 #if GEN_GEN < 6
4225    static const struct brw_tracked_state *render_atoms[] =
4226    {
4227       /* Once all the programs are done, we know how large urb entry
4228        * sizes need to be and can decide if we need to change the urb
4229        * layout.
4230        */
4231       &brw_curbe_offsets,
4232       &brw_recalculate_urb_fence,
4233
4234       &genX(cc_vp),
4235       &brw_cc_unit,
4236
4237       /* Surface state setup.  Must come before the VS/WM unit.  The binding
4238        * table upload must be last.
4239        */
4240       &brw_vs_pull_constants,
4241       &brw_wm_pull_constants,
4242       &brw_renderbuffer_surfaces,
4243       &brw_renderbuffer_read_surfaces,
4244       &brw_texture_surfaces,
4245       &brw_vs_binding_table,
4246       &brw_wm_binding_table,
4247
4248       &brw_fs_samplers,
4249       &brw_vs_samplers,
4250
4251       /* These set up state for brw_psp_urb_cbs */
4252       &brw_wm_unit,
4253       &genX(sf_clip_viewport),
4254       &genX(sf_state),
4255       &genX(vs_state), /* always required, enabled or not */
4256       &brw_clip_unit,
4257       &brw_gs_unit,
4258
4259       /* Command packets:
4260        */
4261       &brw_invariant_state,
4262
4263       &brw_binding_table_pointers,
4264       &brw_blend_constant_color,
4265
4266       &brw_depthbuffer,
4267
4268       &genX(polygon_stipple),
4269       &genX(polygon_stipple_offset),
4270
4271       &genX(line_stipple),
4272
4273       &brw_psp_urb_cbs,
4274
4275       &genX(drawing_rect),
4276       &brw_indices, /* must come before brw_vertices */
4277       &genX(index_buffer),
4278       &genX(vertices),
4279
4280       &brw_constant_buffer
4281    };
4282 #elif GEN_GEN == 6
4283    static const struct brw_tracked_state *render_atoms[] =
4284    {
4285       &genX(sf_clip_viewport),
4286
4287       /* Command packets: */
4288
4289       &genX(cc_vp),
4290
4291       &gen6_urb,
4292       &genX(blend_state),               /* must do before cc unit */
4293       &genX(color_calc_state),  /* must do before cc unit */
4294       &genX(depth_stencil_state),       /* must do before cc unit */
4295
4296       &genX(vs_push_constants), /* Before vs_state */
4297       &genX(gs_push_constants), /* Before gs_state */
4298       &genX(wm_push_constants), /* Before wm_state */
4299
4300       /* Surface state setup.  Must come before the VS/WM unit.  The binding
4301        * table upload must be last.
4302        */
4303       &brw_vs_pull_constants,
4304       &brw_vs_ubo_surfaces,
4305       &brw_gs_pull_constants,
4306       &brw_gs_ubo_surfaces,
4307       &brw_wm_pull_constants,
4308       &brw_wm_ubo_surfaces,
4309       &gen6_renderbuffer_surfaces,
4310       &brw_renderbuffer_read_surfaces,
4311       &brw_texture_surfaces,
4312       &gen6_sol_surface,
4313       &brw_vs_binding_table,
4314       &gen6_gs_binding_table,
4315       &brw_wm_binding_table,
4316
4317       &brw_fs_samplers,
4318       &brw_vs_samplers,
4319       &brw_gs_samplers,
4320       &gen6_sampler_state,
4321       &genX(multisample_state),
4322
4323       &genX(vs_state),
4324       &genX(gs_state),
4325       &genX(clip_state),
4326       &genX(sf_state),
4327       &genX(wm_state),
4328
4329       &genX(scissor_state),
4330
4331       &gen6_binding_table_pointers,
4332
4333       &brw_depthbuffer,
4334
4335       &genX(polygon_stipple),
4336       &genX(polygon_stipple_offset),
4337
4338       &genX(line_stipple),
4339
4340       &genX(drawing_rect),
4341
4342       &brw_indices, /* must come before brw_vertices */
4343       &genX(index_buffer),
4344       &genX(vertices),
4345    };
4346 #elif GEN_GEN == 7
4347    static const struct brw_tracked_state *render_atoms[] =
4348    {
4349       /* Command packets: */
4350
4351       &genX(cc_vp),
4352       &genX(sf_clip_viewport),
4353
4354       &gen7_l3_state,
4355       &gen7_push_constant_space,
4356       &gen7_urb,
4357       &genX(blend_state),               /* must do before cc unit */
4358       &genX(color_calc_state),  /* must do before cc unit */
4359       &genX(depth_stencil_state),       /* must do before cc unit */
4360
4361       &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
4362       &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
4363       &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
4364       &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
4365       &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
4366
4367       &genX(vs_push_constants), /* Before vs_state */
4368       &genX(tcs_push_constants),
4369       &genX(tes_push_constants),
4370       &genX(gs_push_constants), /* Before gs_state */
4371       &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
4372
4373       /* Surface state setup.  Must come before the VS/WM unit.  The binding
4374        * table upload must be last.
4375        */
4376       &brw_vs_pull_constants,
4377       &brw_vs_ubo_surfaces,
4378       &brw_vs_abo_surfaces,
4379       &brw_tcs_pull_constants,
4380       &brw_tcs_ubo_surfaces,
4381       &brw_tcs_abo_surfaces,
4382       &brw_tes_pull_constants,
4383       &brw_tes_ubo_surfaces,
4384       &brw_tes_abo_surfaces,
4385       &brw_gs_pull_constants,
4386       &brw_gs_ubo_surfaces,
4387       &brw_gs_abo_surfaces,
4388       &brw_wm_pull_constants,
4389       &brw_wm_ubo_surfaces,
4390       &brw_wm_abo_surfaces,
4391       &gen6_renderbuffer_surfaces,
4392       &brw_renderbuffer_read_surfaces,
4393       &brw_texture_surfaces,
4394       &brw_vs_binding_table,
4395       &brw_tcs_binding_table,
4396       &brw_tes_binding_table,
4397       &brw_gs_binding_table,
4398       &brw_wm_binding_table,
4399
4400       &brw_fs_samplers,
4401       &brw_vs_samplers,
4402       &brw_tcs_samplers,
4403       &brw_tes_samplers,
4404       &brw_gs_samplers,
4405       &genX(multisample_state),
4406
4407       &genX(vs_state),
4408       &genX(hs_state),
4409       &genX(te_state),
4410       &genX(ds_state),
4411       &genX(gs_state),
4412       &genX(sol_state),
4413       &genX(clip_state),
4414       &genX(sbe_state),
4415       &genX(sf_state),
4416       &genX(wm_state),
4417       &genX(ps_state),
4418
4419       &genX(scissor_state),
4420
4421       &gen7_depthbuffer,
4422
4423       &genX(polygon_stipple),
4424       &genX(polygon_stipple_offset),
4425
4426       &genX(line_stipple),
4427
4428       &genX(drawing_rect),
4429
4430       &brw_indices, /* must come before brw_vertices */
4431       &genX(index_buffer),
4432       &genX(vertices),
4433
4434 #if GEN_IS_HASWELL
4435       &genX(cut_index),
4436 #endif
4437    };
4438 #elif GEN_GEN >= 8
4439    static const struct brw_tracked_state *render_atoms[] =
4440    {
4441       &genX(cc_vp),
4442       &genX(sf_clip_viewport),
4443
4444       &gen7_l3_state,
4445       &gen7_push_constant_space,
4446       &gen7_urb,
4447       &genX(blend_state),
4448       &genX(color_calc_state),
4449
4450       &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
4451       &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
4452       &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
4453       &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
4454       &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
4455
4456       &genX(vs_push_constants), /* Before vs_state */
4457       &genX(tcs_push_constants),
4458       &genX(tes_push_constants),
4459       &genX(gs_push_constants), /* Before gs_state */
4460       &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
4461
4462       /* Surface state setup.  Must come before the VS/WM unit.  The binding
4463        * table upload must be last.
4464        */
4465       &brw_vs_pull_constants,
4466       &brw_vs_ubo_surfaces,
4467       &brw_vs_abo_surfaces,
4468       &brw_tcs_pull_constants,
4469       &brw_tcs_ubo_surfaces,
4470       &brw_tcs_abo_surfaces,
4471       &brw_tes_pull_constants,
4472       &brw_tes_ubo_surfaces,
4473       &brw_tes_abo_surfaces,
4474       &brw_gs_pull_constants,
4475       &brw_gs_ubo_surfaces,
4476       &brw_gs_abo_surfaces,
4477       &brw_wm_pull_constants,
4478       &brw_wm_ubo_surfaces,
4479       &brw_wm_abo_surfaces,
4480       &gen6_renderbuffer_surfaces,
4481       &brw_renderbuffer_read_surfaces,
4482       &brw_texture_surfaces,
4483       &brw_vs_binding_table,
4484       &brw_tcs_binding_table,
4485       &brw_tes_binding_table,
4486       &brw_gs_binding_table,
4487       &brw_wm_binding_table,
4488
4489       &brw_fs_samplers,
4490       &brw_vs_samplers,
4491       &brw_tcs_samplers,
4492       &brw_tes_samplers,
4493       &brw_gs_samplers,
4494       &genX(multisample_state),
4495
4496       &genX(vs_state),
4497       &genX(hs_state),
4498       &genX(te_state),
4499       &genX(ds_state),
4500       &genX(gs_state),
4501       &genX(sol_state),
4502       &genX(clip_state),
4503       &genX(raster_state),
4504       &genX(sbe_state),
4505       &genX(sf_state),
4506       &genX(ps_blend),
4507       &genX(ps_extra),
4508       &genX(ps_state),
4509       &genX(depth_stencil_state),
4510       &genX(wm_state),
4511
4512       &genX(scissor_state),
4513
4514       &gen7_depthbuffer,
4515
4516       &genX(polygon_stipple),
4517       &genX(polygon_stipple_offset),
4518
4519       &genX(line_stipple),
4520
4521       &genX(drawing_rect),
4522
4523       &genX(vf_topology),
4524
4525       &brw_indices,
4526       &genX(index_buffer),
4527       &genX(vertices),
4528
4529       &genX(cut_index),
4530       &gen8_pma_fix,
4531    };
4532 #endif
4533
4534    STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
4535    brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
4536                            render_atoms, ARRAY_SIZE(render_atoms));
4537
4538 #if GEN_GEN >= 7
4539    static const struct brw_tracked_state *compute_atoms[] =
4540    {
4541       &gen7_l3_state,
4542       &brw_cs_image_surfaces,
4543       &gen7_cs_push_constants,
4544       &brw_cs_pull_constants,
4545       &brw_cs_ubo_surfaces,
4546       &brw_cs_abo_surfaces,
4547       &brw_cs_texture_surfaces,
4548       &brw_cs_work_groups_surface,
4549       &brw_cs_samplers,
4550       &genX(cs_state),
4551    };
4552
4553    STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
4554    brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
4555                            compute_atoms, ARRAY_SIZE(compute_atoms));
4556
4557    brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
4558 #endif
4559 }