src/mesa/drivers/dri/i965/genX_state_upload.c

   1 /*
   2  * Copyright © 2017 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include <assert.h>
  25
  26 #include "dev/gen_device_info.h"
  27 #include "common/gen_sample_positions.h"
  28 #include "genxml/gen_macros.h"
  29
  30 #include "main/bufferobj.h"
  31 #include "main/context.h"
  32 #include "main/enums.h"
  33 #include "main/macros.h"
  34 #include "main/state.h"
  35
  36 #include "brw_context.h"
  37 #include "brw_draw.h"
  38 #include "brw_multisample_state.h"
  39 #include "brw_state.h"
  40 #include "brw_wm.h"
  41 #include "brw_util.h"
  42
  43 #include "intel_batchbuffer.h"
  44 #include "intel_buffer_objects.h"
  45 #include "intel_fbo.h"
  46
  47 #include "main/enums.h"
  48 #include "main/fbobject.h"
  49 #include "main/framebuffer.h"
  50 #include "main/glformats.h"
  51 #include "main/samplerobj.h"
  52 #include "main/shaderapi.h"
  53 #include "main/stencil.h"
  54 #include "main/transformfeedback.h"
  55 #include "main/varray.h"
  56 #include "main/viewport.h"
  57 #include "util/half_float.h"
  58
  59 UNUSED static void *
  60 emit_dwords(struct brw_context *brw, unsigned n)
  61 {
  62    intel_batchbuffer_begin(brw, n);
  63    uint32_t *map = brw->batch.map_next;
  64    brw->batch.map_next += n;
  65    intel_batchbuffer_advance(brw);
  66    return map;
  67 }
  68
/**
 * Address type handed to the genxml pack functions (see __gen_address_type
 * below): a buffer object, an offset, and the relocation flags that
 * __gen_combine_address() forwards to the relocation helpers.
 */
struct brw_address {
   struct brw_bo *bo;      /* Target buffer; NULL means "offset" is used as-is. */
   unsigned reloc_flags;   /* RELOC_* flags passed on to the relocation call. */
   uint32_t offset;        /* Offset to which the field's delta is added. */
};

/* Types consumed by the genxml-generated packing code included further down. */
#define __gen_address_type struct brw_address
#define __gen_user_data struct brw_context
  77
  78 static uint64_t
  79 __gen_combine_address(struct brw_context *brw, void *location,
  80                       struct brw_address address, uint32_t delta)
  81 {
  82    struct intel_batchbuffer *batch = &brw->batch;
  83    uint32_t offset;
  84
  85    if (address.bo == NULL) {
  86       return address.offset + delta;
  87    } else {
  88       if (GEN_GEN < 6 && brw_ptr_in_state_buffer(batch, location)) {
  89          offset = (char *) location - (char *) brw->batch.state.map;
  90          return brw_state_reloc(batch, offset, address.bo,
  91                                 address.offset + delta,
  92                                 address.reloc_flags);
  93       }
  94
  95       assert(!brw_ptr_in_state_buffer(batch, location));
  96
  97       offset = (char *) location - (char *) brw->batch.batch.map;
  98       return brw_batch_reloc(batch, offset, address.bo,
  99                              address.offset + delta,
 100                              address.reloc_flags);
 101    }
 102 }
 103
 104 UNUSED static struct brw_address
 105 rw_bo(struct brw_bo *bo, uint32_t offset)
 106 {
 107    return (struct brw_address) {
 108             .bo = bo,
 109             .offset = offset,
 110             .reloc_flags = RELOC_WRITE,
 111    };
 112 }
 113
 114 static struct brw_address
 115 ro_bo(struct brw_bo *bo, uint32_t offset)
 116 {
 117    return (struct brw_address) {
 118             .bo = bo,
 119             .offset = offset,
 120    };
 121 }
 122
 123 static struct brw_address
 124 rw_32_bo(struct brw_bo *bo, uint32_t offset)
 125 {
 126    return (struct brw_address) {
 127             .bo = bo,
 128             .offset = offset,
 129             .reloc_flags = RELOC_WRITE | RELOC_32BIT,
 130    };
 131 }
 132
 133 static struct brw_address
 134 ro_32_bo(struct brw_bo *bo, uint32_t offset)
 135 {
 136    return (struct brw_address) {
 137             .bo = bo,
 138             .offset = offset,
 139             .reloc_flags = RELOC_32BIT,
 140    };
 141 }
 142
 143 UNUSED static struct brw_address
 144 ggtt_bo(struct brw_bo *bo, uint32_t offset)
 145 {
 146    return (struct brw_address) {
 147             .bo = bo,
 148             .offset = offset,
 149             .reloc_flags = RELOC_WRITE | RELOC_NEEDS_GGTT,
 150    };
 151 }
 152
 153 #if GEN_GEN == 4
 154 static struct brw_address
 155 KSP(struct brw_context *brw, uint32_t offset)
 156 {
 157    return ro_bo(brw->cache.bo, offset);
 158 }
 159 #else
 160 static uint32_t
 161 KSP(UNUSED struct brw_context *brw, uint32_t offset)
 162 {
 163    return offset;
 164 }
 165 #endif
 166
 167 #include "genxml/genX_pack.h"
 168
/* Token-pasting helpers that derive the genxml-generated names
 * (<cmd>_length, <cmd>_length_bias, <cmd>_header, <cmd>_pack) from a
 * command name.
 */
#define _brw_cmd_length(cmd) cmd ## _length
#define _brw_cmd_length_bias(cmd) cmd ## _length_bias
#define _brw_cmd_header(cmd) cmd ## _header
#define _brw_cmd_pack(cmd) cmd ## _pack

/**
 * Emit a fixed-length command into the batch.
 *
 * Used as a block statement: "name" is a struct cmd pre-filled with the
 * command header, and space for the command's dwords is reserved with
 * emit_dwords().  The attached block runs exactly once (the loop's step
 * expression sets _dst to NULL); when it finishes, the struct is packed into
 * the reserved space.
 */
#define brw_batch_emit(brw, cmd, name)                  \
   for (struct cmd name = { _brw_cmd_header(cmd) },     \
        *_dst = emit_dwords(brw, _brw_cmd_length(cmd)); \
        __builtin_expect(_dst != NULL, 1);              \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),   \
        _dst = NULL)

/**
 * Emit the header of a variable-length command that is n dwords long in
 * total.
 *
 * Reserves n dwords, packs a template made of the command header,
 * DWordLength = n - length bias, and any extra field initializers passed in
 * the variadic arguments, then evaluates to a pointer to dword 1 so the
 * caller can fill in the payload.  Uses a GNU statement expression.
 */
#define brw_batch_emitn(brw, cmd, n, ...) ({           \
      uint32_t *_dw = emit_dwords(brw, n);             \
      struct cmd template = {                          \
         _brw_cmd_header(cmd),                         \
         .DWordLength = n - _brw_cmd_length_bias(cmd), \
         __VA_ARGS__                                   \
      };                                               \
      _brw_cmd_pack(cmd)(brw, _dw, &template);         \
      _dw + 1; /* Array starts at dw[1] */             \
   })

/**
 * Same single-pass block pattern as brw_batch_emit(), but the destination is
 * obtained from brw_state_batch() (length in bytes, with the requested
 * alignment; "offset" is passed through to it) and "name" starts out
 * zero-initialized rather than holding a command header.
 */
#define brw_state_emit(brw, cmd, align, offset, name)              \
   for (struct cmd name = {},                                      \
        *_dst = brw_state_batch(brw, _brw_cmd_length(cmd) * 4,     \
                                align, offset);                    \
        __builtin_expect(_dst != NULL, 1);                         \
        _brw_cmd_pack(cmd)(brw, (void *)_dst, &name),              \
        _dst = NULL)
 199
 200 #if GEN_GEN >= 7
 201 MAYBE_UNUSED static void
 202 emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
 203 {
 204    brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) {
 205       lrm.RegisterAddress  = reg;
 206       lrm.MemoryAddress    = addr;
 207    }
 208 }
 209 #endif
 210
 211 MAYBE_UNUSED static void
 212 emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
 213 {
 214    brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) {
 215       lri.RegisterOffset   = reg;
 216       lri.DataDWord        = imm;
 217    }
 218 }
 219
 220 #if GEN_IS_HASWELL || GEN_GEN >= 8
 221 MAYBE_UNUSED static void
 222 emit_lrr(struct brw_context *brw, uint32_t dst, uint32_t src)
 223 {
 224    brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_REG), lrr) {
 225       lrr.SourceRegisterAddress        = src;
 226       lrr.DestinationRegisterAddress   = dst;
 227    }
 228 }
 229 #endif
 230
 231 /**
 232  * Polygon stipple packet
 233  */
 234 static void
 235 genX(upload_polygon_stipple)(struct brw_context *brw)
 236 {
 237    struct gl_context *ctx = &brw->ctx;
 238
 239    /* _NEW_POLYGON */
 240    if (!ctx->Polygon.StippleFlag)
 241       return;
 242
 243    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
 244       /* Polygon stipple is provided in OpenGL order, i.e. bottom
 245        * row first.  If we're rendering to a window (i.e. the
 246        * default frame buffer object, 0), then we need to invert
 247        * it to match our pixel layout.  But if we're rendering
 248        * to a FBO (i.e. any named frame buffer object), we *don't*
 249        * need to invert - we already match the layout.
 250        */
 251       if (ctx->DrawBuffer->FlipY) {
 252          for (unsigned i = 0; i < 32; i++)
 253             poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
 254       } else {
 255          for (unsigned i = 0; i < 32; i++)
 256             poly.PatternRow[i] = ctx->PolygonStipple[i];
 257       }
 258    }
 259 }
 260
 261 static const struct brw_tracked_state genX(polygon_stipple) = {
 262    .dirty = {
 263       .mesa = _NEW_POLYGON |
 264               _NEW_POLYGONSTIPPLE,
 265       .brw = BRW_NEW_CONTEXT,
 266    },
 267    .emit = genX(upload_polygon_stipple),
 268 };
 269
 270 /**
 271  * Polygon stipple offset packet
 272  */
 273 static void
 274 genX(upload_polygon_stipple_offset)(struct brw_context *brw)
 275 {
 276    struct gl_context *ctx = &brw->ctx;
 277
 278    /* _NEW_POLYGON */
 279    if (!ctx->Polygon.StippleFlag)
 280       return;
 281
 282    brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
 283       /* _NEW_BUFFERS
 284        *
 285        * If we're drawing to a system window we have to invert the Y axis
 286        * in order to match the OpenGL pixel coordinate system, and our
 287        * offset must be matched to the window position.  If we're drawing
 288        * to a user-created FBO then our native pixel coordinate system
 289        * works just fine, and there's no window system to worry about.
 290        */
 291       if (ctx->DrawBuffer->FlipY) {
 292          poly.PolygonStippleYOffset =
 293             (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
 294       }
 295    }
 296 }
 297
 298 static const struct brw_tracked_state genX(polygon_stipple_offset) = {
 299    .dirty = {
 300       .mesa = _NEW_BUFFERS |
 301               _NEW_POLYGON,
 302       .brw = BRW_NEW_CONTEXT,
 303    },
 304    .emit = genX(upload_polygon_stipple_offset),
 305 };
 306
 307 /**
 308  * Line stipple packet
 309  */
 310 static void
 311 genX(upload_line_stipple)(struct brw_context *brw)
 312 {
 313    struct gl_context *ctx = &brw->ctx;
 314
 315    if (!ctx->Line.StippleFlag)
 316       return;
 317
 318    brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
 319       line.LineStipplePattern = ctx->Line.StipplePattern;
 320
 321       line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
 322       line.LineStippleRepeatCount = ctx->Line.StippleFactor;
 323    }
 324 }
 325
 326 static const struct brw_tracked_state genX(line_stipple) = {
 327    .dirty = {
 328       .mesa = _NEW_LINE,
 329       .brw = BRW_NEW_CONTEXT,
 330    },
 331    .emit = genX(upload_line_stipple),
 332 };
 333
 334 /* Constant single cliprect for framebuffer object or DRI2 drawing */
 335 static void
 336 genX(upload_drawing_rect)(struct brw_context *brw)
 337 {
 338    struct gl_context *ctx = &brw->ctx;
 339    const struct gl_framebuffer *fb = ctx->DrawBuffer;
 340    const unsigned int fb_width = _mesa_geometric_width(fb);
 341    const unsigned int fb_height = _mesa_geometric_height(fb);
 342
 343    brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
 344       rect.ClippedDrawingRectangleXMax = fb_width - 1;
 345       rect.ClippedDrawingRectangleYMax = fb_height - 1;
 346    }
 347 }
 348
 349 static const struct brw_tracked_state genX(drawing_rect) = {
 350    .dirty = {
 351       .mesa = _NEW_BUFFERS,
 352       .brw = BRW_NEW_BLORP |
 353              BRW_NEW_CONTEXT,
 354    },
 355    .emit = genX(upload_drawing_rect),
 356 };
 357
 358 static uint32_t *
 359 genX(emit_vertex_buffer_state)(struct brw_context *brw,
 360                                uint32_t *dw,
 361                                unsigned buffer_nr,
 362                                struct brw_bo *bo,
 363                                unsigned start_offset,
 364                                MAYBE_UNUSED unsigned end_offset,
 365                                unsigned stride,
 366                                MAYBE_UNUSED unsigned step_rate)
 367 {
 368    struct GENX(VERTEX_BUFFER_STATE) buf_state = {
 369       .VertexBufferIndex = buffer_nr,
 370       .BufferPitch = stride,
 371
 372       /* The VF cache designers apparently cut corners, and made the cache
 373        * only consider the bottom 32 bits of memory addresses.  If you happen
 374        * to have two vertex buffers which get placed exactly 4 GiB apart and
 375        * use them in back-to-back draw calls, you can get collisions.  To work
 376        * around this problem, we restrict vertex buffers to the low 32 bits of
 377        * the address space.
 378        */
 379       .BufferStartingAddress = ro_32_bo(bo, start_offset),
 380 #if GEN_GEN >= 8
 381       .BufferSize = end_offset - start_offset,
 382 #endif
 383
 384 #if GEN_GEN >= 7
 385       .AddressModifyEnable = true,
 386 #endif
 387
 388 #if GEN_GEN < 8
 389       .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
 390       .InstanceDataStepRate = step_rate,
 391 #if GEN_GEN >= 5
 392       .EndAddress = ro_bo(bo, end_offset - 1),
 393 #endif
 394 #endif
 395
 396 #if GEN_GEN == 11
 397       .MOCS = ICL_MOCS_WB,
 398 #elif GEN_GEN == 10
 399       .MOCS = CNL_MOCS_WB,
 400 #elif GEN_GEN == 9
 401       .MOCS = SKL_MOCS_WB,
 402 #elif GEN_GEN == 8
 403       .MOCS = BDW_MOCS_WB,
 404 #elif GEN_GEN == 7
 405       .MOCS = GEN7_MOCS_L3,
 406 #endif
 407    };
 408
 409    GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
 410    return dw + GENX(VERTEX_BUFFER_STATE_length);
 411 }
 412
 413 UNUSED static bool
 414 is_passthru_format(uint32_t format)
 415 {
 416    switch (format) {
 417    case ISL_FORMAT_R64_PASSTHRU:
 418    case ISL_FORMAT_R64G64_PASSTHRU:
 419    case ISL_FORMAT_R64G64B64_PASSTHRU:
 420    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 421       return true;
 422    default:
 423       return false;
 424    }
 425 }
 426
 427 UNUSED static int
 428 uploads_needed(uint32_t format,
 429                bool is_dual_slot)
 430 {
 431    if (!is_passthru_format(format))
 432       return 1;
 433
 434    if (is_dual_slot)
 435       return 2;
 436
 437    switch (format) {
 438    case ISL_FORMAT_R64_PASSTHRU:
 439    case ISL_FORMAT_R64G64_PASSTHRU:
 440       return 1;
 441    case ISL_FORMAT_R64G64B64_PASSTHRU:
 442    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 443       return 2;
 444    default:
 445       unreachable("not reached");
 446    }
 447 }
 448
 449 /*
 450  * Returns the format that we are finally going to use when upload a vertex
 451  * element. It will only change if we are using *64*PASSTHRU formats, as for
 452  * gen < 8 they need to be splitted on two *32*FLOAT formats.
 453  *
 454  * @upload points in which upload we are. Valid values are [0,1]
 455  */
 456 static uint32_t
 457 downsize_format_if_needed(uint32_t format,
 458                           int upload)
 459 {
 460    assert(upload == 0 || upload == 1);
 461
 462    if (!is_passthru_format(format))
 463       return format;
 464
 465    /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload ==
 466     * 1 means that we have been forced to do 2 uploads for a size <= 2. This
 467     * happens with gen < 8 and dvec3 or dvec4 vertex shader input
 468     * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
 469     * flagging that we want to fill with zeroes this second forced upload.
 470     */
 471    switch (format) {
 472    case ISL_FORMAT_R64_PASSTHRU:
 473       return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
 474                          : ISL_FORMAT_R32_FLOAT;
 475    case ISL_FORMAT_R64G64_PASSTHRU:
 476       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 477                          : ISL_FORMAT_R32_FLOAT;
 478    case ISL_FORMAT_R64G64B64_PASSTHRU:
 479       return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
 480                          : ISL_FORMAT_R32G32_FLOAT;
 481    case ISL_FORMAT_R64G64B64A64_PASSTHRU:
 482       return ISL_FORMAT_R32G32B32A32_FLOAT;
 483    default:
 484       unreachable("not reached");
 485    }
 486 }
 487
 488 /*
 489  * Returns the number of componentes associated with a format that is used on
 490  * a 64 to 32 format split. See downsize_format()
 491  */
 492 static int
 493 upload_format_size(uint32_t upload_format)
 494 {
 495    switch (upload_format) {
 496    case ISL_FORMAT_R32_FLOAT:
 497
 498       /* downsized_format has returned this one in order to flag that we are
 499        * performing a second upload which we want to have filled with
 500        * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4
 501        * vertex shader input variables.
 502        */
 503
 504       return 0;
 505    case ISL_FORMAT_R32G32_FLOAT:
 506       return 2;
 507    case ISL_FORMAT_R32G32B32A32_FLOAT:
 508       return 4;
 509    default:
 510       unreachable("not reached");
 511    }
 512 }
 513
 514 static UNUSED uint16_t
 515 pinned_bo_high_bits(struct brw_bo *bo)
 516 {
 517    return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
 518 }
 519
 520 /* The VF cache designers apparently cut corners, and made the cache key's
 521  * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
 522  * of the address.  If you happen to have two vertex buffers which get placed
 523  * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
 524  * collisions.  (These collisions can happen within a single batch.)
 525  *
 526  * In the soft-pin world, we'd like to assign addresses up front, and never
 527  * move buffers.  So, we need to do a VF cache invalidate if the buffer for
 528  * a particular VB slot has different [48:32] address bits than the last one.
 529  *
 530  * In the relocation world, we have no idea what the addresses will be, so
 531  * we can't apply this workaround.  Instead, we tell the kernel to move it
 532  * to the low 4GB regardless.
 533  *
 534  * This HW issue is gone on Gen11+.
 535  */
 536 static void
 537 vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
 538 {
 539 #if GEN_GEN >= 8 && GEN_GEN < 11
 540    bool need_invalidate = false;
 541    unsigned i;
 542
 543    for (i = 0; i < brw->vb.nr_buffers; i++) {
 544       uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
 545
 546       if (high_bits != brw->vb.last_bo_high_bits[i]) {
 547          need_invalidate = true;
 548          brw->vb.last_bo_high_bits[i] = high_bits;
 549       }
 550    }
 551
 552    /* Don't bother with draw parameter buffers - those are generated by
 553     * the driver so we can select a consistent memory zone.
 554     */
 555
 556    if (need_invalidate) {
 557       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
 558    }
 559 #endif
 560 }
 561
 562 static void
 563 vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
 564 {
 565 #if GEN_GEN >= 8
 566    uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
 567
 568    if (high_bits != brw->ib.last_bo_high_bits) {
 569       brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
 570       brw->ib.last_bo_high_bits = high_bits;
 571    }
 572 #endif
 573 }
 574
 575 static void
 576 genX(emit_vertices)(struct brw_context *brw)
 577 {
 578    const struct gen_device_info *devinfo = &brw->screen->devinfo;
 579    uint32_t *dw;
 580
 581    brw_prepare_vertices(brw);
 582    brw_prepare_shader_draw_parameters(brw);
 583
 584 #if GEN_GEN < 6
 585    brw_emit_query_begin(brw);
 586 #endif
 587
 588    const struct brw_vs_prog_data *vs_prog_data =
 589       brw_vs_prog_data(brw->vs.base.prog_data);
 590
 591 #if GEN_GEN >= 8
 592    struct gl_context *ctx = &brw->ctx;
 593    const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
 594                                 ctx->Polygon.BackMode != GL_FILL);
 595
 596    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
 597       unsigned vue = brw->vb.nr_enabled;
 598
 599       /* The element for the edge flags must always be last, so we have to
 600        * insert the SGVS before it in that case.
 601        */
 602       if (uses_edge_flag) {
 603          assert(vue > 0);
 604          vue--;
 605       }
 606
 607       WARN_ONCE(vue >= 33,
 608                 "Trying to insert VID/IID past 33rd vertex element, "
 609                 "need to reorder the vertex attrbutes.");
 610
 611       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
 612          if (vs_prog_data->uses_vertexid) {
 613             vfs.VertexIDEnable = true;
 614             vfs.VertexIDComponentNumber = 2;
 615             vfs.VertexIDElementOffset = vue;
 616          }
 617
 618          if (vs_prog_data->uses_instanceid) {
 619             vfs.InstanceIDEnable = true;
 620             vfs.InstanceIDComponentNumber = 3;
 621             vfs.InstanceIDElementOffset = vue;
 622          }
 623       }
 624
 625       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 626          vfi.InstancingEnable = true;
 627          vfi.VertexElementIndex = vue;
 628       }
 629    } else {
 630       brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
 631    }
 632 #endif
 633
 634    const bool uses_draw_params =
 635       vs_prog_data->uses_firstvertex ||
 636       vs_prog_data->uses_baseinstance;
 637
 638    const bool uses_derived_draw_params =
 639       vs_prog_data->uses_drawid ||
 640       vs_prog_data->uses_is_indexed_draw;
 641
 642    const bool needs_sgvs_element = (uses_draw_params ||
 643                                     vs_prog_data->uses_instanceid ||
 644                                     vs_prog_data->uses_vertexid);
 645
 646    unsigned nr_elements =
 647       brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
 648
 649 #if GEN_GEN < 8
 650    /* If any of the formats of vb.enabled needs more that one upload, we need
 651     * to add it to nr_elements
 652     */
 653    for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
 654       struct brw_vertex_element *input = brw->vb.enabled[i];
 655       const struct gl_array_attributes *glattrib = input->glattrib;
 656       uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 657
 658       if (uploads_needed(format, input->is_dual_slot) > 1)
 659          nr_elements++;
 660    }
 661 #endif
 662
 663    /* If the VS doesn't read any inputs (calculating vertex position from
 664     * a state variable for some reason, for example), emit a single pad
 665     * VERTEX_ELEMENT struct and bail.
 666     *
 667     * The stale VB state stays in place, but they don't do anything unless
 668     * a VE loads from them.
 669     */
 670    if (nr_elements == 0) {
 671       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 672                            1 + GENX(VERTEX_ELEMENT_STATE_length));
 673       struct GENX(VERTEX_ELEMENT_STATE) elem = {
 674          .Valid = true,
 675          .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
 676          .Component0Control = VFCOMP_STORE_0,
 677          .Component1Control = VFCOMP_STORE_0,
 678          .Component2Control = VFCOMP_STORE_0,
 679          .Component3Control = VFCOMP_STORE_1_FP,
 680       };
 681       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
 682       return;
 683    }
 684
 685    /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
 686    const unsigned nr_buffers = brw->vb.nr_buffers +
 687       uses_draw_params + uses_derived_draw_params;
 688
 689    vf_invalidate_for_vb_48bit_transitions(brw);
 690
 691    if (nr_buffers) {
 692       assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
 693
 694       dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
 695                            1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);
 696
 697       for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
 698          const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
 699          /* Prior to Haswell and Bay Trail we have to use 4-component formats
 700           * to fake 3-component ones.  In particular, we do this for
 701           * half-float and 8 and 16-bit integer formats.  This means that the
 702           * vertex element may poke over the end of the buffer by 2 bytes.
 703           */
 704          const unsigned padding =
 705             (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
 706          const unsigned end = buffer->offset + buffer->size + padding;
 707          dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
 708                                              buffer->offset,
 709                                              end,
 710                                              buffer->stride,
 711                                              buffer->step_rate);
 712       }
 713
 714       if (uses_draw_params) {
 715          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
 716                                              brw->draw.draw_params_bo,
 717                                              brw->draw.draw_params_offset,
 718                                              brw->draw.draw_params_bo->size,
 719                                              0 /* stride */,
 720                                              0 /* step rate */);
 721       }
 722
 723       if (uses_derived_draw_params) {
 724          dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
 725                                              brw->draw.derived_draw_params_bo,
 726                                              brw->draw.derived_draw_params_offset,
 727                                              brw->draw.derived_draw_params_bo->size,
 728                                              0 /* stride */,
 729                                              0 /* step rate */);
 730       }
 731    }
 732
 733    /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
 734     * presumably for VertexID/InstanceID.
 735     */
 736 #if GEN_GEN >= 6
 737    assert(nr_elements <= 34);
 738    const struct brw_vertex_element *gen6_edgeflag_input = NULL;
 739 #else
 740    assert(nr_elements <= 18);
 741 #endif
 742
 743    dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
 744                         1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
 745    unsigned i;
 746    for (i = 0; i < brw->vb.nr_enabled; i++) {
 747       const struct brw_vertex_element *input = brw->vb.enabled[i];
 748       const struct gl_array_attributes *glattrib = input->glattrib;
 749       uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 750       uint32_t comp0 = VFCOMP_STORE_SRC;
 751       uint32_t comp1 = VFCOMP_STORE_SRC;
 752       uint32_t comp2 = VFCOMP_STORE_SRC;
 753       uint32_t comp3 = VFCOMP_STORE_SRC;
 754       const unsigned num_uploads = GEN_GEN < 8 ?
 755          uploads_needed(format, input->is_dual_slot) : 1;
 756
 757 #if GEN_GEN >= 8
 758       /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
 759        * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
 760        * element which has edge flag enabled."
 761        */
 762       assert(!(is_passthru_format(format) && uses_edge_flag));
 763 #endif
 764
 765       /* The gen4 driver expects edgeflag to come in as a float, and passes
 766        * that float on to the tests in the clipper.  Mesa's current vertex
 767        * attribute value for EdgeFlag is stored as a float, which works out.
 768        * glEdgeFlagPointer, on the other hand, gives us an unnormalized
 769        * integer ubyte.  Just rewrite that to convert to a float.
 770        *
 771        * Gen6+ passes edgeflag as sideband along with the vertex, instead
 772        * of in the VUE.  We have to upload it sideband as the last vertex
 773        * element according to the B-Spec.
 774        */
 775 #if GEN_GEN >= 6
 776       if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
 777          gen6_edgeflag_input = input;
 778          continue;
 779       }
 780 #endif
 781
 782       for (unsigned c = 0; c < num_uploads; c++) {
 783          const uint32_t upload_format = GEN_GEN >= 8 ? format :
 784             downsize_format_if_needed(format, c);
 785          /* If we need more that one upload, the offset stride would be 128
 786           * bits (16 bytes), as for previous uploads we are using the full
 787           * entry. */
 788          const unsigned offset = input->offset + c * 16;
 789
 790          const struct gl_array_attributes *glattrib = input->glattrib;
 791          const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
 792             upload_format_size(upload_format) : glattrib->Format.Size;
 793
 794          switch (size) {
 795             case 0: comp0 = VFCOMP_STORE_0;
 796             case 1: comp1 = VFCOMP_STORE_0;
 797             case 2: comp2 = VFCOMP_STORE_0;
 798             case 3:
 799                if (GEN_GEN >= 8 && glattrib->Format.Doubles) {
 800                   comp3 = VFCOMP_STORE_0;
 801                } else if (glattrib->Format.Integer) {
 802                   comp3 = VFCOMP_STORE_1_INT;
 803                } else {
 804                   comp3 = VFCOMP_STORE_1_FP;
 805                }
 806
 807                break;
 808          }
 809
 810 #if GEN_GEN >= 8
 811          /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
 812           *
 813           *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
 814           *     formats, 64-bit components are stored in the URB without any
 815           *     conversion. In this case, vertex elements must be written as 128
 816           *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
 817           *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
 818           *     component into the URB, Component 1 must be specified as
 819           *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
 820           *     order to output a 128-bit vertex element, or Components 1-3 must
 821           *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
 822           *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
 823           *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
 824           *     vertex element."
 825           */
 826          if (glattrib->Format.Doubles && !input->is_dual_slot) {
 827             /* Store vertex elements which correspond to double and dvec2 vertex
 828              * shader inputs as 128-bit vertex elements, instead of 256-bits.
 829              */
 830             comp2 = VFCOMP_NOSTORE;
 831             comp3 = VFCOMP_NOSTORE;
 832          }
 833 #endif
 834
 835          struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 836             .VertexBufferIndex = input->buffer,
 837             .Valid = true,
 838             .SourceElementFormat = upload_format,
 839             .SourceElementOffset = offset,
 840             .Component0Control = comp0,
 841             .Component1Control = comp1,
 842             .Component2Control = comp2,
 843             .Component3Control = comp3,
 844 #if GEN_GEN < 5
 845             .DestinationElementOffset = i * 4,
 846 #endif
 847          };
 848
 849          GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 850          dw += GENX(VERTEX_ELEMENT_STATE_length);
 851       }
 852    }
 853
 854    if (needs_sgvs_element) {
 855       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 856          .Valid = true,
 857          .Component0Control = VFCOMP_STORE_0,
 858          .Component1Control = VFCOMP_STORE_0,
 859          .Component2Control = VFCOMP_STORE_0,
 860          .Component3Control = VFCOMP_STORE_0,
 861 #if GEN_GEN < 5
 862          .DestinationElementOffset = i * 4,
 863 #endif
 864       };
 865
 866 #if GEN_GEN >= 8
 867       if (uses_draw_params) {
 868          elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 869          elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
 870          elem_state.Component0Control = VFCOMP_STORE_SRC;
 871          elem_state.Component1Control = VFCOMP_STORE_SRC;
 872       }
 873 #else
 874       elem_state.VertexBufferIndex = brw->vb.nr_buffers;
 875       elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
 876       if (uses_draw_params) {
 877          elem_state.Component0Control = VFCOMP_STORE_SRC;
 878          elem_state.Component1Control = VFCOMP_STORE_SRC;
 879       }
 880
 881       if (vs_prog_data->uses_vertexid)
 882          elem_state.Component2Control = VFCOMP_STORE_VID;
 883
 884       if (vs_prog_data->uses_instanceid)
 885          elem_state.Component3Control = VFCOMP_STORE_IID;
 886 #endif
 887
 888       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 889       dw += GENX(VERTEX_ELEMENT_STATE_length);
 890    }
 891
 892    if (uses_derived_draw_params) {
 893       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 894          .Valid = true,
 895          .VertexBufferIndex = brw->vb.nr_buffers + 1,
 896          .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
 897          .Component0Control = VFCOMP_STORE_SRC,
 898          .Component1Control = VFCOMP_STORE_SRC,
 899          .Component2Control = VFCOMP_STORE_0,
 900          .Component3Control = VFCOMP_STORE_0,
 901 #if GEN_GEN < 5
 902          .DestinationElementOffset = i * 4,
 903 #endif
 904       };
 905
 906       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 907       dw += GENX(VERTEX_ELEMENT_STATE_length);
 908    }
 909
 910 #if GEN_GEN >= 6
 911    if (gen6_edgeflag_input) {
 912       const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
 913       const uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
 914
 915       struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
 916          .Valid = true,
 917          .VertexBufferIndex = gen6_edgeflag_input->buffer,
 918          .EdgeFlagEnable = true,
 919          .SourceElementFormat = format,
 920          .SourceElementOffset = gen6_edgeflag_input->offset,
 921          .Component0Control = VFCOMP_STORE_SRC,
 922          .Component1Control = VFCOMP_STORE_0,
 923          .Component2Control = VFCOMP_STORE_0,
 924          .Component3Control = VFCOMP_STORE_0,
 925       };
 926
 927       GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
 928       dw += GENX(VERTEX_ELEMENT_STATE_length);
 929    }
 930 #endif
 931
 932 #if GEN_GEN >= 8
 933    for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
 934       const struct brw_vertex_element *input = brw->vb.enabled[i];
 935       const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
 936       unsigned element_index;
 937
 938       /* The edge flag element is reordered to be the last one in the code
 939        * above so we need to compensate for that in the element indices used
 940        * below.
 941        */
 942       if (input == gen6_edgeflag_input)
 943          element_index = nr_elements - 1;
 944       else
 945          element_index = j++;
 946
 947       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 948          vfi.VertexElementIndex = element_index;
 949          vfi.InstancingEnable = buffer->step_rate != 0;
 950          vfi.InstanceDataStepRate = buffer->step_rate;
 951       }
 952    }
 953
 954    if (vs_prog_data->uses_drawid) {
 955       const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
 956
 957       brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
 958          vfi.VertexElementIndex = element;
 959       }
 960    }
 961 #endif
 962 }
 963
 964 static const struct brw_tracked_state genX(vertices) = {
 965    .dirty = {
 966       .mesa = _NEW_POLYGON,
 967       .brw = BRW_NEW_BATCH |
 968              BRW_NEW_BLORP |
 969              BRW_NEW_VERTEX_PROGRAM |
 970              BRW_NEW_VERTICES |
 971              BRW_NEW_VS_PROG_DATA,
 972    },
 973    .emit = genX(emit_vertices),
 974 };
 975
 976 static void
 977 genX(emit_index_buffer)(struct brw_context *brw)
 978 {
 979    const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
 980
 981    if (index_buffer == NULL)
 982       return;
 983
 984    vf_invalidate_for_ib_48bit_transition(brw);
 985
 986    brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
 987 #if GEN_GEN < 8 && !GEN_IS_HASWELL
 988       ib.CutIndexEnable = brw->prim_restart.enable_cut_index;
 989 #endif
 990       ib.IndexFormat = brw_get_index_type(index_buffer->index_size);
 991
 992       /* The VF cache designers apparently cut corners, and made the cache
 993        * only consider the bottom 32 bits of memory addresses.  If you happen
 994        * to have two index buffers which get placed exactly 4 GiB apart and
 995        * use them in back-to-back draw calls, you can get collisions.  To work
 996        * around this problem, we restrict index buffers to the low 32 bits of
 997        * the address space.
 998        */
 999       ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
1000 #if GEN_GEN >= 8
1001       ib.MOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
1002       ib.BufferSize = brw->ib.size;
1003 #else
1004       ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
1005 #endif
1006    }
1007 }
1008
1009 static const struct brw_tracked_state genX(index_buffer) = {
1010    .dirty = {
1011       .mesa = 0,
1012       .brw = BRW_NEW_BATCH |
1013              BRW_NEW_BLORP |
1014              BRW_NEW_INDEX_BUFFER,
1015    },
1016    .emit = genX(emit_index_buffer),
1017 };
1018
1019 #if GEN_IS_HASWELL || GEN_GEN >= 8
1020 static void
1021 genX(upload_cut_index)(struct brw_context *brw)
1022 {
1023    const struct gl_context *ctx = &brw->ctx;
1024
1025    brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
1026       if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
1027          vf.IndexedDrawCutIndexEnable = true;
1028          vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
1029       }
1030    }
1031 }
1032
1033 const struct brw_tracked_state genX(cut_index) = {
1034    .dirty = {
1035       .mesa  = _NEW_TRANSFORM,
1036       .brw   = BRW_NEW_INDEX_BUFFER,
1037    },
1038    .emit = genX(upload_cut_index),
1039 };
1040 #endif
1041
1042 #if GEN_GEN >= 6
1043 /**
1044  * Determine the appropriate attribute override value to store into the
1045  * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
1046  * override value contains two pieces of information: the location of the
1047  * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
1048  * flag indicating whether to "swizzle" the attribute based on the direction
1049  * the triangle is facing.
1050  *
1051  * If an attribute is "swizzled", then the given VUE location is used for
1052  * front-facing triangles, and the VUE location that immediately follows is
1053  * used for back-facing triangles.  We use this to implement the mapping from
1054  * gl_FrontColor/gl_BackColor to gl_Color.
1055  *
1056  * urb_entry_read_offset is the offset into the VUE at which the SF unit is
1057  * being instructed to begin reading attribute data.  It can be set to a
1058  * nonzero value to prevent the SF unit from wasting time reading elements of
1059  * the VUE that are not needed by the fragment shader.  It is measured in
1060  * 256-bit increments.
1061  */
1062 static void
1063 genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
1064                         const struct brw_vue_map *vue_map,
1065                         int urb_entry_read_offset, int fs_attr,
1066                         bool two_side_color, uint32_t *max_source_attr)
1067 {
1068    /* Find the VUE slot for this attribute. */
1069    int slot = vue_map->varying_to_slot[fs_attr];
1070
1071    /* Viewport and Layer are stored in the VUE header.  We need to override
1072     * them to zero if earlier stages didn't write them, as GL requires that
1073     * they read back as zero when not explicitly set.
1074     */
1075    if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
1076       attr->ComponentOverrideX = true;
1077       attr->ComponentOverrideW = true;
1078       attr->ConstantSource = CONST_0000;
1079
1080       if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
1081          attr->ComponentOverrideY = true;
1082       if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
1083          attr->ComponentOverrideZ = true;
1084
1085       return;
1086    }
1087
1088    /* If there was only a back color written but not front, use back
1089     * as the color instead of undefined
1090     */
1091    if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
1092       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
1093    if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
1094       slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
1095
1096    if (slot == -1) {
1097       /* This attribute does not exist in the VUE--that means that the vertex
1098        * shader did not write to it.  This means that either:
1099        *
1100        * (a) This attribute is a texture coordinate, and it is going to be
1101        * replaced with point coordinates (as a consequence of a call to
1102        * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
1103        * hardware will ignore whatever attribute override we supply.
1104        *
1105        * (b) This attribute is read by the fragment shader but not written by
1106        * the vertex shader, so its value is undefined.  Therefore the
1107        * attribute override we supply doesn't matter.
1108        *
1109        * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
1110        * previous shader stage.
1111        *
1112        * Note that we don't have to worry about the cases where the attribute
1113        * is gl_PointCoord or is undergoing point sprite coordinate
1114        * replacement, because in those cases, this function isn't called.
1115        *
1116        * In case (c), we need to program the attribute overrides so that the
1117        * primitive ID will be stored in this slot.  In every other case, the
1118        * attribute override we supply doesn't matter.  So just go ahead and
1119        * program primitive ID in every case.
1120        */
1121       attr->ComponentOverrideW = true;
1122       attr->ComponentOverrideX = true;
1123       attr->ComponentOverrideY = true;
1124       attr->ComponentOverrideZ = true;
1125       attr->ConstantSource = PRIM_ID;
1126       return;
1127    }
1128
1129    /* Compute the location of the attribute relative to urb_entry_read_offset.
1130     * Each increment of urb_entry_read_offset represents a 256-bit value, so
1131     * it counts for two 128-bit VUE slots.
1132     */
1133    int source_attr = slot - 2 * urb_entry_read_offset;
1134    assert(source_attr >= 0 && source_attr < 32);
1135
1136    /* If we are doing two-sided color, and the VUE slot following this one
1137     * represents a back-facing color, then we need to instruct the SF unit to
1138     * do back-facing swizzling.
1139     */
1140    bool swizzling = two_side_color &&
1141       ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
1142         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
1143        (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
1144         vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
1145
1146    /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
1147    if (*max_source_attr < source_attr + swizzling)
1148       *max_source_attr = source_attr + swizzling;
1149
1150    attr->SourceAttribute = source_attr;
1151    if (swizzling)
1152       attr->SwizzleSelect = INPUTATTR_FACING;
1153 }
1154
1155
1156 static void
1157 genX(calculate_attr_overrides)(const struct brw_context *brw,
1158                                struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1159                                uint32_t *point_sprite_enables,
1160                                uint32_t *urb_entry_read_length,
1161                                uint32_t *urb_entry_read_offset)
1162 {
1163    const struct gl_context *ctx = &brw->ctx;
1164
1165    /* _NEW_POINT */
1166    const struct gl_point_attrib *point = &ctx->Point;
1167
1168    /* BRW_NEW_FRAGMENT_PROGRAM */
1169    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1170
1171    /* BRW_NEW_FS_PROG_DATA */
1172    const struct brw_wm_prog_data *wm_prog_data =
1173       brw_wm_prog_data(brw->wm.base.prog_data);
1174    uint32_t max_source_attr = 0;
1175
1176    *point_sprite_enables = 0;
1177
1178    int first_slot =
1179       brw_compute_first_urb_slot_required(fp->info.inputs_read,
1180                                           &brw->vue_map_geom_out);
1181
1182    /* Each URB offset packs two varying slots */
1183    assert(first_slot % 2 == 0);
1184    *urb_entry_read_offset = first_slot / 2;
1185
1186    /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1187     * description of dw10 Point Sprite Texture Coordinate Enable:
1188     *
1189     * "This field must be programmed to zero when non-point primitives
1190     * are rendered."
1191     *
1192     * The SandyBridge PRM doesn't explicitly say that point sprite enables
1193     * must be programmed to zero when rendering non-point primitives, but
1194     * the IvyBridge PRM does, and if we don't, we get garbage.
1195     *
1196     * This is not required on Haswell, as the hardware ignores this state
1197     * when drawing non-points -- although we do still need to be careful to
1198     * correctly set the attr overrides.
1199     *
1200     * _NEW_POLYGON
1201     * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1202     */
1203    bool drawing_points = brw_is_drawing_points(brw);
1204
1205    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1206       int input_index = wm_prog_data->urb_setup[attr];
1207
1208       if (input_index < 0)
1209          continue;
1210
1211       /* _NEW_POINT */
1212       bool point_sprite = false;
1213       if (drawing_points) {
1214          if (point->PointSprite &&
1215              (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1216              (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1217             point_sprite = true;
1218          }
1219
1220          if (attr == VARYING_SLOT_PNTC)
1221             point_sprite = true;
1222
1223          if (point_sprite)
1224             *point_sprite_enables |= (1 << input_index);
1225       }
1226
1227       /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1228       struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1229
1230       if (!point_sprite) {
1231          genX(get_attr_override)(&attribute,
1232                                  &brw->vue_map_geom_out,
1233                                  *urb_entry_read_offset, attr,
1234                                  _mesa_vertex_program_two_side_enabled(ctx),
1235                                  &max_source_attr);
1236       }
1237
1238       /* The hardware can only do the overrides on 16 overrides at a
1239        * time, and the other up to 16 have to be lined up so that the
1240        * input index = the output index.  We'll need to do some
1241        * tweaking to make sure that's the case.
1242        */
1243       if (input_index < 16)
1244          attr_overrides[input_index] = attribute;
1245       else
1246          assert(attribute.SourceAttribute == input_index);
1247    }
1248
1249    /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1250     * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1251     *
1252     * "This field should be set to the minimum length required to read the
1253     *  maximum source attribute.  The maximum source attribute is indicated
1254     *  by the maximum value of the enabled Attribute # Source Attribute if
1255     *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1256     *  enable is not set.
1257     *  read_length = ceiling((max_source_attr + 1) / 2)
1258     *
1259     *  [errata] Corruption/Hang possible if length programmed larger than
1260     *  recommended"
1261     *
1262     * Similar text exists for Ivy Bridge.
1263     */
1264    *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1265 }
1266 #endif
1267
1268 /* ---------------------------------------------------------------------- */
1269
/* Select, per hardware generation, the genxml structure that
 * set_depth_stencil_bits() fills in, so that the helper can be written once
 * against the single type name DEPTH_STENCIL_GENXML.
 */
#if GEN_GEN >= 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GEN_GEN >= 6
typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
#endif
1277
1278 static inline void
1279 set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
1280 {
1281    struct gl_context *ctx = &brw->ctx;
1282
1283    /* _NEW_BUFFERS */
1284    struct intel_renderbuffer *depth_irb =
1285       intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1286
1287    /* _NEW_DEPTH */
1288    struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1289
1290    /* _NEW_STENCIL */
1291    struct gl_stencil_attrib *stencil = &ctx->Stencil;
1292    const int b = stencil->_BackFace;
1293
1294    if (depth->Test && depth_irb) {
1295       ds->DepthTestEnable = true;
1296       ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1297       ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
1298    }
1299
1300    if (brw->stencil_enabled) {
1301       ds->StencilTestEnable = true;
1302       ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
1303       ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
1304
1305       ds->StencilTestFunction =
1306          intel_translate_compare_func(stencil->Function[0]);
1307       ds->StencilFailOp =
1308          intel_translate_stencil_op(stencil->FailFunc[0]);
1309       ds->StencilPassDepthPassOp =
1310          intel_translate_stencil_op(stencil->ZPassFunc[0]);
1311       ds->StencilPassDepthFailOp =
1312          intel_translate_stencil_op(stencil->ZFailFunc[0]);
1313
1314       ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
1315
1316       if (brw->stencil_two_sided) {
1317          ds->DoubleSidedStencilEnable = true;
1318          ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1319          ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1320
1321          ds->BackfaceStencilTestFunction =
1322             intel_translate_compare_func(stencil->Function[b]);
1323          ds->BackfaceStencilFailOp =
1324             intel_translate_stencil_op(stencil->FailFunc[b]);
1325          ds->BackfaceStencilPassDepthPassOp =
1326             intel_translate_stencil_op(stencil->ZPassFunc[b]);
1327          ds->BackfaceStencilPassDepthFailOp =
1328             intel_translate_stencil_op(stencil->ZFailFunc[b]);
1329       }
1330
1331 #if GEN_GEN <= 5 || GEN_GEN >= 9
1332       ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1333       ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1334 #endif
1335    }
1336 }
1337
1338 #if GEN_GEN >= 6
1339 static void
1340 genX(upload_depth_stencil_state)(struct brw_context *brw)
1341 {
1342 #if GEN_GEN >= 8
1343    brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
1344       set_depth_stencil_bits(brw, &wmds);
1345    }
1346 #else
1347    uint32_t ds_offset;
1348    brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
1349       set_depth_stencil_bits(brw, &ds);
1350    }
1351
1352    /* Now upload a pointer to the indirect state */
1353 #if GEN_GEN == 6
1354    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
1355       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1356       ptr.DEPTH_STENCIL_STATEChange = true;
1357    }
1358 #else
1359    brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
1360       ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1361    }
1362 #endif
1363 #endif
1364 }
1365
1366 static const struct brw_tracked_state genX(depth_stencil_state) = {
1367    .dirty = {
1368       .mesa = _NEW_BUFFERS |
1369               _NEW_DEPTH |
1370               _NEW_STENCIL,
1371       .brw  = BRW_NEW_BLORP |
1372               (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
1373                             : BRW_NEW_BATCH |
1374                               BRW_NEW_STATE_BASE_ADDRESS),
1375    },
1376    .emit = genX(upload_depth_stencil_state),
1377 };
1378 #endif
1379
1380 /* ---------------------------------------------------------------------- */
1381
1382 #if GEN_GEN <= 5
1383
1384 static void
1385 genX(upload_clip_state)(struct brw_context *brw)
1386 {
1387    struct gl_context *ctx = &brw->ctx;
1388
1389    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1390    brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
1391       clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
1392       clip.GRFRegisterCount =
1393          DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
1394       clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1395       clip.SingleProgramFlow = true;
1396       clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
1397       clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
1398
1399       /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1400       clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
1401       clip.DispatchGRFStartRegisterForURBData = 1;
1402       clip.VertexURBEntryReadOffset = 0;
1403
1404       /* BRW_NEW_URB_FENCE */
1405       clip.NumberofURBEntries = brw->urb.nr_clip_entries;
1406       clip.URBEntryAllocationSize = brw->urb.vsize - 1;
1407
1408       if (brw->urb.nr_clip_entries >= 10) {
1409          /* Half of the URB entries go to each thread, and it has to be an
1410           * even number.
1411           */
1412          assert(brw->urb.nr_clip_entries % 2 == 0);
1413
1414          /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
1415           * only 2 threads can output VUEs at a time.
1416           */
1417          clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
1418       } else {
1419          assert(brw->urb.nr_clip_entries >= 5);
1420          clip.MaximumNumberofThreads = 1 - 1;
1421       }
1422
1423       clip.VertexPositionSpace = VPOS_NDCSPACE;
1424       clip.UserClipFlagsMustClipEnable = true;
1425       clip.GuardbandClipTestEnable = true;
1426
1427       clip.ClipperViewportStatePointer =
1428          ro_bo(brw->batch.state.bo, brw->clip.vp_offset);
1429
1430       clip.ScreenSpaceViewportXMin = -1;
1431       clip.ScreenSpaceViewportXMax = 1;
1432       clip.ScreenSpaceViewportYMin = -1;
1433       clip.ScreenSpaceViewportYMax = 1;
1434
1435       clip.ViewportXYClipTestEnable = true;
1436       clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
1437                                        ctx->Transform.DepthClampFar);
1438
1439       /* _NEW_TRANSFORM */
1440       if (GEN_GEN == 5 || GEN_IS_G4X) {
1441          clip.UserClipDistanceClipTestEnableBitmask =
1442             ctx->Transform.ClipPlanesEnabled;
1443       } else {
1444          /* Up to 6 actual clip flags, plus the 7th for the negative RHW
1445           * workaround.
1446           */
1447          clip.UserClipDistanceClipTestEnableBitmask =
1448             (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
1449       }
1450
1451       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1452          clip.APIMode = APIMODE_D3D;
1453       else
1454          clip.APIMode = APIMODE_OGL;
1455
1456       clip.GuardbandClipTestEnable = true;
1457
1458       clip.ClipMode = brw->clip.prog_data->clip_mode;
1459
1460 #if GEN_IS_G4X
1461       clip.NegativeWClipTestEnable = true;
1462 #endif
1463    }
1464 }
1465
1466 const struct brw_tracked_state genX(clip_state) = {
1467    .dirty = {
1468       .mesa  = _NEW_TRANSFORM |
1469                _NEW_VIEWPORT,
1470       .brw   = BRW_NEW_BATCH |
1471                BRW_NEW_BLORP |
1472                BRW_NEW_CLIP_PROG_DATA |
1473                BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1474                BRW_NEW_PROGRAM_CACHE |
1475                BRW_NEW_URB_FENCE,
1476    },
1477    .emit = genX(upload_clip_state),
1478 };
1479
1480 #else
1481
1482 static void
1483 genX(upload_clip_state)(struct brw_context *brw)
1484 {
1485    struct gl_context *ctx = &brw->ctx;
1486
1487    /* _NEW_BUFFERS */
1488    struct gl_framebuffer *fb = ctx->DrawBuffer;
1489
1490    /* BRW_NEW_FS_PROG_DATA */
1491    struct brw_wm_prog_data *wm_prog_data =
1492       brw_wm_prog_data(brw->wm.base.prog_data);
1493
1494    brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
1495       clip.StatisticsEnable = !brw->meta_in_progress;
1496
1497       if (wm_prog_data->barycentric_interp_modes &
1498           BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
1499          clip.NonPerspectiveBarycentricEnable = true;
1500
1501 #if GEN_GEN >= 7
1502       clip.EarlyCullEnable = true;
1503 #endif
1504
1505 #if GEN_GEN == 7
1506       clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
1507
1508       if (ctx->Polygon.CullFlag) {
1509          switch (ctx->Polygon.CullFaceMode) {
1510          case GL_FRONT:
1511             clip.CullMode = CULLMODE_FRONT;
1512             break;
1513          case GL_BACK:
1514             clip.CullMode = CULLMODE_BACK;
1515             break;
1516          case GL_FRONT_AND_BACK:
1517             clip.CullMode = CULLMODE_BOTH;
1518             break;
1519          default:
1520             unreachable("Should not get here: invalid CullFlag");
1521          }
1522       } else {
1523          clip.CullMode = CULLMODE_NONE;
1524       }
1525 #endif
1526
1527 #if GEN_GEN < 8
1528       clip.UserClipDistanceCullTestEnableBitmask =
1529          brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
1530
1531       clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
1532                                        ctx->Transform.DepthClampFar);
1533 #endif
1534
1535       /* _NEW_LIGHT */
1536       if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
1537          clip.TriangleStripListProvokingVertexSelect = 0;
1538          clip.TriangleFanProvokingVertexSelect = 1;
1539          clip.LineStripListProvokingVertexSelect = 0;
1540       } else {
1541          clip.TriangleStripListProvokingVertexSelect = 2;
1542          clip.TriangleFanProvokingVertexSelect = 2;
1543          clip.LineStripListProvokingVertexSelect = 1;
1544       }
1545
1546       /* _NEW_TRANSFORM */
1547       clip.UserClipDistanceClipTestEnableBitmask =
1548          ctx->Transform.ClipPlanesEnabled;
1549
1550 #if GEN_GEN >= 8
1551       clip.ForceUserClipDistanceClipTestEnableBitmask = true;
1552 #endif
1553
1554       if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1555          clip.APIMode = APIMODE_D3D;
1556       else
1557          clip.APIMode = APIMODE_OGL;
1558
1559       clip.GuardbandClipTestEnable = true;
1560
1561       /* BRW_NEW_VIEWPORT_COUNT */
1562       const unsigned viewport_count = brw->clip.viewport_count;
1563
1564       if (ctx->RasterDiscard) {
1565          clip.ClipMode = CLIPMODE_REJECT_ALL;
1566 #if GEN_GEN == 6
1567          perf_debug("Rasterizer discard is currently implemented via the "
1568                     "clipper; having the GS not write primitives would "
1569                     "likely be faster.\n");
1570 #endif
1571       } else {
1572          clip.ClipMode = CLIPMODE_NORMAL;
1573       }
1574
1575       clip.ClipEnable = true;
1576
1577       /* _NEW_POLYGON,
1578        * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
1579        */
1580       if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
1581          clip.ViewportXYClipTestEnable = true;
1582
1583       clip.MinimumPointWidth = 0.125;
1584       clip.MaximumPointWidth = 255.875;
1585       clip.MaximumVPIndex = viewport_count - 1;
1586       if (_mesa_geometric_layers(fb) == 0)
1587          clip.ForceZeroRTAIndexEnable = true;
1588    }
1589 }
1590
1591 static const struct brw_tracked_state genX(clip_state) = {
1592    .dirty = {
1593       .mesa  = _NEW_BUFFERS |
1594                _NEW_LIGHT |
1595                _NEW_POLYGON |
1596                _NEW_TRANSFORM,
1597       .brw   = BRW_NEW_BLORP |
1598                BRW_NEW_CONTEXT |
1599                BRW_NEW_FS_PROG_DATA |
1600                BRW_NEW_GS_PROG_DATA |
1601                BRW_NEW_VS_PROG_DATA |
1602                BRW_NEW_META_IN_PROGRESS |
1603                BRW_NEW_PRIMITIVE |
1604                BRW_NEW_RASTERIZER_DISCARD |
1605                BRW_NEW_TES_PROG_DATA |
1606                BRW_NEW_VIEWPORT_COUNT,
1607    },
1608    .emit = genX(upload_clip_state),
1609 };
1610 #endif
1611
1612 /* ---------------------------------------------------------------------- */
1613
1614 static void
1615 genX(upload_sf)(struct brw_context *brw)
1616 {
1617    struct gl_context *ctx = &brw->ctx;
1618    float point_size;
1619
1620 #if GEN_GEN <= 7
1621    /* _NEW_BUFFERS */
1622    bool flip_y = ctx->DrawBuffer->FlipY;
1623    UNUSED const bool multisampled_fbo =
1624       _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1625 #endif
1626
1627 #if GEN_GEN < 6
1628    const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;
1629
1630    ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
1631
1632    brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
1633       sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
1634       sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1635       sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
1636       sf.DispatchGRFStartRegisterForURBData = 3;
1637       sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
1638       sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
1639       sf.NumberofURBEntries = brw->urb.nr_sf_entries;
1640       sf.URBEntryAllocationSize = brw->urb.sfsize - 1;
1641
1642       /* STATE_PREFETCH command description describes this state as being
1643        * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
1644        * domain.
1645        */
1646       sf.SetupViewportStateOffset =
1647          ro_bo(brw->batch.state.bo, brw->sf.vp_offset);
1648
1649       sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1650
1651       /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
1652       /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */
1653
1654       sf.MaximumNumberofThreads =
1655          MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;
1656
1657       sf.SpritePointEnable = ctx->Point.PointSprite;
1658
1659       sf.DestinationOriginHorizontalBias = 0.5;
1660       sf.DestinationOriginVerticalBias = 0.5;
1661 #else
1662    brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
1663       sf.StatisticsEnable = true;
1664 #endif
1665       sf.ViewportTransformEnable = true;
1666
1667 #if GEN_GEN == 7
1668       /* _NEW_BUFFERS */
1669       sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
1670 #endif
1671
1672 #if GEN_GEN <= 7
1673       /* _NEW_POLYGON */
1674       sf.FrontWinding = brw->polygon_front_bit != flip_y;
1675 #if GEN_GEN >= 6
1676       sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
1677       sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
1678       sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;
1679
1680       switch (ctx->Polygon.FrontMode) {
1681          case GL_FILL:
1682             sf.FrontFaceFillMode = FILL_MODE_SOLID;
1683             break;
1684          case GL_LINE:
1685             sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
1686             break;
1687          case GL_POINT:
1688             sf.FrontFaceFillMode = FILL_MODE_POINT;
1689             break;
1690          default:
1691             unreachable("not reached");
1692       }
1693
1694       switch (ctx->Polygon.BackMode) {
1695          case GL_FILL:
1696             sf.BackFaceFillMode = FILL_MODE_SOLID;
1697             break;
1698          case GL_LINE:
1699             sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
1700             break;
1701          case GL_POINT:
1702             sf.BackFaceFillMode = FILL_MODE_POINT;
1703             break;
1704          default:
1705             unreachable("not reached");
1706       }
1707
1708       if (multisampled_fbo && ctx->Multisample.Enabled)
1709          sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1710
1711       sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1712       sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1713       sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
1714 #endif
1715
1716       sf.ScissorRectangleEnable = true;
1717
1718       if (ctx->Polygon.CullFlag) {
1719          switch (ctx->Polygon.CullFaceMode) {
1720             case GL_FRONT:
1721                sf.CullMode = CULLMODE_FRONT;
1722                break;
1723             case GL_BACK:
1724                sf.CullMode = CULLMODE_BACK;
1725                break;
1726             case GL_FRONT_AND_BACK:
1727                sf.CullMode = CULLMODE_BOTH;
1728                break;
1729             default:
1730                unreachable("not reached");
1731          }
1732       } else {
1733          sf.CullMode = CULLMODE_NONE;
1734       }
1735
1736 #if GEN_IS_HASWELL
1737       sf.LineStippleEnable = ctx->Line.StippleFlag;
1738 #endif
1739
1740 #endif
1741
1742       /* _NEW_LINE */
1743 #if GEN_GEN == 8
1744       const struct gen_device_info *devinfo = &brw->screen->devinfo;
1745
1746       if (devinfo->is_cherryview)
1747          sf.CHVLineWidth = brw_get_line_width(brw);
1748       else
1749          sf.LineWidth = brw_get_line_width(brw);
1750 #else
1751       sf.LineWidth = brw_get_line_width(brw);
1752 #endif
1753
1754       if (ctx->Line.SmoothFlag) {
1755          sf.LineEndCapAntialiasingRegionWidth = _10pixels;
1756 #if GEN_GEN <= 7
1757          sf.AntiAliasingEnable = true;
1758 #endif
1759       }
1760
1761       /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
1762       point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
1763       /* Clamp to the hardware limits */
1764       sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);
1765
1766       /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
1767       if (use_state_point_size(brw))
1768          sf.PointWidthSource = State;
1769
1770 #if GEN_GEN >= 8
1771       /* _NEW_POINT | _NEW_MULTISAMPLE */
1772       if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
1773           !ctx->Point.PointSprite)
1774          sf.SmoothPointEnable = true;
1775 #endif
1776
1777 #if GEN_GEN == 10
1778       /* _NEW_BUFFERS
1779        * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
1780        */
1781       const bool multisampled_fbo =
1782          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1783       if (multisampled_fbo)
1784          sf.SmoothPointEnable = false;
1785 #endif
1786
1787 #if GEN_IS_G4X || GEN_GEN >= 5
1788       sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
1789 #endif
1790
1791       /* _NEW_LIGHT */
1792       if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
1793          sf.TriangleStripListProvokingVertexSelect = 2;
1794          sf.TriangleFanProvokingVertexSelect = 2;
1795          sf.LineStripListProvokingVertexSelect = 1;
1796       } else {
1797          sf.TriangleFanProvokingVertexSelect = 1;
1798       }
1799
1800 #if GEN_GEN == 6
1801       /* BRW_NEW_FS_PROG_DATA */
1802       const struct brw_wm_prog_data *wm_prog_data =
1803          brw_wm_prog_data(brw->wm.base.prog_data);
1804
1805       sf.AttributeSwizzleEnable = true;
1806       sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1807
1808       /*
1809        * Window coordinates in an FBO are inverted, which means point
1810        * sprite origin must be inverted, too.
1811        */
1812       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
1813          sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
1814       } else {
1815          sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
1816       }
1817
1818       /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
1819        * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
1820        */
1821       uint32_t urb_entry_read_length;
1822       uint32_t urb_entry_read_offset;
1823       uint32_t point_sprite_enables;
1824       genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
1825                                      &urb_entry_read_length,
1826                                      &urb_entry_read_offset);
1827       sf.VertexURBEntryReadLength = urb_entry_read_length;
1828       sf.VertexURBEntryReadOffset = urb_entry_read_offset;
1829       sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
1830       sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
1831 #endif
1832    }
1833 }
1834
1835 static const struct brw_tracked_state genX(sf_state) = {
1836    .dirty = {
1837       .mesa  = _NEW_LIGHT |
1838                _NEW_LINE |
1839                _NEW_POINT |
1840                _NEW_PROGRAM |
1841                (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
1842                (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) |
1843                (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
1844       .brw   = BRW_NEW_BLORP |
1845                BRW_NEW_VUE_MAP_GEOM_OUT |
1846                (GEN_GEN <= 5 ? BRW_NEW_BATCH |
1847                                BRW_NEW_PROGRAM_CACHE |
1848                                BRW_NEW_SF_PROG_DATA |
1849                                BRW_NEW_SF_VP |
1850                                BRW_NEW_URB_FENCE
1851                              : 0) |
1852                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
1853                (GEN_GEN >= 6 && GEN_GEN <= 7 ?
1854                                BRW_NEW_GS_PROG_DATA |
1855                                BRW_NEW_PRIMITIVE |
1856                                BRW_NEW_TES_PROG_DATA
1857                              : 0) |
1858                (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
1859                                BRW_NEW_FRAGMENT_PROGRAM
1860                              : 0),
1861    },
1862    .emit = genX(upload_sf),
1863 };
1864
1865 /* ---------------------------------------------------------------------- */
1866
1867 static bool
1868 brw_color_buffer_write_enabled(struct brw_context *brw)
1869 {
1870    struct gl_context *ctx = &brw->ctx;
1871    /* BRW_NEW_FRAGMENT_PROGRAM */
1872    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1873    unsigned i;
1874
1875    /* _NEW_BUFFERS */
1876    for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
1877       struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
1878       uint64_t outputs_written = fp->info.outputs_written;
1879
1880       /* _NEW_COLOR */
1881       if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
1882                  outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
1883           GET_COLORMASK(ctx->Color.ColorMask, i)) {
1884          return true;
1885       }
1886    }
1887
1888    return false;
1889 }
1890
/**
 * Emit the WM state for the current fragment program data.
 *
 * On Gen6+ this writes a 3DSTATE_WM packet into the batch; on Gen4-5 it
 * fills an indirect WM_STATE structure via brw_state_emit() and flags
 * BRW_NEW_GEN4_UNIT_STATE.  Additionally:
 *  - Gen6 emits 3DSTATE_CONSTANT_PS immediately before 3DSTATE_WM.
 *  - Gen4-5 emit 3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP afterwards when the
 *    polygon offset clamp differs from the value cached in
 *    brw->wm.offset_clamp.
 *
 * Note that the opening brace of the emit block is supplied by whichever
 * preprocessor branch is selected; the single closing brace near the end
 * of the function matches both.
 */
static void
genX(upload_wm)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* BRW_NEW_FS_PROG_DATA */
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);

   /* These locals are only referenced on some generations, hence UNUSED. */
   UNUSED bool writes_depth =
      wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
   UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;

#if GEN_GEN == 6
   /* We can't fold this into gen6_upload_wm_push_constants(), because
    * according to the SNB PRM, vol 2 part 1 section 7.2.2
    * (3DSTATE_CONSTANT_PS [DevSNB]):
    *
    *     "[DevSNB]: This packet must be followed by WM_STATE."
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
      if (wm_prog_data->base.nr_params != 0) {
         wmcp.Buffer0Valid = true;
         /* Pointer to the WM constant buffer.  Covered by the set of
          * state flags from gen6_upload_wm_push_constants.
          */
         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
#endif

#if GEN_GEN <= 6
      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
#endif

#if GEN_GEN == 4
      /* On gen4, we only have one shader kernel */
      if (brw_wm_state_has_ksp(wm, 0)) {
         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
         wm.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
      }
#elif GEN_GEN == 5
      /* On gen5, we have multiple shader kernels but only one GRF start
       * register for all kernels
       */
      wm.KernelStartPointer0 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
      wm.KernelStartPointer1 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
      wm.KernelStartPointer2 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);

      wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
      wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
      wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);

      wm.DispatchGRFStartRegisterForConstantSetupData0 =
         wm_prog_data->base.dispatch_grf_start_reg;

      /* Dispatch GRF Start should be the same for all shaders on gen5 */
      if (brw_wm_state_has_ksp(wm, 1)) {
         assert(wm_prog_data->base.dispatch_grf_start_reg ==
                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
      }
      if (brw_wm_state_has_ksp(wm, 2)) {
         assert(wm_prog_data->base.dispatch_grf_start_reg ==
                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
      }
#elif GEN_GEN == 6
      /* On gen6, we have multiple shader kernels and we no longer specify a
       * register count for each one.
       */
      wm.KernelStartPointer0 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
      wm.KernelStartPointer1 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
      wm.KernelStartPointer2 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);

      wm.DispatchGRFStartRegisterForConstantSetupData0 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
      wm.DispatchGRFStartRegisterForConstantSetupData1 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
      wm.DispatchGRFStartRegisterForConstantSetupData2 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
#endif

#if GEN_GEN <= 5
      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
      wm.SetupURBEntryReadOffset = 0;
      wm.EarlyDepthTestEnable = true;
#endif

#if GEN_GEN >= 6
      wm.LineAntialiasingRegionWidth = _10pixels;
      wm.LineEndCapAntialiasingRegionWidth = _05pixels;

      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
#else
      /* BRW_NEW_SAMPLER_STATE_TABLE: only point at the sampler table when
       * the stage actually has samplers.
       */
      if (stage_state->sampler_count)
         wm.SamplerStatePointer =
            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);

      wm.LineAntialiasingRegionWidth = _05pixels;
      wm.LineEndCapAntialiasingRegionWidth = _10pixels;

      /* _NEW_POLYGON */
      if (ctx->Polygon.OffsetFill) {
         wm.GlobalDepthOffsetEnable = true;
         /* Something weird going on with legacy_global_depth_bias,
          * offset_constant, scaling and MRD.  This value passes glean
          * but gives some odd results elsewhere (eg. the
          * quad-offset-units test).
          */
         wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;

         /* This is the only value that passes glean:
         */
         wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
      }

      wm.DepthCoefficientURBReadOffset = 1;
#endif

      /* BRW_NEW_STATS_WM */
      wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;

#if GEN_GEN < 7
      if (wm_prog_data->base.use_alt_mode)
         wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;

      /* WA_1606682166
       *
       * Note: this section is only compiled for GEN_GEN < 7, so the
       * GEN_GEN == 11 term below can never be true here; only Gen5 ends up
       * with a sampler count of zero.
       */
      wm.SamplerCount = (GEN_GEN == 5 || GEN_GEN == 11) ?
         0 : DIV_ROUND_UP(stage_state->sampler_count, 4);

      wm.BindingTableEntryCount =
         wm_prog_data->base.binding_table.size_bytes / 4;
      wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;

#if GEN_GEN == 6
      wm.DualSourceBlendEnable =
         wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
         ctx->Color.Blend[0]._UsesDualSrc;
      wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
      wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

      /* From the SNB PRM, volume 2 part 1, page 281:
       * "If the PS kernel does not need the Position XY Offsets
       * to compute a Position XY value, then this field should be
       * programmed to POSOFFSET_NONE."
       *
       * "SW Recommendation: If the PS kernel needs the Position Offsets
       * to compute a Position XY value, this field should match Position
       * ZW Interpolation Mode to ensure a consistent position.xyzw
       * computation."
       * We only require XY sample offsets. So, this recommendation doesn't
       * look useful at the moment. We might need this in future.
       */
      if (wm_prog_data->uses_pos_offset)
         wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
      else
         wm.PositionXYOffsetSelect = POSOFFSET_NONE;
#endif

      if (wm_prog_data->base.total_scratch) {
         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
         /* ffs() is the 1-based index of the lowest set bit, so this is 0
          * for a per-thread size of 1 << 10.  NOTE(review): presumably the
          * field encodes a power-of-two multiple of 1KB -- confirm against
          * the PRM.
          */
         wm.PerThreadScratchSpace =
            ffs(stage_state->per_thread_scratch) - 11;
      }

      wm.PixelShaderComputedDepth = writes_depth;
#endif

      /* _NEW_LINE */
      wm.LineStippleEnable = ctx->Line.StippleFlag;

      /* _NEW_POLYGON */
      wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;

#if GEN_GEN < 8

#if GEN_GEN >= 6
      wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;

      /* _NEW_BUFFERS */
      const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;

      if (multisampled_fbo) {
         /* _NEW_MULTISAMPLE */
         if (ctx->Multisample.Enabled)
            wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
         else
            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;

         if (wm_prog_data->persample_dispatch)
            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
         else
            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
      } else {
         wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
      }
#endif
      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
      /* Any mechanism that can discard a pixel after shading counts as a
       * "kill": explicit discard, alpha test, alpha-to-coverage, and (Gen6+)
       * a shader-written output mask.
       */
      if (wm_prog_data->uses_kill ||
          _mesa_is_alpha_test_enabled(ctx) ||
          _mesa_is_alpha_to_coverage_enabled(ctx) ||
          (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
         wm.PixelShaderKillsPixel = true;
      }

      /* _NEW_BUFFERS | _NEW_COLOR
       *
       * Only dispatch threads when the shader has some observable effect.
       */
      if (brw_color_buffer_write_enabled(brw) || writes_depth ||
          wm.PixelShaderKillsPixel ||
          (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
         wm.ThreadDispatchEnable = true;
      }

#if GEN_GEN >= 7
      wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
#endif

      /* The "UAV access enable" bits are unnecessary on HSW because they only
       * seem to have an effect on the HW-assisted coherency mechanism which we
       * don't need, and the rasterization-related UAV_ONLY flag and the
       * DISPATCH_ENABLE bit can be set independently from it.
       * C.f. gen8_upload_ps_extra().
       *
       * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
       * _NEW_COLOR
       */
#if GEN_IS_HASWELL
      if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
          wm_prog_data->has_side_effects)
         wm.PSUAVonly = ON;
#endif
#endif

#if GEN_GEN >= 7
      /* BRW_NEW_FS_PROG_DATA */
      if (wm_prog_data->early_fragment_tests)
         wm.EarlyDepthStencilControl = EDSC_PREPS;
      else if (wm_prog_data->has_side_effects)
         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
#endif
   }

#if GEN_GEN <= 5
   /* Re-emit the clamp packet only when the GL value differs from the one
    * cached in brw->wm.offset_clamp, then update the cache.
    */
   if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
      brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
         clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
      }

      brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
   }
#endif
}
2167
2168 static const struct brw_tracked_state genX(wm_state) = {
2169    .dirty = {
2170       .mesa  = _NEW_LINE |
2171                _NEW_POLYGON |
2172                (GEN_GEN < 8 ? _NEW_BUFFERS |
2173                               _NEW_COLOR :
2174                               0) |
2175                (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
2176                (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) |
2177                (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0),
2178       .brw   = BRW_NEW_BLORP |
2179                BRW_NEW_FS_PROG_DATA |
2180                (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2181                               BRW_NEW_FRAGMENT_PROGRAM |
2182                               BRW_NEW_PROGRAM_CACHE |
2183                               BRW_NEW_SAMPLER_STATE_TABLE |
2184                               BRW_NEW_STATS_WM
2185                             : 0) |
2186                (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
2187    },
2188    .emit = genX(upload_wm),
2189 };
2190
2191 /* ---------------------------------------------------------------------- */
2192
/* We restrict scratch buffers to the bottom 32 bits of the address space
 * by using rw_32_bo().
 *
 * General State Base Address is a bit broken.  If the address + size as
 * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
 * all accesses to the buffer as being out of bounds and returns zero.
 */

/**
 * Fill in the thread-dispatch fields shared by the per-stage state packets.
 *
 * \param pkt     the packet/struct variable being filled (e.g. "vs").
 * \param prefix  token pasted in front of "URBEntryReadLength" and
 *                "URBEntryReadOffset" to form the stage-specific field
 *                names (e.g. Vertex -> VertexURBEntryReadLength).
 *
 * The expansion references the following names from the enclosing scope,
 * which the caller must have declared: brw, stage_state, stage_prog_data
 * and vue_prog_data.
 *
 * The macro expands to several statements (including an "if" block) and
 * is not wrapped in do { } while (0), so it must be used as a statement
 * on its own inside a braced block, never as the unbraced body of an
 * if/else/loop.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
   pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
   /* WA_1606682166 */                                                    \
   pkt.SamplerCount       =                                               \
      GEN_GEN == 11 ?                                                     \
      0 :                                                                 \
      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
   /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to     \
    * disable prefetching of binding tables in A0 and B0 steppings.       \
    * TODO: Revisit this WA on C0 stepping.                               \
    */                                                                    \
   pkt.BindingTableEntryCount =                                           \
      GEN_GEN == 11 ?                                                     \
      0 :                                                                 \
      stage_prog_data->binding_table.size_bytes / 4;                      \
   pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                          \
   if (stage_prog_data->total_scratch) {                                  \
      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
      pkt.PerThreadScratchSpace =                                         \
         ffs(stage_state->per_thread_scratch) - 11;                       \
   }                                                                      \
                                                                          \
   pkt.DispatchGRFStartRegisterForURBData =                               \
      stage_prog_data->dispatch_grf_start_reg;                            \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
   pkt.prefix##URBEntryReadOffset = 0;                                    \
                                                                          \
   pkt.StatisticsEnable = true;                                           \
   pkt.Enable           = true;
2231
/**
 * Emit the vertex shader state for the current VS program data.
 *
 * On Gen6+ this writes a 3DSTATE_VS packet into the batch; on Gen4-5 it
 * fills an indirect VS_STATE structure via brw_state_emit() and flags
 * BRW_NEW_GEN4_UNIT_STATE.  Additionally:
 *  - Gen6 emits 3DSTATE_CONSTANT_VS before the VS packet and a
 *    PIPE_CONTROL flush after it.
 *  - Gen7 Ivybridge calls gen7_emit_vs_workaround_flush() first.
 *
 * The common fields come from INIT_THREAD_DISPATCH_FIELDS; several of them
 * are then deliberately overridden by the generation-specific sections
 * below, so the order of assignments matters.
 *
 * Note that the opening brace of the emit block is supplied by whichever
 * preprocessor branch is selected; the single closing brace matches both.
 */
static void
genX(upload_vs_state)(struct brw_context *brw)
{
   UNUSED struct gl_context *ctx = &brw->ctx;
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_stage_state *stage_state = &brw->vs.base;

   /* BRW_NEW_VS_PROG_DATA */
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(brw->vs.base.prog_data);
   const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;

   /* Only SIMD8 and 4x2 dual-object dispatch are handled here, and Gen11+
    * must be SIMD8.
    */
   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
          vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
   assert(GEN_GEN < 11 ||
          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);

#if GEN_GEN == 6
   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
    *   command that causes the VS Function Enable to toggle. Pipeline
    *   flush can be executed by sending a PIPE_CONTROL command with CS
    *   stall bit set and a post sync operation.
    *
    * We've already done such a flush at the start of state upload, so we
    * don't need to do another one here.
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
      if (stage_state->push_const_size != 0) {
         cvs.Buffer0Valid = true;
         cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

   if (GEN_GEN == 7 && devinfo->is_ivybridge)
      gen7_emit_vs_workaround_flush(brw);

#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
#endif
      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);

      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;

#if GEN_GEN < 6
      vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
      vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
      vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;

      vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
      vs.URBEntryAllocationSize = brw->urb.vsize - 1;

      /* Overrides the generic thread count set just above: here the limit
       * is derived from the number of VS URB entries, clamped to
       * [1, max_vs_threads].
       */
      vs.MaximumNumberofThreads =
         CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;

      /* Overrides the "true" set by INIT_THREAD_DISPATCH_FIELDS. */
      vs.StatisticsEnable = false;
      vs.SamplerStatePointer =
         ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
#endif

#if GEN_GEN == 5
      /* Force single program flow on Ironlake.  We cannot reliably get
       * all applications working without it.  See:
       * https://bugs.freedesktop.org/show_bug.cgi?id=29172
       *
       * The most notable and reliably failing application is the Humus
       * demo "CelShading"
       */
      vs.SingleProgramFlow = true;
      vs.SamplerCount = 0; /* hardware requirement */
#endif

#if GEN_GEN >= 8
      vs.SIMD8DispatchEnable =
         vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;

      vs.UserClipDistanceCullTestEnableBitmask =
         vue_prog_data->cull_distance_mask;
#endif
   }

#if GEN_GEN == 6
   /* Based on my reading of the simulator, the VS constants don't get
    * pulled into the VS FF unit until an appropriate pipeline flush
    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
    * references to them into a little FIFO.  The flushes are common,
    * but don't reliably happen between this and a 3DPRIMITIVE, causing
    * the primitive to use the wrong constants.  Then the FIFO
    * containing the constant setup gets added to again on the next
    * constants change, and eventually when a flush does happen the
    * unit is overwhelmed by constant changes and dies.
    *
    * To avoid this, send a PIPE_CONTROL down the line that will
    * update the unit immediately loading the constants.  The flush
    * type bits here were those set by the STATE_BASE_ADDRESS whose
    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
    * bug reports that led to this workaround, and may be more than
    * what is strictly required to avoid the issue.
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_DEPTH_STALL |
                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                               PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
2344
2345 static const struct brw_tracked_state genX(vs_state) = {
2346    .dirty = {
2347       .mesa  = (GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
2348       .brw   = BRW_NEW_BATCH |
2349                BRW_NEW_BLORP |
2350                BRW_NEW_CONTEXT |
2351                BRW_NEW_VS_PROG_DATA |
2352                (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
2353                (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2354                                BRW_NEW_PROGRAM_CACHE |
2355                                BRW_NEW_SAMPLER_STATE_TABLE |
2356                                BRW_NEW_URB_FENCE
2357                              : 0),
2358    },
2359    .emit = genX(upload_vs_state),
2360 };
2361
2362 /* ---------------------------------------------------------------------- */
2363
2364 static void
2365 genX(upload_cc_viewport)(struct brw_context *brw)
2366 {
2367    struct gl_context *ctx = &brw->ctx;
2368
2369    /* BRW_NEW_VIEWPORT_COUNT */
2370    const unsigned viewport_count = brw->clip.viewport_count;
2371
2372    struct GENX(CC_VIEWPORT) ccv;
2373    uint32_t cc_vp_offset;
2374    uint32_t *cc_map =
2375       brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
2376                       32, &cc_vp_offset);
2377
2378    for (unsigned i = 0; i < viewport_count; i++) {
2379       /* _NEW_VIEWPORT | _NEW_TRANSFORM */
2380       const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
2381       if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
2382          ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2383          ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2384       } else if (ctx->Transform.DepthClampNear) {
2385          ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2386          ccv.MaximumDepth = 0.0;
2387       } else if (ctx->Transform.DepthClampFar) {
2388          ccv.MinimumDepth = 0.0;
2389          ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2390       } else {
2391          ccv.MinimumDepth = 0.0;
2392          ccv.MaximumDepth = 1.0;
2393       }
2394       GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
2395       cc_map += GENX(CC_VIEWPORT_length);
2396    }
2397
2398 #if GEN_GEN >= 7
2399    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
2400       ptr.CCViewportPointer = cc_vp_offset;
2401    }
2402 #elif GEN_GEN == 6
2403    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2404       vp.CCViewportStateChange = 1;
2405       vp.PointertoCC_VIEWPORT = cc_vp_offset;
2406    }
2407 #else
2408    brw->cc.vp_offset = cc_vp_offset;
2409    ctx->NewDriverState |= BRW_NEW_CC_VP;
2410 #endif
2411 }
2412
2413 const struct brw_tracked_state genX(cc_vp) = {
2414    .dirty = {
2415       .mesa = _NEW_TRANSFORM |
2416               _NEW_VIEWPORT,
2417       .brw = BRW_NEW_BATCH |
2418              BRW_NEW_BLORP |
2419              BRW_NEW_VIEWPORT_COUNT,
2420    },
2421    .emit = genX(upload_cc_viewport)
2422 };
2423
2424 /* ---------------------------------------------------------------------- */
2425
2426 static void
2427 set_scissor_bits(const struct gl_context *ctx, int i,
2428                  bool flip_y, unsigned fb_width, unsigned fb_height,
2429                  struct GENX(SCISSOR_RECT) *sc)
2430 {
2431    int bbox[4];
2432
2433    bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2434    bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2435    bbox[2] = MAX2(ctx->ViewportArray[i].Y, 0);
2436    bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2437    _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2438
2439    if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2440       /* If the scissor was out of bounds and got clamped to 0 width/height
2441        * at the bounds, the subtraction of 1 from maximums could produce a
2442        * negative number and thus not clip anything.  Instead, just provide
2443        * a min > max scissor inside the bounds, which produces the expected
2444        * no rendering.
2445        */
2446       sc->ScissorRectangleXMin = 1;
2447       sc->ScissorRectangleXMax = 0;
2448       sc->ScissorRectangleYMin = 1;
2449       sc->ScissorRectangleYMax = 0;
2450    } else if (!flip_y) {
2451       /* texmemory: Y=0=bottom */
2452       sc->ScissorRectangleXMin = bbox[0];
2453       sc->ScissorRectangleXMax = bbox[1] - 1;
2454       sc->ScissorRectangleYMin = bbox[2];
2455       sc->ScissorRectangleYMax = bbox[3] - 1;
2456    } else {
2457       /* memory: Y=0=top */
2458       sc->ScissorRectangleXMin = bbox[0];
2459       sc->ScissorRectangleXMax = bbox[1] - 1;
2460       sc->ScissorRectangleYMin = fb_height - bbox[3];
2461       sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2462    }
2463 }
2464
2465 #if GEN_GEN >= 6
2466 static void
2467 genX(upload_scissor_state)(struct brw_context *brw)
2468 {
2469    struct gl_context *ctx = &brw->ctx;
2470    const bool flip_y = ctx->DrawBuffer->FlipY;
2471    struct GENX(SCISSOR_RECT) scissor;
2472    uint32_t scissor_state_offset;
2473    const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2474    const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2475    uint32_t *scissor_map;
2476
2477    /* BRW_NEW_VIEWPORT_COUNT */
2478    const unsigned viewport_count = brw->clip.viewport_count;
2479
2480    scissor_map = brw_state_batch(
2481       brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
2482       32, &scissor_state_offset);
2483
2484    /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
2485
2486    /* The scissor only needs to handle the intersection of drawable and
2487     * scissor rect.  Clipping to the boundaries of static shared buffers
2488     * for front/back/depth is covered by looping over cliprects in brw_draw.c.
2489     *
2490     * Note that the hardware's coordinates are inclusive, while Mesa's min is
2491     * inclusive but max is exclusive.
2492     */
2493    for (unsigned i = 0; i < viewport_count; i++) {
2494       set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
2495       GENX(SCISSOR_RECT_pack)(
2496          NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
2497    }
2498
2499    brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2500       ptr.ScissorRectPointer = scissor_state_offset;
2501    }
2502 }
2503
2504 static const struct brw_tracked_state genX(scissor_state) = {
2505    .dirty = {
2506       .mesa = _NEW_BUFFERS |
2507               _NEW_SCISSOR |
2508               _NEW_VIEWPORT,
2509       .brw = BRW_NEW_BATCH |
2510              BRW_NEW_BLORP |
2511              BRW_NEW_VIEWPORT_COUNT,
2512    },
2513    .emit = genX(upload_scissor_state),
2514 };
2515 #endif
2516
2517 /* ---------------------------------------------------------------------- */
2518
2519 static void
2520 brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2521                              float m00, float m11, float m30, float m31,
2522                              float *xmin, float *xmax,
2523                              float *ymin, float *ymax)
2524 {
2525    /* According to the "Vertex X,Y Clamping and Quantization" section of the
2526     * Strips and Fans documentation:
2527     *
2528     * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2529     *  fixed-point "guardband" range supported by the rasterization hardware"
2530     *
2531     * and
2532     *
2533     * "In almost all circumstances, if an object’s vertices are actually
2534     *  modified by this clamping (i.e., had X or Y coordinates outside of
2535     *  the guardband extent the rendered object will not match the intended
2536     *  result.  Therefore software should take steps to ensure that this does
2537     *  not happen - e.g., by clipping objects such that they do not exceed
2538     *  these limits after the Drawing Rectangle is applied."
2539     *
2540     * I believe the fundamental restriction is that the rasterizer (in
2541     * the SF/WM stages) have a limit on the number of pixels that can be
2542     * rasterized.  We need to ensure any coordinates beyond the rasterizer
2543     * limit are handled by the clipper.  So effectively that limit becomes
2544     * the clipper's guardband size.
2545     *
2546     * It goes on to say:
2547     *
2548     * "In addition, in order to be correctly rendered, objects must have a
2549     *  screenspace bounding box not exceeding 8K in the X or Y direction.
2550     *  This additional restriction must also be comprehended by software,
2551     *  i.e., enforced by use of clipping."
2552     *
2553     * This makes no sense.  Gen7+ hardware supports 16K render targets,
2554     * and you definitely need to be able to draw polygons that fill the
2555     * surface.  Our assumption is that the rasterizer was limited to 8K
2556     * on Sandybridge, which only supports 8K surfaces, and it was actually
2557     * increased to 16K on Ivybridge and later.
2558     *
2559     * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
2560     */
2561    const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2562
2563    /* Workaround: prevent gpu hangs on SandyBridge
2564     * by disabling guardband clipping for odd dimensions.
2565     */
2566    if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
2567       *xmin = -1.0f;
2568       *xmax =  1.0f;
2569       *ymin = -1.0f;
2570       *ymax =  1.0f;
2571       return;
2572    }
2573
2574    if (m00 != 0 && m11 != 0) {
2575       /* First, we compute the screen-space render area */
2576       const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
2577       const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
2578       const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
2579       const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
2580
2581       /* We want the guardband to be centered on that */
2582       const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
2583       const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
2584       const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
2585       const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
2586
2587       /* Now we need it in native device coordinates */
2588       const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
2589       const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
2590       const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
2591       const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
2592
2593       /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
2594        * flipped upside-down.  X should be fine though.
2595        */
2596       assert(ndc_gb_xmin <= ndc_gb_xmax);
2597       *xmin = ndc_gb_xmin;
2598       *xmax = ndc_gb_xmax;
2599       *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
2600       *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
2601    } else {
2602       /* The viewport scales to 0, so nothing will be rendered. */
2603       *xmin = 0.0f;
2604       *xmax = 0.0f;
2605       *ymin = 0.0f;
2606       *ymax = 0.0f;
2607    }
2608 }
2609
2610 static void
2611 genX(upload_sf_clip_viewport)(struct brw_context *brw)
2612 {
2613    struct gl_context *ctx = &brw->ctx;
2614    float y_scale, y_bias;
2615
2616    /* BRW_NEW_VIEWPORT_COUNT */
2617    const unsigned viewport_count = brw->clip.viewport_count;
2618
2619    /* _NEW_BUFFERS */
2620    const bool flip_y = ctx->DrawBuffer->FlipY;
2621    const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2622    const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2623
2624 #if GEN_GEN >= 7
2625 #define clv sfv
2626    struct GENX(SF_CLIP_VIEWPORT) sfv;
2627    uint32_t sf_clip_vp_offset;
2628    uint32_t *sf_clip_map =
2629       brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2630                       64, &sf_clip_vp_offset);
2631 #else
2632    struct GENX(SF_VIEWPORT) sfv;
2633    struct GENX(CLIP_VIEWPORT) clv;
2634    uint32_t sf_vp_offset, clip_vp_offset;
2635    uint32_t *sf_map =
2636       brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2637                       32, &sf_vp_offset);
2638    uint32_t *clip_map =
2639       brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2640                       32, &clip_vp_offset);
2641 #endif
2642
2643    /* _NEW_BUFFERS */
2644    if (flip_y) {
2645       y_scale = -1.0;
2646       y_bias = (float)fb_height;
2647    } else {
2648       y_scale = 1.0;
2649       y_bias = 0;
2650    }
2651
2652    for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2653       /* _NEW_VIEWPORT: Guardband Clipping */
2654       float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2655       _mesa_get_viewport_xform(ctx, i, scale, translate);
2656
2657       sfv.ViewportMatrixElementm00 = scale[0];
2658       sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2659       sfv.ViewportMatrixElementm22 = scale[2],
2660       sfv.ViewportMatrixElementm30 = translate[0],
2661       sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2662       sfv.ViewportMatrixElementm32 = translate[2],
2663       brw_calculate_guardband_size(fb_width, fb_height,
2664                                    sfv.ViewportMatrixElementm00,
2665                                    sfv.ViewportMatrixElementm11,
2666                                    sfv.ViewportMatrixElementm30,
2667                                    sfv.ViewportMatrixElementm31,
2668                                    &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2669
2670
2671       clv.XMinClipGuardband = gb_xmin;
2672       clv.XMaxClipGuardband = gb_xmax;
2673       clv.YMinClipGuardband = gb_ymin;
2674       clv.YMaxClipGuardband = gb_ymax;
2675
2676 #if GEN_GEN < 6
2677       set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
2678                        &sfv.ScissorRectangle);
2679 #elif GEN_GEN >= 8
2680       /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2681        * The hardware will take the intersection of the drawing rectangle,
2682        * scissor rectangle, and the viewport extents.  However, emitting
2683        * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full
2684        * pipeline stall so we're better off just being a little more clever
2685        * with our viewport so we can emit it once at context creation time.
2686        */
2687       const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
2688       const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
2689       const float viewport_Xmax =
2690          MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
2691       const float viewport_Ymax =
2692          MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
2693
2694       if (flip_y) {
2695          sfv.XMinViewPort = viewport_Xmin;
2696          sfv.XMaxViewPort = viewport_Xmax - 1;
2697          sfv.YMinViewPort = fb_height - viewport_Ymax;
2698          sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
2699       } else {
2700          sfv.XMinViewPort = viewport_Xmin;
2701          sfv.XMaxViewPort = viewport_Xmax - 1;
2702          sfv.YMinViewPort = viewport_Ymin;
2703          sfv.YMaxViewPort = viewport_Ymax - 1;
2704       }
2705 #endif
2706
2707 #if GEN_GEN >= 7
2708       GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2709       sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2710 #else
2711       GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2712       GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2713       sf_map += GENX(SF_VIEWPORT_length);
2714       clip_map += GENX(CLIP_VIEWPORT_length);
2715 #endif
2716    }
2717
2718 #if GEN_GEN >= 7
2719    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2720       ptr.SFClipViewportPointer = sf_clip_vp_offset;
2721    }
2722 #elif GEN_GEN == 6
2723    brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2724       vp.SFViewportStateChange = 1;
2725       vp.CLIPViewportStateChange = 1;
2726       vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2727       vp.PointertoSF_VIEWPORT = sf_vp_offset;
2728    }
2729 #else
2730    brw->sf.vp_offset = sf_vp_offset;
2731    brw->clip.vp_offset = clip_vp_offset;
2732    brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2733 #endif
2734 }
2735
2736 static const struct brw_tracked_state genX(sf_clip_viewport) = {
2737    .dirty = {
2738       .mesa = _NEW_BUFFERS |
2739               _NEW_VIEWPORT |
2740               (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
2741       .brw = BRW_NEW_BATCH |
2742              BRW_NEW_BLORP |
2743              BRW_NEW_VIEWPORT_COUNT,
2744    },
2745    .emit = genX(upload_sf_clip_viewport),
2746 };
2747
2748 /* ---------------------------------------------------------------------- */
2749
/**
 * Emit the geometry-shader unit state.
 *
 * On Gen6+ this is a 3DSTATE_GS packet in the batch; on earlier generations
 * it is a GS_STATE structure emitted through brw_state_emit(), with its
 * offset stored in brw->ff_gs.state_offset and BRW_NEW_GEN4_UNIT_STATE
 * flagged.
 *
 * Three cases are programmed:
 *  - a geometry program is bound (Gen6+ only): thread dispatch fields plus
 *    the generation-specific GS fields are filled from the program data;
 *  - no geometry program, but brw->ff_gs.prog_active is set (Gen6 and
 *    earlier): the fixed-function GS program is programmed instead;
 *  - neither: only a few default fields are set.
 *
 * Gen6 additionally emits 3DSTATE_CONSTANT_GS first, and Gen7 non-Haswell
 * GT2 parts get a CS stall flush when the GS enable is about to change.
 */
static void
genX(upload_gs_state)(struct brw_context *brw)
{
   UNUSED struct gl_context *ctx = &brw->ctx;
   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->gs.base;
   const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
   /* BRW_NEW_GEOMETRY_PROGRAM */
   /* A bound geometry program only counts as "active" on Gen6+. */
   bool active = GEN_GEN >= 6 && gs_prog;

   /* BRW_NEW_GS_PROG_DATA */
   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
   UNUSED const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_prog_data);
#if GEN_GEN >= 7
   const struct brw_gs_prog_data *gs_prog_data =
      brw_gs_prog_data(stage_prog_data);
#endif

#if GEN_GEN == 6
   /* The packet is always emitted; the push constant buffer is only marked
    * valid when a GS is active and has push constants.
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
      if (active && stage_state->push_const_size != 0) {
         cgs.Buffer0Valid = true;
         cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

#if GEN_GEN == 7 && !GEN_IS_HASWELL
   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
    *     whole fixed function pipeline when the GS enable changes value in
    *     the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (devinfo->gt == 2 && brw->gs.enabled != active)
      gen7_emit_cs_stall_flush(brw);
#endif

   /* Both alternatives below open the same block; everything up to the
    * matching closing brace fills in the "gs" structure.
    */
#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
#endif

#if GEN_GEN >= 6
      if (active) {
         INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);

#if GEN_GEN >= 7
         gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
         gs.OutputTopology = gs_prog_data->output_topology;
         gs.ControlDataHeaderSize =
            gs_prog_data->control_data_header_size_hwords;

         gs.InstanceControl = gs_prog_data->invocations - 1;
         gs.DispatchMode = vue_prog_data->dispatch_mode;

         gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;

         gs.ControlDataFormat = gs_prog_data->control_data_format;
#endif

         /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
          * Ivy Bridge and Haswell.
          *
          * On Ivy Bridge, setting this bit causes the vertices of a triangle
          * strip to be delivered to the geometry shader in an order that does
          * not strictly follow the OpenGL spec, but preserves triangle
          * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
          * the geometry shader sees triangles:
          *
          * (1, 2, 3), (2, 4, 3), (3, 4, 5)
          *
          * (Clearing the bit is even worse, because it fails to preserve
          * orientation).
          *
          * Triangle strips with adjacency always ordered in a way that preserves
          * triangle orientation but does not strictly follow the OpenGL spec,
          * regardless of the setting of this bit.
          *
          * On Haswell, both triangle strips and triangle strips with adjacency
          * are always ordered in a way that preserves triangle orientation.
          * Setting this bit causes the ordering to strictly follow the OpenGL
          * spec.
          *
          * So in either case we want to set the bit.  Unfortunately on Ivy
          * Bridge this will get the order close to correct but not perfect.
          */
         gs.ReorderMode = TRAILING;
         gs.MaximumNumberofThreads =
            GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
                         : (devinfo->max_gs_threads - 1);

#if GEN_GEN < 7
         gs.SOStatisticsEnable = true;
         if (gs_prog->info.has_transform_feedback_varyings)
            gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);

         /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
          * was previously done for gen6.
          *
          * TODO: test with both disabled to see if the HW is behaving
          * as expected, like in gen7.
          */
         gs.SingleProgramFlow = true;
         gs.VectorMaskEnable = true;
#endif

#if GEN_GEN >= 8
         gs.ExpectedVertexCount = gs_prog_data->vertices_in;

         if (gs_prog_data->static_vertex_count != -1) {
            gs.StaticOutput = true;
            gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
         }
         gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;

         gs.UserClipDistanceCullTestEnableBitmask =
            vue_prog_data->cull_distance_mask;

         const int urb_entry_write_offset = 1;
         const uint32_t urb_entry_output_length =
            DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
            urb_entry_write_offset;

         gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
         gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
#endif
      }
#endif

#if GEN_GEN <= 6
      if (!active && brw->ff_gs.prog_active) {
         /* In gen6, transform feedback for the VS stage is done with an
          * ad-hoc GS program. This function provides the needed 3DSTATE_GS
          * for this.
          */
         gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
         gs.SingleProgramFlow = true;
         gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
         gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;

#if GEN_GEN <= 5
         gs.GRFRegisterCount =
            DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
         /* BRW_NEW_URB_FENCE */
         gs.NumberofURBEntries = brw->urb.nr_gs_entries;
         gs.URBEntryAllocationSize = brw->urb.vsize - 1;
         gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
         gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
#else
         gs.Enable = true;
         gs.VectorMaskEnable = true;
         gs.SVBIPayloadEnable = true;
         gs.SVBIPostIncrementEnable = true;
         gs.SVBIPostIncrementValue =
            brw->ff_gs.prog_data->svbi_postincrement_value;
         gs.SOStatisticsEnable = true;
         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
#endif
      }
#endif
      /* Neither a geometry program nor the fixed-function GS program. */
      if (!active && !brw->ff_gs.prog_active) {
#if GEN_GEN < 8
         gs.DispatchGRFStartRegisterForURBData = 1;
#if GEN_GEN >= 7
         gs.IncludeVertexHandles = true;
#endif
#endif
      }

#if GEN_GEN >= 6
      gs.StatisticsEnable = true;
#endif
#if GEN_GEN == 5 || GEN_GEN == 6
      gs.RenderingEnabled = true;
#endif
#if GEN_GEN <= 5
      gs.MaximumVPIndex = brw->clip.viewport_count - 1;
#endif
   }

   /* NOTE(review): brw->gs.enabled is compared against "active" by the Gen7
    * non-Haswell workaround above, but this function only updates it on
    * Gen6.  Confirm whether it is maintained elsewhere for Gen7, or whether
    * this guard should also cover that configuration.
    */
#if GEN_GEN == 6
   brw->gs.enabled = active;
#endif
}
2945
2946 static const struct brw_tracked_state genX(gs_state) = {
2947    .dirty = {
2948       .mesa  = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
2949       .brw   = BRW_NEW_BATCH |
2950                BRW_NEW_BLORP |
2951                (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2952                                BRW_NEW_PROGRAM_CACHE |
2953                                BRW_NEW_URB_FENCE |
2954                                BRW_NEW_VIEWPORT_COUNT
2955                              : 0) |
2956                (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
2957                                BRW_NEW_GEOMETRY_PROGRAM |
2958                                BRW_NEW_GS_PROG_DATA
2959                              : 0) |
2960                (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
2961    },
2962    .emit = genX(upload_gs_state),
2963 };
2964
2965 /* ---------------------------------------------------------------------- */
2966
2967 UNUSED static GLenum
2968 fix_dual_blend_alpha_to_one(GLenum function)
2969 {
2970    switch (function) {
2971    case GL_SRC1_ALPHA:
2972       return GL_ONE;
2973
2974    case GL_ONE_MINUS_SRC1_ALPHA:
2975       return GL_ZERO;
2976    }
2977
2978    return function;
2979 }
2980
/* Shorthand wrappers around brw_translate_blend_factor() and
 * brw_translate_blend_equation().
 */
#define blend_factor(x) brw_translate_blend_factor(x)
#define blend_eqn(x) brw_translate_blend_equation(x)
2983
2984 /**
2985  * Modify blend function to force destination alpha to 1.0
2986  *
2987  * If \c function specifies a blend function that uses destination alpha,
2988  * replace it with a function that hard-wires destination alpha to 1.0.  This
2989  * is used when rendering to xRGB targets.
2990  */
2991 static GLenum
2992 brw_fix_xRGB_alpha(GLenum function)
2993 {
2994    switch (function) {
2995    case GL_DST_ALPHA:
2996       return GL_ONE;
2997
2998    case GL_ONE_MINUS_DST_ALPHA:
2999    case GL_SRC_ALPHA_SATURATE:
3000       return GL_ZERO;
3001    }
3002
3003    return function;
3004 }
3005
/* BLEND_ENTRY_GENXML names the genxml structure that set_blend_entry_bits()
 * fills in: BLEND_STATE_ENTRY on Gen6+, COLOR_CALC_STATE on earlier
 * generations.
 */
#if GEN_GEN >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
3011
3012 UNUSED static bool
3013 set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
3014                      bool alpha_to_one)
3015 {
3016    struct gl_context *ctx = &brw->ctx;
3017
3018    /* _NEW_BUFFERS */
3019    const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
3020
3021    bool independent_alpha_blend = false;
3022
3023    /* Used for implementing the following bit of GL_EXT_texture_integer:
3024     * "Per-fragment operations that require floating-point color
3025     *  components, including multisample alpha operations, alpha test,
3026     *  blending, and dithering, have no effect when the corresponding
3027     *  colors are written to an integer color buffer."
3028     */
3029    const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);
3030
3031    const unsigned blend_enabled = GEN_GEN >= 6 ?
3032       ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;
3033
3034    /* _NEW_COLOR */
3035    if (ctx->Color.ColorLogicOpEnabled) {
3036       GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
3037          : GL_UNSIGNED_NORMALIZED;
3038       WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
3039                 rb_type != GL_UNSIGNED_NORMALIZED &&
3040                 rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
3041                 "renderbuffer\n",
3042                 _mesa_enum_to_string(ctx->Color.LogicOp),
3043                 _mesa_enum_to_string(rb_type));
3044       if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
3045          entry->LogicOpEnable = true;
3046          entry->LogicOpFunction = ctx->Color._LogicOp;
3047       }
3048    } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
3049               && (GEN_GEN <= 5 || !integer)) {
3050       GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
3051       GLenum eqA = ctx->Color.Blend[i].EquationA;
3052       GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
3053       GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
3054       GLenum srcA = ctx->Color.Blend[i].SrcA;
3055       GLenum dstA = ctx->Color.Blend[i].DstA;
3056
3057       if (eqRGB == GL_MIN || eqRGB == GL_MAX)
3058          srcRGB = dstRGB = GL_ONE;
3059
3060       if (eqA == GL_MIN || eqA == GL_MAX)
3061          srcA = dstA = GL_ONE;
3062
3063       /* Due to hardware limitations, the destination may have information
3064        * in an alpha channel even when the format specifies no alpha
3065        * channel. In order to avoid getting any incorrect blending due to
3066        * that alpha channel, coerce the blend factors to values that will
3067        * not read the alpha channel, but will instead use the correct
3068        * implicit value for alpha.
3069        */
3070       if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
3071                                                GL_TEXTURE_ALPHA_TYPE)) {
3072          srcRGB = brw_fix_xRGB_alpha(srcRGB);
3073          srcA = brw_fix_xRGB_alpha(srcA);
3074          dstRGB = brw_fix_xRGB_alpha(dstRGB);
3075          dstA = brw_fix_xRGB_alpha(dstA);
3076       }
3077
3078       /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
3079        * "If Dual Source Blending is enabled, this bit must be disabled."
3080        *
3081        * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
3082        * and leave it enabled anyway.
3083        */
3084       if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
3085          srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
3086          srcA = fix_dual_blend_alpha_to_one(srcA);
3087          dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
3088          dstA = fix_dual_blend_alpha_to_one(dstA);
3089       }
3090
3091       /* BRW_NEW_FS_PROG_DATA */
3092       const struct brw_wm_prog_data *wm_prog_data =
3093          brw_wm_prog_data(brw->wm.base.prog_data);
3094
3095       /* The Dual Source Blending documentation says:
3096        *
3097        * "If SRC1 is included in a src/dst blend factor and
3098        * a DualSource RT Write message is not used, results
3099        * are UNDEFINED. (This reflects the same restriction in DX APIs,
3100        * where undefined results are produced if “o1” is not written
3101        * by a PS – there are no default values defined).
3102        * If SRC1 is not included in a src/dst blend factor,
3103        * dual source blending must be disabled."
3104        *
3105        * There is no way to gracefully fix this undefined situation
3106        * so we just disable the blending to prevent possible issues.
3107        */
3108       entry->ColorBufferBlendEnable =
3109          !ctx->Color.Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
3110
3111       entry->DestinationBlendFactor = blend_factor(dstRGB);
3112       entry->SourceBlendFactor = blend_factor(srcRGB);
3113       entry->DestinationAlphaBlendFactor = blend_factor(dstA);
3114       entry->SourceAlphaBlendFactor = blend_factor(srcA);
3115       entry->ColorBlendFunction = blend_eqn(eqRGB);
3116       entry->AlphaBlendFunction = blend_eqn(eqA);
3117
3118       if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
3119          independent_alpha_blend = true;
3120    }
3121
3122    return independent_alpha_blend;
3123 }
3124
3125 #if GEN_GEN >= 6
3126 static void
3127 genX(upload_blend_state)(struct brw_context *brw)
3128 {
3129    struct gl_context *ctx = &brw->ctx;
3130    int size;
3131
3132    /* We need at least one BLEND_STATE written, because we might do
3133     * thread dispatch even if _NumColorDrawBuffers is 0 (for example
3134     * for computed depth or alpha test), which will do an FB write
3135     * with render target 0, which will reference BLEND_STATE[0] for
3136     * alpha test enable.
3137     */
3138    int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
3139    if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
3140       nr_draw_buffers = 1;
3141
3142    size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
3143 #if GEN_GEN >= 8
3144    size += GENX(BLEND_STATE_length) * 4;
3145 #endif
3146
3147    uint32_t *blend_map;
3148    blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);
3149
3150 #if GEN_GEN >= 8
3151    struct GENX(BLEND_STATE) blend = { 0 };
3152    {
3153 #else
3154    for (int i = 0; i < nr_draw_buffers; i++) {
3155       struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
3156 #define blend entry
3157 #endif
3158       /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
3159        * "If drawbuffer zero is not NONE and the buffer it references has an
3160        * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
3161        * operations are skipped."
3162        */
3163       if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
3164          /* _NEW_MULTISAMPLE */
3165          if (_mesa_is_multisample_enabled(ctx)) {
3166             if (ctx->Multisample.SampleAlphaToCoverage) {
3167                blend.AlphaToCoverageEnable = true;
3168                blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
3169             }
3170             if (ctx->Multisample.SampleAlphaToOne)
3171                blend.AlphaToOneEnable = true;
3172          }
3173
3174          /* _NEW_COLOR */
3175          if (ctx->Color.AlphaEnabled) {
3176             blend.AlphaTestEnable = true;
3177             blend.AlphaTestFunction =
3178                intel_translate_compare_func(ctx->Color.AlphaFunc);
3179          }
3180
3181          if (ctx->Color.DitherFlag) {
3182             blend.ColorDitherEnable = true;
3183          }
3184       }
3185
3186 #if GEN_GEN >= 8
3187       for (int i = 0; i < nr_draw_buffers; i++) {
3188          struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
3189 #else
3190       {
3191 #endif
3192          blend.IndependentAlphaBlendEnable =
3193             set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
3194             blend.IndependentAlphaBlendEnable;
3195
3196          /* See section 8.1.6 "Pre-Blend Color Clamping" of the
3197           * SandyBridge PRM Volume 2 Part 1 for HW requirements.
3198           *
3199           * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
3200           * clamping in the fragment shader.  For its clamping of
3201           * blending, the spec says:
3202           *
3203           *     "RESOLVED: For fixed-point color buffers, the inputs and
3204           *      the result of the blending equation are clamped.  For
3205           *      floating-point color buffers, no clamping occurs."
3206           *
3207           * So, generally, we want clamping to the render target's range.
3208           * And, good news, the hardware tables for both pre- and
3209           * post-blend color clamping are either ignored, or any are
3210           * allowed, or clamping is required but RT range clamping is a
3211           * valid option.
3212           */
3213          entry.PreBlendColorClampEnable = true;
3214          entry.PostBlendColorClampEnable = true;
3215          entry.ColorClampRange = COLORCLAMP_RTFORMAT;
3216
3217          entry.WriteDisableRed   = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
3218          entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
3219          entry.WriteDisableBlue  = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
3220          entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);
3221
3222 #if GEN_GEN >= 8
3223          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
3224 #else
3225          GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
3226 #endif
3227       }
3228    }
3229
3230 #if GEN_GEN >= 8
3231    GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
3232 #endif
3233
3234 #if GEN_GEN < 7
3235    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3236       ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
3237       ptr.BLEND_STATEChange = true;
3238    }
3239 #else
3240    brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
3241       ptr.BlendStatePointer = brw->cc.blend_state_offset;
3242 #if GEN_GEN >= 8
3243       ptr.BlendStatePointerValid = true;
3244 #endif
3245    }
3246 #endif
3247 }
3248
3249 static const struct brw_tracked_state genX(blend_state) = {
3250    .dirty = {
3251       .mesa = _NEW_BUFFERS |
3252               _NEW_COLOR |
3253               _NEW_MULTISAMPLE,
3254       .brw = BRW_NEW_BATCH |
3255              BRW_NEW_BLORP |
3256              BRW_NEW_FS_PROG_DATA |
3257              BRW_NEW_STATE_BASE_ADDRESS,
3258    },
3259    .emit = genX(upload_blend_state),
3260 };
3261 #endif
3262
3263 /* ---------------------------------------------------------------------- */
3264
3265 #if GEN_GEN >= 7
3266 UNUSED static const uint32_t push_constant_opcodes[] = {
3267    [MESA_SHADER_VERTEX]                      = 21,
3268    [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
3269    [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
3270    [MESA_SHADER_GEOMETRY]                    = 22,
3271    [MESA_SHADER_FRAGMENT]                    = 23,
3272    [MESA_SHADER_COMPUTE]                     = 0,
3273 };
3274
3275 static void
3276 genX(upload_push_constant_packets)(struct brw_context *brw)
3277 {
3278    const struct gen_device_info *devinfo = &brw->screen->devinfo;
3279    struct gl_context *ctx = &brw->ctx;
3280
3281    UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;
3282
3283    struct brw_stage_state *stage_states[] = {
3284       &brw->vs.base,
3285       &brw->tcs.base,
3286       &brw->tes.base,
3287       &brw->gs.base,
3288       &brw->wm.base,
3289    };
3290
3291    if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
3292        stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
3293       gen7_emit_vs_workaround_flush(brw);
3294
3295    for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
3296       struct brw_stage_state *stage_state = stage_states[stage];
3297       UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];
3298
3299       if (!stage_state->push_constants_dirty)
3300          continue;
3301
3302       brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
3303          pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
3304          if (stage_state->prog_data) {
3305 #if GEN_GEN >= 8 || GEN_IS_HASWELL
3306             /* The Skylake PRM contains the following restriction:
3307              *
3308              *    "The driver must ensure The following case does not occur
3309              *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
3310              *     buffer 3 read length equal to zero committed followed by a
3311              *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
3312              *     zero committed."
3313              *
3314              * To avoid this, we program the buffers in the highest slots.
3315              * This way, slot 0 is only used if slot 3 is also used.
3316              */
3317             int n = 3;
3318
3319             for (int i = 3; i >= 0; i--) {
3320                const struct brw_ubo_range *range =
3321                   &stage_state->prog_data->ubo_ranges[i];
3322
3323                if (range->length == 0)
3324                   continue;
3325
3326                const struct gl_uniform_block *block =
3327                   prog->sh.UniformBlocks[range->block];
3328                const struct gl_buffer_binding *binding =
3329                   &ctx->UniformBufferBindings[block->Binding];
3330
3331                if (binding->BufferObject == ctx->Shared->NullBufferObj) {
3332                   static unsigned msg_id = 0;
3333                   _mesa_gl_debug(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
3334                                  MESA_DEBUG_TYPE_UNDEFINED,
3335                                  MESA_DEBUG_SEVERITY_HIGH,
3336                                  "UBO %d unbound, %s shader uniform data "
3337                                  "will be undefined.",
3338                                  range->block,
3339                                  _mesa_shader_stage_to_string(stage));
3340                   continue;
3341                }
3342
3343                assert(binding->Offset % 32 == 0);
3344
3345                struct brw_bo *bo = intel_bufferobj_buffer(brw,
3346                   intel_buffer_object(binding->BufferObject),
3347                   binding->Offset, range->length * 32, false);
3348
3349                pkt.ConstantBody.ReadLength[n] = range->length;
3350                pkt.ConstantBody.Buffer[n] =
3351                   ro_bo(bo, range->start * 32 + binding->Offset);
3352                n--;
3353             }
3354
3355             if (stage_state->push_const_size > 0) {
3356                assert(n >= 0);
3357                pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
3358                pkt.ConstantBody.Buffer[n] =
3359                   ro_bo(stage_state->push_const_bo,
3360                         stage_state->push_const_offset);
3361             }
3362 #else
3363             pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
3364             pkt.ConstantBody.Buffer[0].offset =
3365                stage_state->push_const_offset | mocs;
3366 #endif
3367          }
3368       }
3369
3370       stage_state->push_constants_dirty = false;
3371       brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
3372    }
3373 }
3374
3375 const struct brw_tracked_state genX(push_constant_packets) = {
3376    .dirty = {
3377       .mesa  = 0,
3378       .brw   = BRW_NEW_DRAW_CALL,
3379    },
3380    .emit = genX(upload_push_constant_packets),
3381 };
3382 #endif
3383
3384 #if GEN_GEN >= 6
3385 static void
3386 genX(upload_vs_push_constants)(struct brw_context *brw)
3387 {
3388    struct brw_stage_state *stage_state = &brw->vs.base;
3389
3390    /* BRW_NEW_VERTEX_PROGRAM */
3391    const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3392    /* BRW_NEW_VS_PROG_DATA */
3393    const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3394
3395    gen6_upload_push_constants(brw, vp, prog_data, stage_state);
3396 }
3397
3398 static const struct brw_tracked_state genX(vs_push_constants) = {
3399    .dirty = {
3400       .mesa  = _NEW_PROGRAM_CONSTANTS |
3401                _NEW_TRANSFORM,
3402       .brw   = BRW_NEW_BATCH |
3403                BRW_NEW_BLORP |
3404                BRW_NEW_VERTEX_PROGRAM |
3405                BRW_NEW_VS_PROG_DATA,
3406    },
3407    .emit = genX(upload_vs_push_constants),
3408 };
3409
3410 static void
3411 genX(upload_gs_push_constants)(struct brw_context *brw)
3412 {
3413    struct brw_stage_state *stage_state = &brw->gs.base;
3414
3415    /* BRW_NEW_GEOMETRY_PROGRAM */
3416    const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3417
3418    /* BRW_NEW_GS_PROG_DATA */
3419    struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3420
3421    gen6_upload_push_constants(brw, gp, prog_data, stage_state);
3422 }
3423
3424 static const struct brw_tracked_state genX(gs_push_constants) = {
3425    .dirty = {
3426       .mesa  = _NEW_PROGRAM_CONSTANTS |
3427                _NEW_TRANSFORM,
3428       .brw   = BRW_NEW_BATCH |
3429                BRW_NEW_BLORP |
3430                BRW_NEW_GEOMETRY_PROGRAM |
3431                BRW_NEW_GS_PROG_DATA,
3432    },
3433    .emit = genX(upload_gs_push_constants),
3434 };
3435
3436 static void
3437 genX(upload_wm_push_constants)(struct brw_context *brw)
3438 {
3439    struct brw_stage_state *stage_state = &brw->wm.base;
3440    /* BRW_NEW_FRAGMENT_PROGRAM */
3441    const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3442    /* BRW_NEW_FS_PROG_DATA */
3443    const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3444
3445    gen6_upload_push_constants(brw, fp, prog_data, stage_state);
3446 }
3447
3448 static const struct brw_tracked_state genX(wm_push_constants) = {
3449    .dirty = {
3450       .mesa  = _NEW_PROGRAM_CONSTANTS,
3451       .brw   = BRW_NEW_BATCH |
3452                BRW_NEW_BLORP |
3453                BRW_NEW_FRAGMENT_PROGRAM |
3454                BRW_NEW_FS_PROG_DATA,
3455    },
3456    .emit = genX(upload_wm_push_constants),
3457 };
3458 #endif
3459
3460 /* ---------------------------------------------------------------------- */
3461
3462 #if GEN_GEN >= 6
3463 static unsigned
3464 genX(determine_sample_mask)(struct brw_context *brw)
3465 {
3466    struct gl_context *ctx = &brw->ctx;
3467    float coverage = 1.0f;
3468    float coverage_invert = false;
3469    unsigned sample_mask = ~0u;
3470
3471    /* BRW_NEW_NUM_SAMPLES */
3472    unsigned num_samples = brw->num_samples;
3473
3474    if (_mesa_is_multisample_enabled(ctx)) {
3475       if (ctx->Multisample.SampleCoverage) {
3476          coverage = ctx->Multisample.SampleCoverageValue;
3477          coverage_invert = ctx->Multisample.SampleCoverageInvert;
3478       }
3479       if (ctx->Multisample.SampleMask) {
3480          sample_mask = ctx->Multisample.SampleMaskValue;
3481       }
3482    }
3483
3484    if (num_samples > 1) {
3485       int coverage_int = (int) (num_samples * coverage + 0.5f);
3486       uint32_t coverage_bits = (1 << coverage_int) - 1;
3487       if (coverage_invert)
3488          coverage_bits ^= (1 << num_samples) - 1;
3489       return coverage_bits & sample_mask;
3490    } else {
3491       return 1;
3492    }
3493 }
3494
3495 static void
3496 genX(emit_3dstate_multisample2)(struct brw_context *brw,
3497                                 unsigned num_samples)
3498 {
3499    unsigned log2_samples = ffs(num_samples) - 1;
3500
3501    brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
3502       multi.PixelLocation = CENTER;
3503       multi.NumberofMultisamples = log2_samples;
3504 #if GEN_GEN == 6
3505       GEN_SAMPLE_POS_4X(multi.Sample);
3506 #elif GEN_GEN == 7
3507       switch (num_samples) {
3508       case 1:
3509          GEN_SAMPLE_POS_1X(multi.Sample);
3510          break;
3511       case 2:
3512          GEN_SAMPLE_POS_2X(multi.Sample);
3513          break;
3514       case 4:
3515          GEN_SAMPLE_POS_4X(multi.Sample);
3516          break;
3517       case 8:
3518          GEN_SAMPLE_POS_8X(multi.Sample);
3519          break;
3520       default:
3521          break;
3522       }
3523 #endif
3524    }
3525 }
3526
3527 static void
3528 genX(upload_multisample_state)(struct brw_context *brw)
3529 {
3530    assert(brw->num_samples > 0 && brw->num_samples <= 16);
3531
3532    genX(emit_3dstate_multisample2)(brw, brw->num_samples);
3533
3534    brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
3535       sm.SampleMask = genX(determine_sample_mask)(brw);
3536    }
3537 }
3538
3539 static const struct brw_tracked_state genX(multisample_state) = {
3540    .dirty = {
3541       .mesa = _NEW_MULTISAMPLE |
3542               (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
3543       .brw = BRW_NEW_BLORP |
3544              BRW_NEW_CONTEXT |
3545              BRW_NEW_NUM_SAMPLES,
3546    },
3547    .emit = genX(upload_multisample_state)
3548 };
3549 #endif
3550
3551 /* ---------------------------------------------------------------------- */
3552
3553 static void
3554 genX(upload_color_calc_state)(struct brw_context *brw)
3555 {
3556    struct gl_context *ctx = &brw->ctx;
3557
3558    brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
3559 #if GEN_GEN <= 5
3560       cc.IndependentAlphaBlendEnable =
3561          set_blend_entry_bits(brw, &cc, 0, false);
3562       set_depth_stencil_bits(brw, &cc);
3563
3564       if (ctx->Color.AlphaEnabled &&
3565           ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
3566          cc.AlphaTestEnable = true;
3567          cc.AlphaTestFunction =
3568             intel_translate_compare_func(ctx->Color.AlphaFunc);
3569       }
3570
3571       cc.ColorDitherEnable = ctx->Color.DitherFlag;
3572
3573       cc.StatisticsEnable = brw->stats_wm;
3574
3575       cc.CCViewportStatePointer =
3576          ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
3577 #else
3578       /* _NEW_COLOR */
3579       cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
3580       cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
3581       cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
3582       cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
3583
3584 #if GEN_GEN < 9
3585       /* _NEW_STENCIL */
3586       cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
3587       cc.BackfaceStencilReferenceValue =
3588          _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
3589 #endif
3590
3591 #endif
3592
3593       /* _NEW_COLOR */
3594       UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
3595                                ctx->Color.AlphaRef);
3596    }
3597
3598 #if GEN_GEN >= 6
3599    brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3600       ptr.ColorCalcStatePointer = brw->cc.state_offset;
3601 #if GEN_GEN != 7
3602       ptr.ColorCalcStatePointerValid = true;
3603 #endif
3604    }
3605 #else
3606    brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
3607 #endif
3608 }
3609
3610 static const struct brw_tracked_state genX(color_calc_state) = {
3611    .dirty = {
3612       .mesa = _NEW_COLOR |
3613               _NEW_STENCIL |
3614               (GEN_GEN <= 5 ? _NEW_BUFFERS |
3615                               _NEW_DEPTH
3616                             : 0),
3617       .brw = BRW_NEW_BATCH |
3618              BRW_NEW_BLORP |
3619              (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
3620                              BRW_NEW_STATS_WM
3621                            : BRW_NEW_CC_STATE |
3622                              BRW_NEW_STATE_BASE_ADDRESS),
3623    },
3624    .emit = genX(upload_color_calc_state),
3625 };
3626
3627
3628 /* ---------------------------------------------------------------------- */
3629
3630 #if GEN_GEN >= 7
3631 static void
3632 genX(upload_sbe)(struct brw_context *brw)
3633 {
3634    struct gl_context *ctx = &brw->ctx;
3635    /* BRW_NEW_FRAGMENT_PROGRAM */
3636    UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3637    /* BRW_NEW_FS_PROG_DATA */
3638    const struct brw_wm_prog_data *wm_prog_data =
3639       brw_wm_prog_data(brw->wm.base.prog_data);
3640 #if GEN_GEN >= 8
3641    struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3642 #else
3643 #define attr_overrides sbe.Attribute
3644 #endif
3645    uint32_t urb_entry_read_length;
3646    uint32_t urb_entry_read_offset;
3647    uint32_t point_sprite_enables;
3648
3649    brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3650       sbe.AttributeSwizzleEnable = true;
3651       sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3652
3653       /* _NEW_BUFFERS */
3654       bool flip_y = ctx->DrawBuffer->FlipY;
3655
3656       /* _NEW_POINT
3657        *
3658        * Window coordinates in an FBO are inverted, which means point
3659        * sprite origin must be inverted.
3660        */
3661       if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
3662          sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3663       else
3664          sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3665
3666       /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3667        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3668        * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3669        * BRW_NEW_VUE_MAP_GEOM_OUT
3670        */
3671       genX(calculate_attr_overrides)(brw,
3672                                      attr_overrides,
3673                                      &point_sprite_enables,
3674                                      &urb_entry_read_length,
3675                                      &urb_entry_read_offset);
3676
3677       /* Typically, the URB entry read length and offset should be programmed
3678        * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3679        * stage which produces geometry.  However, we don't know the proper
3680        * value until we call calculate_attr_overrides().
3681        *
3682        * To fit with our existing code, we override the inherited values and
3683        * specify it here directly, as we did on previous generations.
3684        */
3685       sbe.VertexURBEntryReadLength = urb_entry_read_length;
3686       sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3687       sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3688       sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3689
3690 #if GEN_GEN >= 8
3691       sbe.ForceVertexURBEntryReadLength = true;
3692       sbe.ForceVertexURBEntryReadOffset = true;
3693 #endif
3694
3695 #if GEN_GEN >= 9
3696       /* prepare the active component dwords */
3697       for (int i = 0; i < 32; i++)
3698          sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
3699 #endif
3700    }
3701
3702 #if GEN_GEN >= 8
3703    brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3704       for (int i = 0; i < 16; i++)
3705          sbes.Attribute[i] = attr_overrides[i];
3706    }
3707 #endif
3708
3709 #undef attr_overrides
3710 }
3711
3712 static const struct brw_tracked_state genX(sbe_state) = {
3713    .dirty = {
3714       .mesa  = _NEW_BUFFERS |
3715                _NEW_LIGHT |
3716                _NEW_POINT |
3717                _NEW_POLYGON |
3718                _NEW_PROGRAM,
3719       .brw   = BRW_NEW_BLORP |
3720                BRW_NEW_CONTEXT |
3721                BRW_NEW_FRAGMENT_PROGRAM |
3722                BRW_NEW_FS_PROG_DATA |
3723                BRW_NEW_GS_PROG_DATA |
3724                BRW_NEW_TES_PROG_DATA |
3725                BRW_NEW_VUE_MAP_GEOM_OUT |
3726                (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
3727                              : 0),
3728    },
3729    .emit = genX(upload_sbe),
3730 };
3731 #endif
3732
3733 /* ---------------------------------------------------------------------- */
3734
3735 #if GEN_GEN >= 7
3736 /**
3737  * Outputs the 3DSTATE_SO_DECL_LIST command.
3738  *
3739  * The data output is a series of 64-bit entries containing a SO_DECL per
3740  * stream.  We only have one stream of rendering coming out of the GS unit, so
3741  * we only emit stream 0 (low 16 bits) SO_DECLs.
3742  */
3743 static void
3744 genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3745                                   const struct brw_vue_map *vue_map)
3746 {
3747    struct gl_context *ctx = &brw->ctx;
3748    /* BRW_NEW_TRANSFORM_FEEDBACK */
3749    struct gl_transform_feedback_object *xfb_obj =
3750       ctx->TransformFeedback.CurrentObject;
3751    const struct gl_transform_feedback_info *linked_xfb_info =
3752       xfb_obj->program->sh.LinkedTransformFeedback;
3753    struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3754    int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3755    int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3756    int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3757    int max_decls = 0;
3758    STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3759
3760    memset(so_decl, 0, sizeof(so_decl));
3761
3762    /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3763     * command feels strange -- each dword pair contains a SO_DECL per stream.
3764     */
3765    for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3766       const struct gl_transform_feedback_output *output =
3767          &linked_xfb_info->Outputs[i];
3768       const int buffer = output->OutputBuffer;
3769       const int varying = output->OutputRegister;
3770       const unsigned stream_id = output->StreamId;
3771       assert(stream_id < MAX_VERTEX_STREAMS);
3772
3773       buffer_mask[stream_id] |= 1 << buffer;
3774
3775       assert(vue_map->varying_to_slot[varying] >= 0);
3776
3777       /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3778        * array.  Instead, it simply increments DstOffset for the following
3779        * input by the number of components that should be skipped.
3780        *
3781        * Our hardware is unusual in that it requires us to program SO_DECLs
3782        * for fake "hole" components, rather than simply taking the offset
3783        * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3784        * program as many size = 4 holes as we can, then a final hole to
3785        * accommodate the final 1, 2, or 3 remaining.
3786        */
3787       int skip_components = output->DstOffset - next_offset[buffer];
3788
3789       while (skip_components > 0) {
3790          so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3791             .HoleFlag = 1,
3792             .OutputBufferSlot = output->OutputBuffer,
3793             .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3794          };
3795          skip_components -= 4;
3796       }
3797
3798       next_offset[buffer] = output->DstOffset + output->NumComponents;
3799
3800       so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3801          .OutputBufferSlot = output->OutputBuffer,
3802          .RegisterIndex = vue_map->varying_to_slot[varying],
3803          .ComponentMask =
3804             ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3805       };
3806
3807       if (decls[stream_id] > max_decls)
3808          max_decls = decls[stream_id];
3809    }
3810
3811    uint32_t *dw;
3812    dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3813                         .StreamtoBufferSelects0 = buffer_mask[0],
3814                         .StreamtoBufferSelects1 = buffer_mask[1],
3815                         .StreamtoBufferSelects2 = buffer_mask[2],
3816                         .StreamtoBufferSelects3 = buffer_mask[3],
3817                         .NumEntries0 = decls[0],
3818                         .NumEntries1 = decls[1],
3819                         .NumEntries2 = decls[2],
3820                         .NumEntries3 = decls[3]);
3821
3822    for (int i = 0; i < max_decls; i++) {
3823       GENX(SO_DECL_ENTRY_pack)(
3824          brw, dw + 2 + i * 2,
3825          &(struct GENX(SO_DECL_ENTRY)) {
3826             .Stream0Decl = so_decl[0][i],
3827             .Stream1Decl = so_decl[1][i],
3828             .Stream2Decl = so_decl[2][i],
3829             .Stream3Decl = so_decl[3][i],
3830          });
3831    }
3832 }
3833
3834 static void
3835 genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3836 {
3837    struct gl_context *ctx = &brw->ctx;
3838    /* BRW_NEW_TRANSFORM_FEEDBACK */
3839    struct gl_transform_feedback_object *xfb_obj =
3840       ctx->TransformFeedback.CurrentObject;
3841 #if GEN_GEN < 8
3842    const struct gl_transform_feedback_info *linked_xfb_info =
3843       xfb_obj->program->sh.LinkedTransformFeedback;
3844 #else
3845    struct brw_transform_feedback_object *brw_obj =
3846       (struct brw_transform_feedback_object *) xfb_obj;
3847    uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3848 #endif
3849
3850    /* Set up the up to 4 output buffers.  These are the ranges defined in the
3851     * gl_transform_feedback_object.
3852     */
3853    for (int i = 0; i < 4; i++) {
3854       struct intel_buffer_object *bufferobj =
3855          intel_buffer_object(xfb_obj->Buffers[i]);
3856       uint32_t start = xfb_obj->Offset[i];
3857       uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3858       uint32_t const size = end - start;
3859
3860       if (!bufferobj || !size) {
3861          brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3862             sob.SOBufferIndex = i;
3863          }
3864          continue;
3865       }
3866
3867       assert(start % 4 == 0);
3868       struct brw_bo *bo =
3869          intel_bufferobj_buffer(brw, bufferobj, start, size, true);
3870       assert(end <= bo->size);
3871
3872       brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3873          sob.SOBufferIndex = i;
3874
3875          sob.SurfaceBaseAddress = rw_bo(bo, start);
3876 #if GEN_GEN < 8
3877          sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3878          sob.SurfaceEndAddress = rw_bo(bo, end);
3879 #else
3880          sob.SOBufferEnable = true;
3881          sob.StreamOffsetWriteEnable = true;
3882          sob.StreamOutputBufferOffsetAddressEnable = true;
3883          sob.MOCS = mocs_wb;
3884
3885          sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
3886          sob.StreamOutputBufferOffsetAddress =
3887             rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3888
3889          if (brw_obj->zero_offsets) {
3890             /* Zero out the offset and write that to offset_bo */
3891             sob.StreamOffset = 0;
3892          } else {
3893             /* Use offset_bo as the "Stream Offset." */
3894             sob.StreamOffset = 0xFFFFFFFF;
3895          }
3896 #endif
3897       }
3898    }
3899
3900 #if GEN_GEN >= 8
3901    brw_obj->zero_offsets = false;
3902 #endif
3903 }
3904
3905 static bool
3906 query_active(struct gl_query_object *q)
3907 {
3908    return q && q->Active;
3909 }
3910
3911 static void
3912 genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3913                                const struct brw_vue_map *vue_map)
3914 {
3915    struct gl_context *ctx = &brw->ctx;
3916    /* BRW_NEW_TRANSFORM_FEEDBACK */
3917    struct gl_transform_feedback_object *xfb_obj =
3918       ctx->TransformFeedback.CurrentObject;
3919
3920    brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3921       if (active) {
3922          int urb_entry_read_offset = 0;
3923          int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3924             urb_entry_read_offset;
3925
3926          sos.SOFunctionEnable = true;
3927          sos.SOStatisticsEnable = true;
3928
3929          /* BRW_NEW_RASTERIZER_DISCARD */
3930          if (ctx->RasterDiscard) {
3931             if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3932                sos.RenderingDisable = true;
3933             } else {
3934                perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3935                           "query active relies on the clipper.\n");
3936             }
3937          }
3938
3939          /* _NEW_LIGHT */
3940          if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3941             sos.ReorderMode = TRAILING;
3942
3943 #if GEN_GEN < 8
3944          sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3945          sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3946          sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3947          sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3948 #else
3949          const struct gl_transform_feedback_info *linked_xfb_info =
3950             xfb_obj->program->sh.LinkedTransformFeedback;
3951          /* Set buffer pitches; 0 means unbound. */
3952          if (xfb_obj->Buffers[0])
3953             sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3954          if (xfb_obj->Buffers[1])
3955             sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3956          if (xfb_obj->Buffers[2])
3957             sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3958          if (xfb_obj->Buffers[3])
3959             sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3960 #endif
3961
3962          /* We always read the whole vertex.  This could be reduced at some
3963           * point by reading less and offsetting the register index in the
3964           * SO_DECLs.
3965           */
3966          sos.Stream0VertexReadOffset = urb_entry_read_offset;
3967          sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3968          sos.Stream1VertexReadOffset = urb_entry_read_offset;
3969          sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3970          sos.Stream2VertexReadOffset = urb_entry_read_offset;
3971          sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3972          sos.Stream3VertexReadOffset = urb_entry_read_offset;
3973          sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3974       }
3975    }
3976 }
3977
3978 static void
3979 genX(upload_sol)(struct brw_context *brw)
3980 {
3981    struct gl_context *ctx = &brw->ctx;
3982    /* BRW_NEW_TRANSFORM_FEEDBACK */
3983    bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3984
3985    if (active) {
3986       genX(upload_3dstate_so_buffers)(brw);
3987
3988       /* BRW_NEW_VUE_MAP_GEOM_OUT */
3989       genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3990    }
3991
3992    /* Finally, set up the SOL stage.  This command must always follow updates to
3993     * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3994     * MMIO register updates (current performed by the kernel at each batch
3995     * emit).
3996     */
3997    genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3998 }
3999
4000 static const struct brw_tracked_state genX(sol_state) = {
4001    .dirty = {
4002       .mesa  = _NEW_LIGHT,
4003       .brw   = BRW_NEW_BATCH |
4004                BRW_NEW_BLORP |
4005                BRW_NEW_RASTERIZER_DISCARD |
4006                BRW_NEW_VUE_MAP_GEOM_OUT |
4007                BRW_NEW_TRANSFORM_FEEDBACK,
4008    },
4009    .emit = genX(upload_sol),
4010 };
4011 #endif
4012
4013 /* ---------------------------------------------------------------------- */
4014
4015 #if GEN_GEN >= 7
4016 static void
4017 genX(upload_ps)(struct brw_context *brw)
4018 {
4019    UNUSED const struct gl_context *ctx = &brw->ctx;
4020    UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
4021
4022    /* BRW_NEW_FS_PROG_DATA */
4023    const struct brw_wm_prog_data *prog_data =
4024       brw_wm_prog_data(brw->wm.base.prog_data);
4025    const struct brw_stage_state *stage_state = &brw->wm.base;
4026
4027 #if GEN_GEN < 8
4028 #endif
4029
4030    brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
4031       /* Initialize the execution mask with VMask.  Otherwise, derivatives are
4032        * incorrect for subspans where some of the pixels are unlit.  We believe
4033        * the bit just didn't take effect in previous generations.
4034        */
4035       ps.VectorMaskEnable = GEN_GEN >= 8;
4036
4037       /* WA_1606682166:
4038        * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
4039        * Disable the Sampler state prefetch functionality in the SARB by
4040        * programming 0xB000[30] to '1'."
4041        */
4042       ps.SamplerCount = GEN_GEN == 11 ?
4043          0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
4044
4045       /* BRW_NEW_FS_PROG_DATA */
4046       /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable
4047        * prefetching of binding tables in A0 and B0 steppings.
4048        * TODO: Revisit this workaround on C0 stepping.
4049        */
4050       ps.BindingTableEntryCount = GEN_GEN == 11 ?
4051                                   0 :
4052                                   prog_data->base.binding_table.size_bytes / 4;
4053
4054       if (prog_data->base.use_alt_mode)
4055          ps.FloatingPointMode = Alternate;
4056
4057       /* Haswell requires the sample mask to be set in this packet as well as
4058        * in 3DSTATE_SAMPLE_MASK; the values should match.
4059        */
4060
4061       /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
4062 #if GEN_IS_HASWELL
4063       ps.SampleMask = genX(determine_sample_mask(brw));
4064 #endif
4065
4066       /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
4067        * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is
4068        * k, it implies 2(k+1) threads. It implicitly scales for different GT
4069        * levels (which have some # of PSDs).
4070        *
4071        * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
4072        */
4073 #if GEN_GEN >= 9
4074       ps.MaximumNumberofThreadsPerPSD = 64 - 1;
4075 #elif GEN_GEN >= 8
4076       ps.MaximumNumberofThreadsPerPSD = 64 - 2;
4077 #else
4078       ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
4079 #endif
4080
4081       if (prog_data->base.nr_params > 0 ||
4082           prog_data->base.ubo_ranges[0].length > 0)
4083          ps.PushConstantEnable = true;
4084
4085 #if GEN_GEN < 8
4086       /* From the IVB PRM, volume 2 part 1, page 287:
4087        * "This bit is inserted in the PS payload header and made available to
4088        * the DataPort (either via the message header or via header bypass) to
4089        * indicate that oMask data (one or two phases) is included in Render
4090        * Target Write messages. If present, the oMask data is used to mask off
4091        * samples."
4092        */
4093       ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4094
4095       /* The hardware wedges if you have this bit set but don't turn on any
4096        * dual source blend factors.
4097        *
4098        * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
4099        */
4100       ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
4101                                  (ctx->Color.BlendEnabled & 1) &&
4102                                  ctx->Color.Blend[0]._UsesDualSrc;
4103
4104       /* BRW_NEW_FS_PROG_DATA */
4105       ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
4106 #endif
4107
4108       /* From the documentation for this packet:
4109        * "If the PS kernel does not need the Position XY Offsets to
4110        *  compute a Position Value, then this field should be programmed
4111        *  to POSOFFSET_NONE."
4112        *
4113        * "SW Recommendation: If the PS kernel needs the Position Offsets
4114        *  to compute a Position XY value, this field should match Position
4115        *  ZW Interpolation Mode to ensure a consistent position.xyzw
4116        *  computation."
4117        *
4118        * We only require XY sample offsets. So, this recommendation doesn't
4119        * look useful at the moment. We might need this in future.
4120        */
4121       if (prog_data->uses_pos_offset)
4122          ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
4123       else
4124          ps.PositionXYOffsetSelect = POSOFFSET_NONE;
4125
4126       ps._8PixelDispatchEnable = prog_data->dispatch_8;
4127       ps._16PixelDispatchEnable = prog_data->dispatch_16;
4128       ps._32PixelDispatchEnable = prog_data->dispatch_32;
4129
4130       /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
4131        *
4132        *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
4133        *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
4134        *
4135        * Since 16x MSAA is first introduced on SKL, we don't need to apply
4136        * the workaround on any older hardware.
4137        *
4138        * BRW_NEW_NUM_SAMPLES
4139        */
4140       if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
4141           brw->num_samples == 16) {
4142          assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
4143          ps._32PixelDispatchEnable = false;
4144       }
4145
4146       ps.DispatchGRFStartRegisterForConstantSetupData0 =
4147          brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
4148       ps.DispatchGRFStartRegisterForConstantSetupData1 =
4149          brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
4150       ps.DispatchGRFStartRegisterForConstantSetupData2 =
4151          brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
4152
4153       ps.KernelStartPointer0 = stage_state->prog_offset +
4154                                brw_wm_prog_data_prog_offset(prog_data, ps, 0);
4155       ps.KernelStartPointer1 = stage_state->prog_offset +
4156                                brw_wm_prog_data_prog_offset(prog_data, ps, 1);
4157       ps.KernelStartPointer2 = stage_state->prog_offset +
4158                                brw_wm_prog_data_prog_offset(prog_data, ps, 2);
4159
4160       if (prog_data->base.total_scratch) {
4161          ps.ScratchSpaceBasePointer =
4162             rw_32_bo(stage_state->scratch_bo,
4163                      ffs(stage_state->per_thread_scratch) - 11);
4164       }
4165    }
4166 }
4167
4168 static const struct brw_tracked_state genX(ps_state) = {
4169    .dirty = {
4170       .mesa  = _NEW_MULTISAMPLE |
4171                (GEN_GEN < 8 ? _NEW_BUFFERS |
4172                               _NEW_COLOR
4173                             : 0),
4174       .brw   = BRW_NEW_BATCH |
4175                BRW_NEW_BLORP |
4176                BRW_NEW_FS_PROG_DATA |
4177                (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
4178    },
4179    .emit = genX(upload_ps),
4180 };
4181 #endif
4182
4183 /* ---------------------------------------------------------------------- */
4184
4185 #if GEN_GEN >= 7
4186 static void
4187 genX(upload_hs_state)(struct brw_context *brw)
4188 {
4189    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4190    struct brw_stage_state *stage_state = &brw->tcs.base;
4191    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4192    const struct brw_vue_prog_data *vue_prog_data =
4193       brw_vue_prog_data(stage_prog_data);
4194
4195    /* BRW_NEW_TES_PROG_DATA */
4196    struct brw_tcs_prog_data *tcs_prog_data =
4197       brw_tcs_prog_data(stage_prog_data);
4198
4199    if (!tcs_prog_data) {
4200       brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
4201    } else {
4202       brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
4203          INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
4204
4205          hs.InstanceCount = tcs_prog_data->instances - 1;
4206          hs.IncludeVertexHandles = true;
4207
4208          hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
4209       }
4210    }
4211 }
4212
4213 static const struct brw_tracked_state genX(hs_state) = {
4214    .dirty = {
4215       .mesa  = 0,
4216       .brw   = BRW_NEW_BATCH |
4217                BRW_NEW_BLORP |
4218                BRW_NEW_TCS_PROG_DATA |
4219                BRW_NEW_TESS_PROGRAMS,
4220    },
4221    .emit = genX(upload_hs_state),
4222 };
4223
4224 static void
4225 genX(upload_ds_state)(struct brw_context *brw)
4226 {
4227    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4228    const struct brw_stage_state *stage_state = &brw->tes.base;
4229    struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4230
4231    /* BRW_NEW_TES_PROG_DATA */
4232    const struct brw_tes_prog_data *tes_prog_data =
4233       brw_tes_prog_data(stage_prog_data);
4234    const struct brw_vue_prog_data *vue_prog_data =
4235       brw_vue_prog_data(stage_prog_data);
4236
4237    if (!tes_prog_data) {
4238       brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
4239    } else {
4240       assert(GEN_GEN < 11 ||
4241              vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
4242
4243       brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
4244          INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
4245
4246         ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
4247         ds.ComputeWCoordinateEnable =
4248            tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
4249
4250 #if GEN_GEN >= 8
4251         if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
4252            ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
4253         ds.UserClipDistanceCullTestEnableBitmask =
4254             vue_prog_data->cull_distance_mask;
4255 #endif
4256       }
4257    }
4258 }
4259
4260 static const struct brw_tracked_state genX(ds_state) = {
4261    .dirty = {
4262       .mesa  = 0,
4263       .brw   = BRW_NEW_BATCH |
4264                BRW_NEW_BLORP |
4265                BRW_NEW_TESS_PROGRAMS |
4266                BRW_NEW_TES_PROG_DATA,
4267    },
4268    .emit = genX(upload_ds_state),
4269 };
4270
4271 /* ---------------------------------------------------------------------- */
4272
4273 static void
4274 upload_te_state(struct brw_context *brw)
4275 {
4276    /* BRW_NEW_TESS_PROGRAMS */
4277    bool active = brw->programs[MESA_SHADER_TESS_EVAL];
4278
4279    /* BRW_NEW_TES_PROG_DATA */
4280    const struct brw_tes_prog_data *tes_prog_data =
4281       brw_tes_prog_data(brw->tes.base.prog_data);
4282
4283    if (active) {
4284       brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4285          te.Partitioning = tes_prog_data->partitioning;
4286          te.OutputTopology = tes_prog_data->output_topology;
4287          te.TEDomain = tes_prog_data->domain;
4288          te.TEEnable = true;
4289          te.MaximumTessellationFactorOdd = 63.0;
4290          te.MaximumTessellationFactorNotOdd = 64.0;
4291       }
4292    } else {
4293       brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4294    }
4295 }
4296
4297 static const struct brw_tracked_state genX(te_state) = {
4298    .dirty = {
4299       .mesa  = 0,
4300       .brw   = BRW_NEW_BLORP |
4301                BRW_NEW_CONTEXT |
4302                BRW_NEW_TES_PROG_DATA |
4303                BRW_NEW_TESS_PROGRAMS,
4304    },
4305    .emit = upload_te_state,
4306 };
4307
4308 /* ---------------------------------------------------------------------- */
4309
4310 static void
4311 genX(upload_tes_push_constants)(struct brw_context *brw)
4312 {
4313    struct brw_stage_state *stage_state = &brw->tes.base;
4314    /* BRW_NEW_TESS_PROGRAMS */
4315    const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4316
4317    /* BRW_NEW_TES_PROG_DATA */
4318    const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4319    gen6_upload_push_constants(brw, tep, prog_data, stage_state);
4320 }
4321
4322 static const struct brw_tracked_state genX(tes_push_constants) = {
4323    .dirty = {
4324       .mesa  = _NEW_PROGRAM_CONSTANTS,
4325       .brw   = BRW_NEW_BATCH |
4326                BRW_NEW_BLORP |
4327                BRW_NEW_TESS_PROGRAMS |
4328                BRW_NEW_TES_PROG_DATA,
4329    },
4330    .emit = genX(upload_tes_push_constants),
4331 };
4332
4333 static void
4334 genX(upload_tcs_push_constants)(struct brw_context *brw)
4335 {
4336    struct brw_stage_state *stage_state = &brw->tcs.base;
4337    /* BRW_NEW_TESS_PROGRAMS */
4338    const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4339
4340    /* BRW_NEW_TCS_PROG_DATA */
4341    const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4342
4343    gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
4344 }
4345
4346 static const struct brw_tracked_state genX(tcs_push_constants) = {
4347    .dirty = {
4348       .mesa  = _NEW_PROGRAM_CONSTANTS,
4349       .brw   = BRW_NEW_BATCH |
4350                BRW_NEW_BLORP |
4351                BRW_NEW_DEFAULT_TESS_LEVELS |
4352                BRW_NEW_TESS_PROGRAMS |
4353                BRW_NEW_TCS_PROG_DATA,
4354    },
4355    .emit = genX(upload_tcs_push_constants),
4356 };
4357
4358 #endif
4359
4360 /* ---------------------------------------------------------------------- */
4361
4362 #if GEN_GEN >= 7
4363 static void
4364 genX(upload_cs_push_constants)(struct brw_context *brw)
4365 {
4366    struct brw_stage_state *stage_state = &brw->cs.base;
4367
4368    /* BRW_NEW_COMPUTE_PROGRAM */
4369    const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4370
4371    if (cp) {
4372       /* BRW_NEW_CS_PROG_DATA */
4373       struct brw_cs_prog_data *cs_prog_data =
4374          brw_cs_prog_data(brw->cs.base.prog_data);
4375
4376       _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4377       brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4378    }
4379 }
4380
4381 const struct brw_tracked_state genX(cs_push_constants) = {
4382    .dirty = {
4383       .mesa = _NEW_PROGRAM_CONSTANTS,
4384       .brw = BRW_NEW_BATCH |
4385              BRW_NEW_BLORP |
4386              BRW_NEW_COMPUTE_PROGRAM |
4387              BRW_NEW_CS_PROG_DATA,
4388    },
4389    .emit = genX(upload_cs_push_constants),
4390 };
4391
4392 /**
4393  * Creates a new CS constant buffer reflecting the current CS program's
4394  * constants, if needed by the CS program.
4395  */
4396 static void
4397 genX(upload_cs_pull_constants)(struct brw_context *brw)
4398 {
4399    struct brw_stage_state *stage_state = &brw->cs.base;
4400
4401    /* BRW_NEW_COMPUTE_PROGRAM */
4402    struct brw_program *cp =
4403       (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4404
4405    /* BRW_NEW_CS_PROG_DATA */
4406    const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4407
4408    _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4409    /* _NEW_PROGRAM_CONSTANTS */
4410    brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4411                              stage_state, prog_data);
4412 }
4413
4414 const struct brw_tracked_state genX(cs_pull_constants) = {
4415    .dirty = {
4416       .mesa = _NEW_PROGRAM_CONSTANTS,
4417       .brw = BRW_NEW_BATCH |
4418              BRW_NEW_BLORP |
4419              BRW_NEW_COMPUTE_PROGRAM |
4420              BRW_NEW_CS_PROG_DATA,
4421    },
4422    .emit = genX(upload_cs_pull_constants),
4423 };
4424
4425 static void
4426 genX(upload_cs_state)(struct brw_context *brw)
4427 {
4428    if (!brw->cs.base.prog_data)
4429       return;
4430
4431    uint32_t offset;
4432    uint32_t *desc = (uint32_t*) brw_state_batch(
4433       brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
4434       &offset);
4435
4436    struct brw_stage_state *stage_state = &brw->cs.base;
4437    struct brw_stage_prog_data *prog_data = stage_state->prog_data;
4438    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
4439    const struct gen_device_info *devinfo = &brw->screen->devinfo;
4440
4441    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4442       brw_emit_buffer_surface_state(
4443          brw, &stage_state->surf_offset[
4444                  prog_data->binding_table.shader_time_start],
4445          brw->shader_time.bo, 0, ISL_FORMAT_RAW,
4446          brw->shader_time.bo->size, 1,
4447          RELOC_WRITE);
4448    }
4449
4450    uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
4451                                     32, &stage_state->bind_bo_offset);
4452
4453    /* The MEDIA_VFE_STATE documentation for Gen8+ says:
4454     *
4455     * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4456     *  the only bits that are changed are scoreboard related: Scoreboard
4457     *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4458     *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
4459     *
4460     * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
4461     * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
4462     */
4463    brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
4464
4465    brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
4466       if (prog_data->total_scratch) {
4467          uint32_t per_thread_scratch_value;
4468
4469          if (GEN_GEN >= 8) {
4470             /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
4471              * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
4472              */
4473             per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
4474          } else if (GEN_IS_HASWELL) {
4475             /* Haswell's Per Thread Scratch Space is in the range [0, 10]
4476              * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
4477              */
4478             per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
4479          } else {
4480             /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
4481              * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
4482              */
4483             per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
4484          }
4485          vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
4486          vfe.PerThreadScratchSpace = per_thread_scratch_value;
4487       }
4488
4489       /* If brw->screen->subslice_total is greater than one, then
4490        * devinfo->max_cs_threads stores number of threads per sub-slice;
4491        * thus we need to multiply by that number by subslices to get
4492        * the actual maximum number of threads; the -1 is because the HW
4493        * has a bias of 1 (would not make sense to say the maximum number
4494        * of threads is 0).
4495        */
4496       const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
4497       vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
4498       vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
4499 #if GEN_GEN < 11
4500       vfe.ResetGatewayTimer =
4501          Resettingrelativetimerandlatchingtheglobaltimestamp;
4502 #endif
4503 #if GEN_GEN < 9
4504       vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
4505 #endif
4506 #if GEN_GEN == 7
4507       vfe.GPGPUMode = 1;
4508 #endif
4509
4510       /* We are uploading duplicated copies of push constant uniforms for each
4511        * thread. Although the local id data needs to vary per thread, it won't
4512        * change for other uniform data. Unfortunately this duplication is
4513        * required for gen7. As of Haswell, this duplication can be avoided,
4514        * but this older mechanism with duplicated data continues to work.
4515        *
4516        * FINISHME: As of Haswell, we could make use of the
4517        * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
4518        * field to only store one copy of uniform data.
4519        *
4520        * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
4521        * which is described in the GPGPU_WALKER command and in the Broadwell
4522        * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
4523        * Operations => GPGPU Mode => Indirect Payload Storage.
4524        *
4525        * Note: The constant data is built in brw_upload_cs_push_constants
4526        * below.
4527        */
4528       vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;
4529
4530       const uint32_t vfe_curbe_allocation =
4531          ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
4532                cs_prog_data->push.cross_thread.regs, 2);
4533       vfe.CURBEAllocationSize = vfe_curbe_allocation;
4534    }
4535
4536    if (cs_prog_data->push.total.size > 0) {
4537       brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
4538          curbe.CURBETotalDataLength =
4539             ALIGN(cs_prog_data->push.total.size, 64);
4540          curbe.CURBEDataStartAddress = stage_state->push_const_offset;
4541       }
4542    }
4543
4544    /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
4545    memcpy(bind, stage_state->surf_offset,
4546           prog_data->binding_table.size_bytes);
4547    const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
4548       .KernelStartPointer = brw->cs.base.prog_offset,
4549       .SamplerStatePointer = stage_state->sampler_offset,
4550       .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
4551       .BindingTablePointer = stage_state->bind_bo_offset,
4552       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
4553       .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
4554       .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
4555                                                prog_data->total_shared),
4556       .BarrierEnable = cs_prog_data->uses_barrier,
4557 #if GEN_GEN >= 8 || GEN_IS_HASWELL
4558       .CrossThreadConstantDataReadLength =
4559          cs_prog_data->push.cross_thread.regs,
4560 #endif
4561    };
4562
4563    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
4564
4565    brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
4566       load.InterfaceDescriptorTotalLength =
4567          GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4568       load.InterfaceDescriptorDataStartAddress = offset;
4569    }
4570 }
4571
4572 static const struct brw_tracked_state genX(cs_state) = {
4573    .dirty = {
4574       .mesa = _NEW_PROGRAM_CONSTANTS,
4575       .brw = BRW_NEW_BATCH |
4576              BRW_NEW_BLORP |
4577              BRW_NEW_CS_PROG_DATA |
4578              BRW_NEW_SAMPLER_STATE_TABLE |
4579              BRW_NEW_SURFACES,
4580    },
4581    .emit = genX(upload_cs_state)
4582 };
4583
/* Registers that prepare_indirect_gpgpu_walker() loads from the indirect
 * work-group-count buffer, one per dispatch dimension.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* MI_PREDICATE source operand registers.  prepare_indirect_gpgpu_walker()
 * addresses the upper 32 bits of each one as base + 4.
 */
#define MI_PREDICATE_SRC0  0x2400
#define MI_PREDICATE_SRC1  0x2408
4590
4591 static void
4592 prepare_indirect_gpgpu_walker(struct brw_context *brw)
4593 {
4594    GLintptr indirect_offset = brw->compute.num_work_groups_offset;
4595    struct brw_bo *bo = brw->compute.num_work_groups_bo;
4596
4597    emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0));
4598    emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4));
4599    emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8));
4600
4601 #if GEN_GEN <= 7
4602    /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
4603    emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0);
4604    emit_lri(brw, MI_PREDICATE_SRC1    , 0);
4605    emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0);
4606
4607    /* Load compute_dispatch_indirect_x_size into SRC0 */
4608    emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0));
4609
4610    /* predicate = (compute_dispatch_indirect_x_size == 0); */
4611    brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4612       mip.LoadOperation    = LOAD_LOAD;
4613       mip.CombineOperation = COMBINE_SET;
4614       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4615    }
4616
4617    /* Load compute_dispatch_indirect_y_size into SRC0 */
4618    emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4));
4619
4620    /* predicate |= (compute_dispatch_indirect_y_size == 0); */
4621    brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4622       mip.LoadOperation    = LOAD_LOAD;
4623       mip.CombineOperation = COMBINE_OR;
4624       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4625    }
4626
4627    /* Load compute_dispatch_indirect_z_size into SRC0 */
4628    emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8));
4629
4630    /* predicate |= (compute_dispatch_indirect_z_size == 0); */
4631    brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4632       mip.LoadOperation    = LOAD_LOAD;
4633       mip.CombineOperation = COMBINE_OR;
4634       mip.CompareOperation = COMPARE_SRCS_EQUAL;
4635    }
4636
4637    /* predicate = !predicate; */
4638 #define COMPARE_FALSE                           1
4639    brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4640       mip.LoadOperation    = LOAD_LOADINV;
4641       mip.CombineOperation = COMBINE_OR;
4642       mip.CompareOperation = COMPARE_FALSE;
4643    }
4644 #endif
4645 }
4646
4647 static void
4648 genX(emit_gpgpu_walker)(struct brw_context *brw)
4649 {
4650    const struct brw_cs_prog_data *prog_data =
4651       brw_cs_prog_data(brw->cs.base.prog_data);
4652
4653    const GLuint *num_groups = brw->compute.num_work_groups;
4654
4655    bool indirect = brw->compute.num_work_groups_bo != NULL;
4656    if (indirect)
4657       prepare_indirect_gpgpu_walker(brw);
4658
4659    const unsigned simd_size = prog_data->simd_size;
4660    unsigned group_size = prog_data->local_size[0] *
4661       prog_data->local_size[1] * prog_data->local_size[2];
4662
4663    uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
4664    const unsigned right_non_aligned = group_size & (simd_size - 1);
4665    if (right_non_aligned != 0)
4666       right_mask >>= (simd_size - right_non_aligned);
4667
4668    brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) {
4669       ggw.IndirectParameterEnable      = indirect;
4670       ggw.PredicateEnable              = GEN_GEN <= 7 && indirect;
4671       ggw.SIMDSize                     = prog_data->simd_size / 16;
4672       ggw.ThreadDepthCounterMaximum    = 0;
4673       ggw.ThreadHeightCounterMaximum   = 0;
4674       ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
4675       ggw.ThreadGroupIDXDimension      = num_groups[0];
4676       ggw.ThreadGroupIDYDimension      = num_groups[1];
4677       ggw.ThreadGroupIDZDimension      = num_groups[2];
4678       ggw.RightExecutionMask           = right_mask;
4679       ggw.BottomExecutionMask          = 0xffffffff;
4680    }
4681
4682    brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf);
4683 }
4684
4685 #endif
4686
4687 /* ---------------------------------------------------------------------- */
4688
4689 #if GEN_GEN >= 8
4690 static void
4691 genX(upload_raster)(struct brw_context *brw)
4692 {
4693    const struct gl_context *ctx = &brw->ctx;
4694
4695    /* _NEW_BUFFERS */
4696    const bool flip_y = ctx->DrawBuffer->FlipY;
4697
4698    /* _NEW_POLYGON */
4699    const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4700
4701    /* _NEW_POINT */
4702    const struct gl_point_attrib *point = &ctx->Point;
4703
4704    brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4705       if (brw->polygon_front_bit != flip_y)
4706          raster.FrontWinding = CounterClockwise;
4707
4708       if (polygon->CullFlag) {
4709          switch (polygon->CullFaceMode) {
4710          case GL_FRONT:
4711             raster.CullMode = CULLMODE_FRONT;
4712             break;
4713          case GL_BACK:
4714             raster.CullMode = CULLMODE_BACK;
4715             break;
4716          case GL_FRONT_AND_BACK:
4717             raster.CullMode = CULLMODE_BOTH;
4718             break;
4719          default:
4720             unreachable("not reached");
4721          }
4722       } else {
4723          raster.CullMode = CULLMODE_NONE;
4724       }
4725
4726       raster.SmoothPointEnable = point->SmoothFlag;
4727
4728       raster.DXMultisampleRasterizationEnable =
4729          _mesa_is_multisample_enabled(ctx);
4730
4731       raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4732       raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4733       raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4734
4735       switch (polygon->FrontMode) {
4736       case GL_FILL:
4737          raster.FrontFaceFillMode = FILL_MODE_SOLID;
4738          break;
4739       case GL_LINE:
4740          raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4741          break;
4742       case GL_POINT:
4743          raster.FrontFaceFillMode = FILL_MODE_POINT;
4744          break;
4745       default:
4746          unreachable("not reached");
4747       }
4748
4749       switch (polygon->BackMode) {
4750       case GL_FILL:
4751          raster.BackFaceFillMode = FILL_MODE_SOLID;
4752          break;
4753       case GL_LINE:
4754          raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4755          break;
4756       case GL_POINT:
4757          raster.BackFaceFillMode = FILL_MODE_POINT;
4758          break;
4759       default:
4760          unreachable("not reached");
4761       }
4762
4763       /* _NEW_LINE */
4764       raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4765
4766 #if GEN_GEN == 10
4767       /* _NEW_BUFFERS
4768        * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4769        */
4770       const bool multisampled_fbo =
4771          _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4772       if (multisampled_fbo)
4773          raster.AntialiasingEnable = false;
4774 #endif
4775
4776       /* _NEW_SCISSOR */
4777       raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4778
4779       /* _NEW_TRANSFORM */
4780 #if GEN_GEN < 9
4781       if (!(ctx->Transform.DepthClampNear &&
4782             ctx->Transform.DepthClampFar))
4783          raster.ViewportZClipTestEnable = true;
4784 #endif
4785
4786 #if GEN_GEN >= 9
4787       if (!ctx->Transform.DepthClampNear)
4788          raster.ViewportZNearClipTestEnable = true;
4789
4790       if (!ctx->Transform.DepthClampFar)
4791          raster.ViewportZFarClipTestEnable = true;
4792 #endif
4793
4794       /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4795 #if GEN_GEN >= 9
4796       raster.ConservativeRasterizationEnable =
4797          ctx->IntelConservativeRasterization;
4798 #endif
4799
4800       raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4801       raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4802
4803       raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4804    }
4805 }
4806
4807 static const struct brw_tracked_state genX(raster_state) = {
4808    .dirty = {
4809       .mesa  = _NEW_BUFFERS |
4810                _NEW_LINE |
4811                _NEW_MULTISAMPLE |
4812                _NEW_POINT |
4813                _NEW_POLYGON |
4814                _NEW_SCISSOR |
4815                _NEW_TRANSFORM,
4816       .brw   = BRW_NEW_BLORP |
4817                BRW_NEW_CONTEXT |
4818                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4819    },
4820    .emit = genX(upload_raster),
4821 };
4822 #endif
4823
4824 /* ---------------------------------------------------------------------- */
4825
4826 #if GEN_GEN >= 8
4827 static void
4828 genX(upload_ps_extra)(struct brw_context *brw)
4829 {
4830    UNUSED struct gl_context *ctx = &brw->ctx;
4831
4832    const struct brw_wm_prog_data *prog_data =
4833       brw_wm_prog_data(brw->wm.base.prog_data);
4834
4835    brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
4836       psx.PixelShaderValid = true;
4837       psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4838       psx.PixelShaderKillsPixel = prog_data->uses_kill;
4839       psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4840       psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4841       psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4842       psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4843
4844       /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4845       if (prog_data->uses_sample_mask) {
4846 #if GEN_GEN >= 9
4847          if (prog_data->post_depth_coverage)
4848             psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4849          else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4850             psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4851          else
4852             psx.InputCoverageMaskState = ICMS_NORMAL;
4853 #else
4854          psx.PixelShaderUsesInputCoverageMask = true;
4855 #endif
4856       }
4857
4858       psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4859 #if GEN_GEN >= 9
4860       psx.PixelShaderPullsBary = prog_data->pulls_bary;
4861       psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4862 #endif
4863
4864       /* The stricter cross-primitive coherency guarantees that the hardware
4865        * gives us with the "Accesses UAV" bit set for at least one shader stage
4866        * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4867        * are redundant within the current image, atomic counter and SSBO GL
4868        * APIs, which all have very loose ordering and coherency requirements
4869        * and generally rely on the application to insert explicit barriers when
4870        * a shader invocation is expected to see the memory writes performed by
4871        * the invocations of some previous primitive.  Regardless of the value
4872        * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4873        * cause an in most cases useless DC flush when the lowermost stage with
4874        * the bit set finishes execution.
4875        *
4876        * It would be nice to disable it, but in some cases we can't because on
4877        * Gen8+ it also has an influence on rasterization via the PS UAV-only
4878        * signal (which could be set independently from the coherency mechanism
4879        * in the 3DSTATE_WM command on Gen7), and because in some cases it will
4880        * determine whether the hardware skips execution of the fragment shader
4881        * or not via the ThreadDispatchEnable signal.  However if we know that
4882        * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4883        * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4884        * difference so we may just disable it here.
4885        *
4886        * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4887        * take into account KillPixels when no depth or stencil writes are
4888        * enabled.  In order for occlusion queries to work correctly with no
4889        * attachments, we need to force-enable here.
4890        *
4891        * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4892        * _NEW_COLOR
4893        */
4894       if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4895           !brw_color_buffer_write_enabled(brw))
4896          psx.PixelShaderHasUAV = true;
4897    }
4898 }
4899
4900 const struct brw_tracked_state genX(ps_extra) = {
4901    .dirty = {
4902       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4903       .brw   = BRW_NEW_BLORP |
4904                BRW_NEW_CONTEXT |
4905                BRW_NEW_FRAGMENT_PROGRAM |
4906                BRW_NEW_FS_PROG_DATA |
4907                BRW_NEW_CONSERVATIVE_RASTERIZATION,
4908    },
4909    .emit = genX(upload_ps_extra),
4910 };
4911 #endif
4912
4913 /* ---------------------------------------------------------------------- */
4914
4915 #if GEN_GEN >= 8
4916 static void
4917 genX(upload_ps_blend)(struct brw_context *brw)
4918 {
4919    struct gl_context *ctx = &brw->ctx;
4920
4921    /* _NEW_BUFFERS */
4922    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4923    const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4924
4925    /* _NEW_COLOR */
4926    struct gl_colorbuffer_attrib *color = &ctx->Color;
4927
4928    brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4929       /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4930       pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4931
4932       bool alpha_to_one = false;
4933
4934       if (!buffer0_is_integer) {
4935          /* _NEW_MULTISAMPLE */
4936
4937          if (_mesa_is_multisample_enabled(ctx)) {
4938             pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4939             alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4940          }
4941
4942          pb.AlphaTestEnable = color->AlphaEnabled;
4943       }
4944
4945       /* Used for implementing the following bit of GL_EXT_texture_integer:
4946        * "Per-fragment operations that require floating-point color
4947        *  components, including multisample alpha operations, alpha test,
4948        *  blending, and dithering, have no effect when the corresponding
4949        *  colors are written to an integer color buffer."
4950        *
4951        * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4952        * "If drawbuffer zero is not NONE and the buffer it references has an
4953        *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4954        *  operations are skipped."
4955        */
4956       if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4957          GLenum eqRGB = color->Blend[0].EquationRGB;
4958          GLenum eqA = color->Blend[0].EquationA;
4959          GLenum srcRGB = color->Blend[0].SrcRGB;
4960          GLenum dstRGB = color->Blend[0].DstRGB;
4961          GLenum srcA = color->Blend[0].SrcA;
4962          GLenum dstA = color->Blend[0].DstA;
4963
4964          if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4965             srcRGB = dstRGB = GL_ONE;
4966
4967          if (eqA == GL_MIN || eqA == GL_MAX)
4968             srcA = dstA = GL_ONE;
4969
4970          /* Due to hardware limitations, the destination may have information
4971           * in an alpha channel even when the format specifies no alpha
4972           * channel. In order to avoid getting any incorrect blending due to
4973           * that alpha channel, coerce the blend factors to values that will
4974           * not read the alpha channel, but will instead use the correct
4975           * implicit value for alpha.
4976           */
4977          if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4978                                             GL_TEXTURE_ALPHA_TYPE)) {
4979             srcRGB = brw_fix_xRGB_alpha(srcRGB);
4980             srcA = brw_fix_xRGB_alpha(srcA);
4981             dstRGB = brw_fix_xRGB_alpha(dstRGB);
4982             dstA = brw_fix_xRGB_alpha(dstA);
4983          }
4984
4985          /* Alpha to One doesn't work with Dual Color Blending.  Override
4986           * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4987           */
4988          if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
4989             srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4990             srcA = fix_dual_blend_alpha_to_one(srcA);
4991             dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4992             dstA = fix_dual_blend_alpha_to_one(dstA);
4993          }
4994
4995          /* BRW_NEW_FS_PROG_DATA */
4996          const struct brw_wm_prog_data *wm_prog_data =
4997             brw_wm_prog_data(brw->wm.base.prog_data);
4998
4999          /* The Dual Source Blending documentation says:
5000           *
5001           * "If SRC1 is included in a src/dst blend factor and
5002           * a DualSource RT Write message is not used, results
5003           * are UNDEFINED. (This reflects the same restriction in DX APIs,
5004           * where undefined results are produced if “o1” is not written
5005           * by a PS – there are no default values defined).
5006           * If SRC1 is not included in a src/dst blend factor,
5007           * dual source blending must be disabled."
5008           *
5009           * There is no way to gracefully fix this undefined situation
5010           * so we just disable the blending to prevent possible issues.
5011           */
5012          pb.ColorBufferBlendEnable =
5013             !color->Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
5014          pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
5015          pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
5016          pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
5017          pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
5018
5019          pb.IndependentAlphaBlendEnable =
5020             srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
5021       }
5022    }
5023 }
5024
5025 static const struct brw_tracked_state genX(ps_blend) = {
5026    .dirty = {
5027       .mesa = _NEW_BUFFERS |
5028               _NEW_COLOR |
5029               _NEW_MULTISAMPLE,
5030       .brw = BRW_NEW_BLORP |
5031              BRW_NEW_CONTEXT |
5032              BRW_NEW_FRAGMENT_PROGRAM |
5033              BRW_NEW_FS_PROG_DATA,
5034    },
5035    .emit = genX(upload_ps_blend)
5036 };
5037 #endif
5038
5039 /* ---------------------------------------------------------------------- */
5040
5041 #if GEN_GEN >= 8
5042 static void
5043 genX(emit_vf_topology)(struct brw_context *brw)
5044 {
5045    brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
5046       vftopo.PrimitiveTopologyType = brw->primitive;
5047    }
5048 }
5049
5050 static const struct brw_tracked_state genX(vf_topology) = {
5051    .dirty = {
5052       .mesa = 0,
5053       .brw = BRW_NEW_BLORP |
5054              BRW_NEW_PRIMITIVE,
5055    },
5056    .emit = genX(emit_vf_topology),
5057 };
5058 #endif
5059
5060 /* ---------------------------------------------------------------------- */
5061
5062 #if GEN_GEN >= 7
5063 static void
5064 genX(emit_mi_report_perf_count)(struct brw_context *brw,
5065                                 struct brw_bo *bo,
5066                                 uint32_t offset_in_bytes,
5067                                 uint32_t report_id)
5068 {
5069    brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
5070       mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
5071       mi_rpc.ReportID = report_id;
5072    }
5073 }
5074 #endif
5075
5076 /* ---------------------------------------------------------------------- */
5077
5078 /**
5079  * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
5080  */
5081 static void
5082 genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw,
5083                                      MAYBE_UNUSED struct brw_stage_state *stage_state)
5084 {
5085 #if GEN_GEN >= 7
5086    static const uint16_t packet_headers[] = {
5087       [MESA_SHADER_VERTEX] = 43,
5088       [MESA_SHADER_TESS_CTRL] = 44,
5089       [MESA_SHADER_TESS_EVAL] = 45,
5090       [MESA_SHADER_GEOMETRY] = 46,
5091       [MESA_SHADER_FRAGMENT] = 47,
5092    };
5093
5094    /* Ivybridge requires a workaround flush before VS packets. */
5095    if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
5096        stage_state->stage == MESA_SHADER_VERTEX) {
5097       gen7_emit_vs_workaround_flush(brw);
5098    }
5099
5100    brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
5101       ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
5102       ptr.PointertoVSSamplerState = stage_state->sampler_offset;
5103    }
5104 #endif
5105 }
5106
5107 UNUSED static bool
5108 has_component(mesa_format format, int i)
5109 {
5110    if (_mesa_is_format_color_format(format))
5111       return _mesa_format_has_color_component(format, i);
5112
5113    /* depth and stencil have only one component */
5114    return i == 0;
5115 }
5116
5117 /**
5118  * Upload SAMPLER_BORDER_COLOR_STATE.
5119  */
5120 static void
5121 genX(upload_default_color)(struct brw_context *brw,
5122                            const struct gl_sampler_object *sampler,
5123                            MAYBE_UNUSED mesa_format format, GLenum base_format,
5124                            bool is_integer_format, bool is_stencil_sampling,
5125                            uint32_t *sdc_offset)
5126 {
5127    union gl_color_union color;
5128
5129    switch (base_format) {
5130    case GL_DEPTH_COMPONENT:
5131       /* GL specs that border color for depth textures is taken from the
5132        * R channel, while the hardware uses A.  Spam R into all the
5133        * channels for safety.
5134        */
5135       color.ui[0] = sampler->BorderColor.ui[0];
5136       color.ui[1] = sampler->BorderColor.ui[0];
5137       color.ui[2] = sampler->BorderColor.ui[0];
5138       color.ui[3] = sampler->BorderColor.ui[0];
5139       break;
5140    case GL_ALPHA:
5141       color.ui[0] = 0u;
5142       color.ui[1] = 0u;
5143       color.ui[2] = 0u;
5144       color.ui[3] = sampler->BorderColor.ui[3];
5145       break;
5146    case GL_INTENSITY:
5147       color.ui[0] = sampler->BorderColor.ui[0];
5148       color.ui[1] = sampler->BorderColor.ui[0];
5149       color.ui[2] = sampler->BorderColor.ui[0];
5150       color.ui[3] = sampler->BorderColor.ui[0];
5151       break;
5152    case GL_LUMINANCE:
5153       color.ui[0] = sampler->BorderColor.ui[0];
5154       color.ui[1] = sampler->BorderColor.ui[0];
5155       color.ui[2] = sampler->BorderColor.ui[0];
5156       color.ui[3] = float_as_int(1.0);
5157       break;
5158    case GL_LUMINANCE_ALPHA:
5159       color.ui[0] = sampler->BorderColor.ui[0];
5160       color.ui[1] = sampler->BorderColor.ui[0];
5161       color.ui[2] = sampler->BorderColor.ui[0];
5162       color.ui[3] = sampler->BorderColor.ui[3];
5163       break;
5164    default:
5165       color.ui[0] = sampler->BorderColor.ui[0];
5166       color.ui[1] = sampler->BorderColor.ui[1];
5167       color.ui[2] = sampler->BorderColor.ui[2];
5168       color.ui[3] = sampler->BorderColor.ui[3];
5169       break;
5170    }
5171
5172    /* In some cases we use an RGBA surface format for GL RGB textures,
5173     * where we've initialized the A channel to 1.0.  We also have to set
5174     * the border color alpha to 1.0 in that case.
5175     */
5176    if (base_format == GL_RGB)
5177       color.ui[3] = float_as_int(1.0);
5178
5179    int alignment = 32;
5180    if (GEN_GEN >= 8) {
5181       alignment = 64;
5182    } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
5183       alignment = 512;
5184    }
5185
5186    uint32_t *sdc = brw_state_batch(
5187       brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
5188       alignment, sdc_offset);
5189
5190    struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
5191
5192 #define ASSIGN(dst, src) \
5193    do {                  \
5194       dst = src;         \
5195    } while (0)
5196
5197 #define ASSIGNu16(dst, src) \
5198    do {                     \
5199       dst = (uint16_t)src;  \
5200    } while (0)
5201
5202 #define ASSIGNu8(dst, src) \
5203    do {                    \
5204       dst = (uint8_t)src;  \
5205    } while (0)
5206
5207 #define BORDER_COLOR_ATTR(macro, _color_type, src)              \
5208    macro(state.BorderColor ## _color_type ## Red, src[0]);   \
5209    macro(state.BorderColor ## _color_type ## Green, src[1]);   \
5210    macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
5211    macro(state.BorderColor ## _color_type ## Alpha, src[3]);
5212
5213 #if GEN_GEN >= 8
5214    /* On Broadwell, the border color is represented as four 32-bit floats,
5215     * integers, or unsigned values, interpreted according to the surface
5216     * format.  This matches the sampler->BorderColor union exactly; just
5217     * memcpy the values.
5218     */
5219    BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
5220 #elif GEN_IS_HASWELL
5221    if (is_integer_format || is_stencil_sampling) {
5222       bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
5223       const int bits_per_channel =
5224          _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
5225
5226       /* From the Haswell PRM, "Command Reference: Structures", Page 36:
5227        * "If any color channel is missing from the surface format,
5228        *  corresponding border color should be programmed as zero and if
5229        *  alpha channel is missing, corresponding Alpha border color should
5230        *  be programmed as 1."
5231        */
5232       unsigned c[4] = { 0, 0, 0, 1 };
5233       for (int i = 0; i < 4; i++) {
5234          if (has_component(format, i))
5235             c[i] = color.ui[i];
5236       }
5237
5238       switch (bits_per_channel) {
5239       case 8:
5240          /* Copy RGBA in order. */
5241          BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
5242          break;
5243       case 10:
5244          /* R10G10B10A2_UINT is treated like a 16-bit format. */
5245       case 16:
5246          BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
5247          break;
5248       case 32:
5249          if (base_format == GL_RG) {
5250             /* Careful inspection of the tables reveals that for RG32 formats,
5251              * the green channel needs to go where blue normally belongs.
5252              */
5253             state.BorderColor32bitRed = c[0];
5254             state.BorderColor32bitBlue = c[1];
5255             state.BorderColor32bitAlpha = 1;
5256          } else {
5257             /* Copy RGBA in order. */
5258             BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
5259          }
5260          break;
5261       default:
5262          assert(!"Invalid number of bits per channel in integer format.");
5263          break;
5264       }
5265    } else {
5266       BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5267    }
5268 #elif GEN_GEN == 5 || GEN_GEN == 6
5269    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
5270    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
5271    BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
5272
5273 #define MESA_FLOAT_TO_HALF(dst, src) \
5274    dst = _mesa_float_to_half(src);
5275
5276    BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
5277
5278 #undef MESA_FLOAT_TO_HALF
5279
5280    state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
5281    state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
5282    state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
5283    state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
5284
5285    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5286 #elif GEN_GEN == 4
5287    BORDER_COLOR_ATTR(ASSIGN, , color.f);
5288 #else
5289    BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5290 #endif
5291
5292 #undef ASSIGN
5293 #undef BORDER_COLOR_ATTR
5294
5295    GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
5296 }
5297
5298 static uint32_t
5299 translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest)
5300 {
5301    switch (wrap) {
5302    case GL_REPEAT:
5303       return TCM_WRAP;
5304    case GL_CLAMP:
5305 #if GEN_GEN >= 8
5306       /* GL_CLAMP is the weird mode where coordinates are clamped to
5307        * [0.0, 1.0], so linear filtering of coordinates outside of
5308        * [0.0, 1.0] give you half edge texel value and half border
5309        * color.
5310        *
5311        * Gen8+ supports this natively.
5312        */
5313       return TCM_HALF_BORDER;
5314 #else
5315       /* On Gen4-7.5, we clamp the coordinates in the fragment shader
5316        * and set clamp_border here, which gets the result desired.
5317        * We just use clamp(_to_edge) for nearest, because for nearest
5318        * clamping to 1.0 gives border color instead of the desired
5319        * edge texels.
5320        */
5321       if (using_nearest)
5322          return TCM_CLAMP;
5323       else
5324          return TCM_CLAMP_BORDER;
5325 #endif
5326    case GL_CLAMP_TO_EDGE:
5327       return TCM_CLAMP;
5328    case GL_CLAMP_TO_BORDER:
5329       return TCM_CLAMP_BORDER;
5330    case GL_MIRRORED_REPEAT:
5331       return TCM_MIRROR;
5332    case GL_MIRROR_CLAMP_TO_EDGE:
5333       return TCM_MIRROR_ONCE;
5334    default:
5335       return TCM_WRAP;
5336    }
5337 }
5338
5339 /**
5340  * Return true if the given wrap mode requires the border color to exist.
5341  */
5342 static bool
5343 wrap_mode_needs_border_color(unsigned wrap_mode)
5344 {
5345 #if GEN_GEN >= 8
5346    return wrap_mode == TCM_CLAMP_BORDER ||
5347           wrap_mode == TCM_HALF_BORDER;
5348 #else
5349    return wrap_mode == TCM_CLAMP_BORDER;
5350 #endif
5351 }
5352
5353 /**
5354  * Sets the sampler state for a single unit based off of the sampler key
5355  * entry.
5356  */
5357 static void
5358 genX(update_sampler_state)(struct brw_context *brw,
5359                            GLenum target, bool tex_cube_map_seamless,
5360                            GLfloat tex_unit_lod_bias,
5361                            mesa_format format, GLenum base_format,
5362                            const struct gl_texture_object *texObj,
5363                            const struct gl_sampler_object *sampler,
5364                            uint32_t *sampler_state)
5365 {
5366    struct GENX(SAMPLER_STATE) samp_st = { 0 };
5367
5368    /* Select min and mip filters. */
5369    switch (sampler->MinFilter) {
5370    case GL_NEAREST:
5371       samp_st.MinModeFilter = MAPFILTER_NEAREST;
5372       samp_st.MipModeFilter = MIPFILTER_NONE;
5373       break;
5374    case GL_LINEAR:
5375       samp_st.MinModeFilter = MAPFILTER_LINEAR;
5376       samp_st.MipModeFilter = MIPFILTER_NONE;
5377       break;
5378    case GL_NEAREST_MIPMAP_NEAREST:
5379       samp_st.MinModeFilter = MAPFILTER_NEAREST;
5380       samp_st.MipModeFilter = MIPFILTER_NEAREST;
5381       break;
5382    case GL_LINEAR_MIPMAP_NEAREST:
5383       samp_st.MinModeFilter = MAPFILTER_LINEAR;
5384       samp_st.MipModeFilter = MIPFILTER_NEAREST;
5385       break;
5386    case GL_NEAREST_MIPMAP_LINEAR:
5387       samp_st.MinModeFilter = MAPFILTER_NEAREST;
5388       samp_st.MipModeFilter = MIPFILTER_LINEAR;
5389       break;
5390    case GL_LINEAR_MIPMAP_LINEAR:
5391       samp_st.MinModeFilter = MAPFILTER_LINEAR;
5392       samp_st.MipModeFilter = MIPFILTER_LINEAR;
5393       break;
5394    default:
5395       unreachable("not reached");
5396    }
5397
5398    /* Select mag filter. */
5399    samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
5400       MAPFILTER_LINEAR : MAPFILTER_NEAREST;
5401
5402    /* Enable anisotropic filtering if desired. */
5403    samp_st.MaximumAnisotropy = RATIO21;
5404
5405    if (sampler->MaxAnisotropy > 1.0f) {
5406       if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
5407          samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
5408       if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
5409          samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
5410
5411       if (sampler->MaxAnisotropy > 2.0f) {
5412          samp_st.MaximumAnisotropy =
5413             MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
5414       }
5415    }
5416
5417    /* Set address rounding bits if not using nearest filtering. */
5418    if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
5419       samp_st.UAddressMinFilterRoundingEnable = true;
5420       samp_st.VAddressMinFilterRoundingEnable = true;
5421       samp_st.RAddressMinFilterRoundingEnable = true;
5422    }
5423
5424    if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
5425       samp_st.UAddressMagFilterRoundingEnable = true;
5426       samp_st.VAddressMagFilterRoundingEnable = true;
5427       samp_st.RAddressMagFilterRoundingEnable = true;
5428    }
5429
5430    bool either_nearest =
5431       sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
5432    unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest);
5433    unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest);
5434    unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest);
5435
5436    if (target == GL_TEXTURE_CUBE_MAP ||
5437        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5438       /* Cube maps must use the same wrap mode for all three coordinate
5439        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
5440        *
5441        * Ivybridge and Baytrail seem to have problems with CUBE mode and
5442        * integer formats.  Fall back to CLAMP for now.
5443        */
5444       if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
5445           !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
5446          wrap_s = TCM_CUBE;
5447          wrap_t = TCM_CUBE;
5448          wrap_r = TCM_CUBE;
5449       } else {
5450          wrap_s = TCM_CLAMP;
5451          wrap_t = TCM_CLAMP;
5452          wrap_r = TCM_CLAMP;
5453       }
5454    } else if (target == GL_TEXTURE_1D) {
5455       /* There's a bug in 1D texture sampling - it actually pays
5456        * attention to the wrap_t value, though it should not.
5457        * Override the wrap_t value here to GL_REPEAT to keep
5458        * any nonexistent border pixels from floating in.
5459        */
5460       wrap_t = TCM_WRAP;
5461    }
5462
5463    samp_st.TCXAddressControlMode = wrap_s;
5464    samp_st.TCYAddressControlMode = wrap_t;
5465    samp_st.TCZAddressControlMode = wrap_r;
5466
5467    samp_st.ShadowFunction =
5468       sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5469       intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
5470
5471 #if GEN_GEN >= 7
5472    /* Set shadow function. */
5473    samp_st.AnisotropicAlgorithm =
5474       samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5475       EWAApproximation : LEGACY;
5476 #endif
5477
5478 #if GEN_GEN >= 6
5479    samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5480 #endif
5481
5482    const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
5483    samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
5484    samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
5485    samp_st.TextureLODBias =
5486       CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
5487
5488 #if GEN_GEN == 6
5489    samp_st.BaseMipLevel =
5490       CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
5491    samp_st.MinandMagStateNotEqual =
5492       samp_st.MinModeFilter != samp_st.MagModeFilter;
5493 #endif
5494
5495    /* Upload the border color if necessary.  If not, just point it at
5496     * offset 0 (the start of the batch) - the color should be ignored,
5497     * but that address won't fault in case something reads it anyway.
5498     */
5499    uint32_t border_color_offset = 0;
5500    if (wrap_mode_needs_border_color(wrap_s) ||
5501        wrap_mode_needs_border_color(wrap_t) ||
5502        wrap_mode_needs_border_color(wrap_r)) {
5503       genX(upload_default_color)(brw, sampler, format, base_format,
5504                                  texObj->_IsIntegerFormat,
5505                                  texObj->StencilSampling,
5506                                  &border_color_offset);
5507    }
5508 #if GEN_GEN < 6
5509       samp_st.BorderColorPointer =
5510          ro_bo(brw->batch.state.bo, border_color_offset);
5511 #else
5512       samp_st.BorderColorPointer = border_color_offset;
5513 #endif
5514
5515 #if GEN_GEN >= 8
5516    samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5517 #else
5518    samp_st.LODPreClampEnable = true;
5519 #endif
5520
5521    GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5522 }
5523
5524 static void
5525 update_sampler_state(struct brw_context *brw,
5526                      int unit,
5527                      uint32_t *sampler_state)
5528 {
5529    struct gl_context *ctx = &brw->ctx;
5530    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5531    const struct gl_texture_object *texObj = texUnit->_Current;
5532    const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5533
5534    /* These don't use samplers at all. */
5535    if (texObj->Target == GL_TEXTURE_BUFFER)
5536       return;
5537
5538    struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
5539    genX(update_sampler_state)(brw, texObj->Target,
5540                               ctx->Texture.CubeMapSeamless,
5541                               texUnit->LodBias,
5542                               firstImage->TexFormat, firstImage->_BaseFormat,
5543                               texObj, sampler,
5544                               sampler_state);
5545 }
5546
5547 static void
5548 genX(upload_sampler_state_table)(struct brw_context *brw,
5549                                  struct gl_program *prog,
5550                                  struct brw_stage_state *stage_state)
5551 {
5552    struct gl_context *ctx = &brw->ctx;
5553    uint32_t sampler_count = stage_state->sampler_count;
5554
5555    GLbitfield SamplersUsed = prog->SamplersUsed;
5556
5557    if (sampler_count == 0)
5558       return;
5559
5560    /* SAMPLER_STATE is 4 DWords on all platforms. */
5561    const int dwords = GENX(SAMPLER_STATE_length);
5562    const int size_in_bytes = dwords * sizeof(uint32_t);
5563
5564    uint32_t *sampler_state = brw_state_batch(brw,
5565                                              sampler_count * size_in_bytes,
5566                                              32, &stage_state->sampler_offset);
5567    /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
5568
5569    for (unsigned s = 0; s < sampler_count; s++) {
5570       if (SamplersUsed & (1 << s)) {
5571          const unsigned unit = prog->SamplerUnits[s];
5572          if (ctx->Texture.Unit[unit]._Current) {
5573             update_sampler_state(brw, unit, sampler_state);
5574          }
5575       }
5576
5577       sampler_state += dwords;
5578    }
5579
5580    if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
5581       /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
5582       genX(emit_sampler_state_pointers_xs)(brw, stage_state);
5583    } else {
5584       /* Flag that the sampler state table pointer has changed; later atoms
5585        * will handle it.
5586        */
5587       brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
5588    }
5589 }
5590
5591 static void
5592 genX(upload_fs_samplers)(struct brw_context *brw)
5593 {
5594    /* BRW_NEW_FRAGMENT_PROGRAM */
5595    struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5596    genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5597 }
5598
5599 static const struct brw_tracked_state genX(fs_samplers) = {
5600    .dirty = {
5601       .mesa = _NEW_TEXTURE,
5602       .brw = BRW_NEW_BATCH |
5603              BRW_NEW_BLORP |
5604              BRW_NEW_FRAGMENT_PROGRAM,
5605    },
5606    .emit = genX(upload_fs_samplers),
5607 };
5608
5609 static void
5610 genX(upload_vs_samplers)(struct brw_context *brw)
5611 {
5612    /* BRW_NEW_VERTEX_PROGRAM */
5613    struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5614    genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5615 }
5616
5617 static const struct brw_tracked_state genX(vs_samplers) = {
5618    .dirty = {
5619       .mesa = _NEW_TEXTURE,
5620       .brw = BRW_NEW_BATCH |
5621              BRW_NEW_BLORP |
5622              BRW_NEW_VERTEX_PROGRAM,
5623    },
5624    .emit = genX(upload_vs_samplers),
5625 };
5626
5627 #if GEN_GEN >= 6
5628 static void
5629 genX(upload_gs_samplers)(struct brw_context *brw)
5630 {
5631    /* BRW_NEW_GEOMETRY_PROGRAM */
5632    struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY];
5633    if (!gs)
5634       return;
5635
5636    genX(upload_sampler_state_table)(brw, gs, &brw->gs.base);
5637 }
5638
5639
5640 static const struct brw_tracked_state genX(gs_samplers) = {
5641    .dirty = {
5642       .mesa = _NEW_TEXTURE,
5643       .brw = BRW_NEW_BATCH |
5644              BRW_NEW_BLORP |
5645              BRW_NEW_GEOMETRY_PROGRAM,
5646    },
5647    .emit = genX(upload_gs_samplers),
5648 };
5649 #endif
5650
5651 #if GEN_GEN >= 7
5652 static void
5653 genX(upload_tcs_samplers)(struct brw_context *brw)
5654 {
5655    /* BRW_NEW_TESS_PROGRAMS */
5656    struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL];
5657    if (!tcs)
5658       return;
5659
5660    genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base);
5661 }
5662
5663 static const struct brw_tracked_state genX(tcs_samplers) = {
5664    .dirty = {
5665       .mesa = _NEW_TEXTURE,
5666       .brw = BRW_NEW_BATCH |
5667              BRW_NEW_BLORP |
5668              BRW_NEW_TESS_PROGRAMS,
5669    },
5670    .emit = genX(upload_tcs_samplers),
5671 };
5672 #endif
5673
5674 #if GEN_GEN >= 7
5675 static void
5676 genX(upload_tes_samplers)(struct brw_context *brw)
5677 {
5678    /* BRW_NEW_TESS_PROGRAMS */
5679    struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL];
5680    if (!tes)
5681       return;
5682
5683    genX(upload_sampler_state_table)(brw, tes, &brw->tes.base);
5684 }
5685
5686 static const struct brw_tracked_state genX(tes_samplers) = {
5687    .dirty = {
5688       .mesa = _NEW_TEXTURE,
5689       .brw = BRW_NEW_BATCH |
5690              BRW_NEW_BLORP |
5691              BRW_NEW_TESS_PROGRAMS,
5692    },
5693    .emit = genX(upload_tes_samplers),
5694 };
5695 #endif
5696
5697 #if GEN_GEN >= 7
5698 static void
5699 genX(upload_cs_samplers)(struct brw_context *brw)
5700 {
5701    /* BRW_NEW_COMPUTE_PROGRAM */
5702    struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE];
5703    if (!cs)
5704       return;
5705
5706    genX(upload_sampler_state_table)(brw, cs, &brw->cs.base);
5707 }
5708
5709 const struct brw_tracked_state genX(cs_samplers) = {
5710    .dirty = {
5711       .mesa = _NEW_TEXTURE,
5712       .brw = BRW_NEW_BATCH |
5713              BRW_NEW_BLORP |
5714              BRW_NEW_COMPUTE_PROGRAM,
5715    },
5716    .emit = genX(upload_cs_samplers),
5717 };
5718 #endif
5719
5720 /* ---------------------------------------------------------------------- */
5721
5722 #if GEN_GEN <= 5
5723
5724 static void genX(upload_blend_constant_color)(struct brw_context *brw)
5725 {
5726    struct gl_context *ctx = &brw->ctx;
5727
5728    brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
5729       blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
5730       blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
5731       blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
5732       blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
5733    }
5734 }
5735
5736 static const struct brw_tracked_state genX(blend_constant_color) = {
5737    .dirty = {
5738       .mesa = _NEW_COLOR,
5739       .brw = BRW_NEW_CONTEXT |
5740              BRW_NEW_BLORP,
5741    },
5742    .emit = genX(upload_blend_constant_color)
5743 };
5744 #endif
5745
5746 /* ---------------------------------------------------------------------- */
5747
5748 void
5749 genX(init_atoms)(struct brw_context *brw)
5750 {
5751 #if GEN_GEN < 6
5752    static const struct brw_tracked_state *render_atoms[] =
5753    {
5754       /* Once all the programs are done, we know how large urb entry
5755        * sizes need to be and can decide if we need to change the urb
5756        * layout.
5757        */
5758       &brw_curbe_offsets,
5759       &brw_recalculate_urb_fence,
5760
5761       &genX(cc_vp),
5762       &genX(color_calc_state),
5763
5764       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5765        * table upload must be last.
5766        */
5767       &brw_vs_pull_constants,
5768       &brw_wm_pull_constants,
5769       &brw_renderbuffer_surfaces,
5770       &brw_renderbuffer_read_surfaces,
5771       &brw_texture_surfaces,
5772       &brw_vs_binding_table,
5773       &brw_wm_binding_table,
5774
5775       &genX(fs_samplers),
5776       &genX(vs_samplers),
5777
5778       /* These set up state for brw_psp_urb_cbs */
5779       &genX(wm_state),
5780       &genX(sf_clip_viewport),
5781       &genX(sf_state),
5782       &genX(vs_state), /* always required, enabled or not */
5783       &genX(clip_state),
5784       &genX(gs_state),
5785
5786       /* Command packets:
5787        */
5788       &brw_binding_table_pointers,
5789       &genX(blend_constant_color),
5790
5791       &brw_depthbuffer,
5792
5793       &genX(polygon_stipple),
5794       &genX(polygon_stipple_offset),
5795
5796       &genX(line_stipple),
5797
5798       &brw_psp_urb_cbs,
5799
5800       &genX(drawing_rect),
5801       &brw_indices, /* must come before brw_vertices */
5802       &genX(index_buffer),
5803       &genX(vertices),
5804
5805       &brw_constant_buffer
5806    };
5807 #elif GEN_GEN == 6
5808    static const struct brw_tracked_state *render_atoms[] =
5809    {
5810       &genX(sf_clip_viewport),
5811
5812       /* Command packets: */
5813
5814       &genX(cc_vp),
5815
5816       &gen6_urb,
5817       &genX(blend_state),               /* must do before cc unit */
5818       &genX(color_calc_state),  /* must do before cc unit */
5819       &genX(depth_stencil_state),       /* must do before cc unit */
5820
5821       &genX(vs_push_constants), /* Before vs_state */
5822       &genX(gs_push_constants), /* Before gs_state */
5823       &genX(wm_push_constants), /* Before wm_state */
5824
5825       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5826        * table upload must be last.
5827        */
5828       &brw_vs_pull_constants,
5829       &brw_vs_ubo_surfaces,
5830       &brw_gs_pull_constants,
5831       &brw_gs_ubo_surfaces,
5832       &brw_wm_pull_constants,
5833       &brw_wm_ubo_surfaces,
5834       &gen6_renderbuffer_surfaces,
5835       &brw_renderbuffer_read_surfaces,
5836       &brw_texture_surfaces,
5837       &gen6_sol_surface,
5838       &brw_vs_binding_table,
5839       &gen6_gs_binding_table,
5840       &brw_wm_binding_table,
5841
5842       &genX(fs_samplers),
5843       &genX(vs_samplers),
5844       &genX(gs_samplers),
5845       &gen6_sampler_state,
5846       &genX(multisample_state),
5847
5848       &genX(vs_state),
5849       &genX(gs_state),
5850       &genX(clip_state),
5851       &genX(sf_state),
5852       &genX(wm_state),
5853
5854       &genX(scissor_state),
5855
5856       &gen6_binding_table_pointers,
5857
5858       &brw_depthbuffer,
5859
5860       &genX(polygon_stipple),
5861       &genX(polygon_stipple_offset),
5862
5863       &genX(line_stipple),
5864
5865       &genX(drawing_rect),
5866
5867       &brw_indices, /* must come before brw_vertices */
5868       &genX(index_buffer),
5869       &genX(vertices),
5870    };
5871 #elif GEN_GEN == 7
5872    static const struct brw_tracked_state *render_atoms[] =
5873    {
5874       /* Command packets: */
5875
5876       &genX(cc_vp),
5877       &genX(sf_clip_viewport),
5878
5879       &gen7_l3_state,
5880       &gen7_push_constant_space,
5881       &gen7_urb,
5882       &genX(blend_state),               /* must do before cc unit */
5883       &genX(color_calc_state),  /* must do before cc unit */
5884       &genX(depth_stencil_state),       /* must do before cc unit */
5885
5886       &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
5887       &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
5888       &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
5889       &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
5890       &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
5891
5892       &genX(vs_push_constants), /* Before vs_state */
5893       &genX(tcs_push_constants),
5894       &genX(tes_push_constants),
5895       &genX(gs_push_constants), /* Before gs_state */
5896       &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
5897
5898       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5899        * table upload must be last.
5900        */
5901       &brw_vs_pull_constants,
5902       &brw_vs_ubo_surfaces,
5903       &brw_tcs_pull_constants,
5904       &brw_tcs_ubo_surfaces,
5905       &brw_tes_pull_constants,
5906       &brw_tes_ubo_surfaces,
5907       &brw_gs_pull_constants,
5908       &brw_gs_ubo_surfaces,
5909       &brw_wm_pull_constants,
5910       &brw_wm_ubo_surfaces,
5911       &gen6_renderbuffer_surfaces,
5912       &brw_renderbuffer_read_surfaces,
5913       &brw_texture_surfaces,
5914
5915       &genX(push_constant_packets),
5916
5917       &brw_vs_binding_table,
5918       &brw_tcs_binding_table,
5919       &brw_tes_binding_table,
5920       &brw_gs_binding_table,
5921       &brw_wm_binding_table,
5922
5923       &genX(fs_samplers),
5924       &genX(vs_samplers),
5925       &genX(tcs_samplers),
5926       &genX(tes_samplers),
5927       &genX(gs_samplers),
5928       &genX(multisample_state),
5929
5930       &genX(vs_state),
5931       &genX(hs_state),
5932       &genX(te_state),
5933       &genX(ds_state),
5934       &genX(gs_state),
5935       &genX(sol_state),
5936       &genX(clip_state),
5937       &genX(sbe_state),
5938       &genX(sf_state),
5939       &genX(wm_state),
5940       &genX(ps_state),
5941
5942       &genX(scissor_state),
5943
5944       &brw_depthbuffer,
5945
5946       &genX(polygon_stipple),
5947       &genX(polygon_stipple_offset),
5948
5949       &genX(line_stipple),
5950
5951       &genX(drawing_rect),
5952
5953       &brw_indices, /* must come before brw_vertices */
5954       &genX(index_buffer),
5955       &genX(vertices),
5956
5957 #if GEN_IS_HASWELL
5958       &genX(cut_index),
5959 #endif
5960    };
5961 #elif GEN_GEN >= 8
5962    static const struct brw_tracked_state *render_atoms[] =
5963    {
5964       &genX(cc_vp),
5965       &genX(sf_clip_viewport),
5966
5967       &gen7_l3_state,
5968       &gen7_push_constant_space,
5969       &gen7_urb,
5970       &genX(blend_state),
5971       &genX(color_calc_state),
5972
5973       &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
5974       &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
5975       &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
5976       &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
5977       &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
5978
5979       &genX(vs_push_constants), /* Before vs_state */
5980       &genX(tcs_push_constants),
5981       &genX(tes_push_constants),
5982       &genX(gs_push_constants), /* Before gs_state */
5983       &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
5984
5985       /* Surface state setup.  Must come before the VS/WM unit.  The binding
5986        * table upload must be last.
5987        */
5988       &brw_vs_pull_constants,
5989       &brw_vs_ubo_surfaces,
5990       &brw_tcs_pull_constants,
5991       &brw_tcs_ubo_surfaces,
5992       &brw_tes_pull_constants,
5993       &brw_tes_ubo_surfaces,
5994       &brw_gs_pull_constants,
5995       &brw_gs_ubo_surfaces,
5996       &brw_wm_pull_constants,
5997       &brw_wm_ubo_surfaces,
5998       &gen6_renderbuffer_surfaces,
5999       &brw_renderbuffer_read_surfaces,
6000       &brw_texture_surfaces,
6001
6002       &genX(push_constant_packets),
6003
6004       &brw_vs_binding_table,
6005       &brw_tcs_binding_table,
6006       &brw_tes_binding_table,
6007       &brw_gs_binding_table,
6008       &brw_wm_binding_table,
6009
6010       &genX(fs_samplers),
6011       &genX(vs_samplers),
6012       &genX(tcs_samplers),
6013       &genX(tes_samplers),
6014       &genX(gs_samplers),
6015       &genX(multisample_state),
6016
6017       &genX(vs_state),
6018       &genX(hs_state),
6019       &genX(te_state),
6020       &genX(ds_state),
6021       &genX(gs_state),
6022       &genX(sol_state),
6023       &genX(clip_state),
6024       &genX(raster_state),
6025       &genX(sbe_state),
6026       &genX(sf_state),
6027       &genX(ps_blend),
6028       &genX(ps_extra),
6029       &genX(ps_state),
6030       &genX(depth_stencil_state),
6031       &genX(wm_state),
6032
6033       &genX(scissor_state),
6034
6035       &brw_depthbuffer,
6036
6037       &genX(polygon_stipple),
6038       &genX(polygon_stipple_offset),
6039
6040       &genX(line_stipple),
6041
6042       &genX(drawing_rect),
6043
6044       &genX(vf_topology),
6045
6046       &brw_indices,
6047       &genX(index_buffer),
6048       &genX(vertices),
6049
6050       &genX(cut_index),
6051       &gen8_pma_fix,
6052    };
6053 #endif
6054
6055    STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
6056    brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
6057                            render_atoms, ARRAY_SIZE(render_atoms));
6058
6059 #if GEN_GEN >= 7
6060    static const struct brw_tracked_state *compute_atoms[] =
6061    {
6062       &gen7_l3_state,
6063       &brw_cs_image_surfaces,
6064       &genX(cs_push_constants),
6065       &genX(cs_pull_constants),
6066       &brw_cs_ubo_surfaces,
6067       &brw_cs_texture_surfaces,
6068       &brw_cs_work_groups_surface,
6069       &genX(cs_samplers),
6070       &genX(cs_state),
6071    };
6072
6073    STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
6074    brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
6075                            compute_atoms, ARRAY_SIZE(compute_atoms));
6076
6077    brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
6078    brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
6079 #endif
6080 }