src/intel/vulkan/genX_pipeline.c

   1 /*
   2  * Copyright © 2015 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "anv_private.h"
  25
  26 #include "genxml/gen_macros.h"
  27 #include "genxml/genX_pack.h"
  28
  29 #include "common/gen_l3_config.h"
  30 #include "common/gen_sample_positions.h"
  31 #include "vk_format_info.h"
  32
  33 static uint32_t
  34 vertex_element_comp_control(enum isl_format format, unsigned comp)
  35 {
  36    uint8_t bits;
  37    switch (comp) {
  38    case 0: bits = isl_format_layouts[format].channels.r.bits; break;
  39    case 1: bits = isl_format_layouts[format].channels.g.bits; break;
  40    case 2: bits = isl_format_layouts[format].channels.b.bits; break;
  41    case 3: bits = isl_format_layouts[format].channels.a.bits; break;
  42    default: unreachable("Invalid component");
  43    }
  44
  45    /*
  46     * Take in account hardware restrictions when dealing with 64-bit floats.
  47     *
  48     * From Broadwell spec, command reference structures, page 586:
  49     *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
  50     *   64-bit components are stored * in the URB without any conversion. In
  51     *   this case, vertex elements must be written as 128 or 256 bits, with
  52     *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
  53     *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
  54     *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
  55     *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
  56     *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
  57     *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
  58     *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
  59     *   256-bit vertex element."
  60     */
  61    if (bits) {
  62       return VFCOMP_STORE_SRC;
  63    } else if (comp >= 2 &&
  64               !isl_format_layouts[format].channels.b.bits &&
  65               isl_format_layouts[format].channels.r.type == ISL_RAW) {
  66       /* When emitting 64-bit attributes, we need to write either 128 or 256
  67        * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
  68        * VFCOMP_STORE_0 to pad the written chunk */
  69       return VFCOMP_NOSTORE;
  70    } else if (comp < 3 ||
  71               isl_format_layouts[format].channels.r.type == ISL_RAW) {
  72       /* Note we need to pad with value 0, not 1, due hardware restrictions
  73        * (see comment above) */
  74       return VFCOMP_STORE_0;
  75    } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
  76             isl_format_layouts[format].channels.r.type == ISL_SINT) {
  77       assert(comp == 3);
  78       return VFCOMP_STORE_1_INT;
  79    } else {
  80       assert(comp == 3);
  81       return VFCOMP_STORE_1_FP;
  82    }
  83 }
  84
  85 static void
  86 emit_vertex_input(struct anv_pipeline *pipeline,
  87                   const VkPipelineVertexInputStateCreateInfo *info)
  88 {
  89    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
  90
  91    /* Pull inputs_read out of the VS prog data */
  92    const uint64_t inputs_read = vs_prog_data->inputs_read;
  93    const uint64_t double_inputs_read = vs_prog_data->double_inputs_read;
  94    assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
  95    const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
  96    const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
  97
  98 #if GEN_GEN >= 8
  99    /* On BDW+, we only need to allocate space for base ids.  Setting up
 100     * the actual vertex and instance id is a separate packet.
 101     */
 102    const bool needs_svgs_elem = vs_prog_data->uses_basevertex ||
 103                                 vs_prog_data->uses_baseinstance;
 104 #else
 105    /* On Haswell and prior, vertex and instance id are created by using the
 106     * ComponentControl fields, so we need an element for any of them.
 107     */
 108    const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
 109                                 vs_prog_data->uses_instanceid ||
 110                                 vs_prog_data->uses_basevertex ||
 111                                 vs_prog_data->uses_baseinstance;
 112 #endif
 113
 114    uint32_t elem_count = __builtin_popcount(elements) -
 115       __builtin_popcount(elements_double) / 2;
 116
 117    uint32_t total_elems = elem_count + needs_svgs_elem;
 118    if (total_elems == 0)
 119       return;
 120
 121    uint32_t *p;
 122
 123    const uint32_t num_dwords = 1 + total_elems * 2;
 124    p = anv_batch_emitn(&pipeline->batch, num_dwords,
 125                        GENX(3DSTATE_VERTEX_ELEMENTS));
 126    memset(p + 1, 0, (num_dwords - 1) * 4);
 127
 128    for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
 129       const VkVertexInputAttributeDescription *desc =
 130          &info->pVertexAttributeDescriptions[i];
 131       enum isl_format format = anv_get_isl_format(&pipeline->device->info,
 132                                                   desc->format,
 133                                                   VK_IMAGE_ASPECT_COLOR_BIT,
 134                                                   VK_IMAGE_TILING_LINEAR);
 135
 136       assert(desc->binding < 32);
 137
 138       if ((elements & (1 << desc->location)) == 0)
 139          continue; /* Binding unused */
 140
 141       uint32_t slot =
 142          __builtin_popcount(elements & ((1 << desc->location) - 1)) -
 143          DIV_ROUND_UP(__builtin_popcount(elements_double &
 144                                         ((1 << desc->location) -1)), 2);
 145
 146       struct GENX(VERTEX_ELEMENT_STATE) element = {
 147          .VertexBufferIndex = desc->binding,
 148          .Valid = true,
 149          .SourceElementFormat = format,
 150          .EdgeFlagEnable = false,
 151          .SourceElementOffset = desc->offset,
 152          .Component0Control = vertex_element_comp_control(format, 0),
 153          .Component1Control = vertex_element_comp_control(format, 1),
 154          .Component2Control = vertex_element_comp_control(format, 2),
 155          .Component3Control = vertex_element_comp_control(format, 3),
 156       };
 157       GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);
 158
 159 #if GEN_GEN >= 8
 160       /* On Broadwell and later, we have a separate VF_INSTANCING packet
 161        * that controls instancing.  On Haswell and prior, that's part of
 162        * VERTEX_BUFFER_STATE which we emit later.
 163        */
 164       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
 165          vfi.InstancingEnable = pipeline->instancing_enable[desc->binding];
 166          vfi.VertexElementIndex = slot;
 167          /* Vulkan so far doesn't have an instance divisor, so
 168           * this is always 1 (ignored if not instancing). */
 169          vfi.InstanceDataStepRate = 1;
 170       }
 171 #endif
 172    }
 173
 174    const uint32_t id_slot = elem_count;
 175    if (needs_svgs_elem) {
 176       /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
 177        *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
 178        *    Control field is set to something other than VFCOMP_STORE_SRC,
 179        *    no higher-numbered Component Control fields may be set to
 180        *    VFCOMP_STORE_SRC"
 181        *
 182        * This means, that if we have BaseInstance, we need BaseVertex as
 183        * well.  Just do all or nothing.
 184        */
 185       uint32_t base_ctrl = (vs_prog_data->uses_basevertex ||
 186                             vs_prog_data->uses_baseinstance) ?
 187                            VFCOMP_STORE_SRC : VFCOMP_STORE_0;
 188
 189       struct GENX(VERTEX_ELEMENT_STATE) element = {
 190          .VertexBufferIndex = 32, /* Reserved for this */
 191          .Valid = true,
 192          .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
 193          .Component0Control = base_ctrl,
 194          .Component1Control = base_ctrl,
 195 #if GEN_GEN >= 8
 196          .Component2Control = VFCOMP_STORE_0,
 197          .Component3Control = VFCOMP_STORE_0,
 198 #else
 199          .Component2Control = VFCOMP_STORE_VID,
 200          .Component3Control = VFCOMP_STORE_IID,
 201 #endif
 202       };
 203       GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);
 204    }
 205
 206 #if GEN_GEN >= 8
 207    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_SGVS), sgvs) {
 208       sgvs.VertexIDEnable              = vs_prog_data->uses_vertexid;
 209       sgvs.VertexIDComponentNumber     = 2;
 210       sgvs.VertexIDElementOffset       = id_slot;
 211       sgvs.InstanceIDEnable            = vs_prog_data->uses_instanceid;
 212       sgvs.InstanceIDComponentNumber   = 3;
 213       sgvs.InstanceIDElementOffset     = id_slot;
 214    }
 215 #endif
 216 }
 217
 218 void
 219 genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
 220                      const struct gen_l3_config *l3_config,
 221                      VkShaderStageFlags active_stages,
 222                      const unsigned entry_size[4])
 223 {
 224    const struct gen_device_info *devinfo = &device->info;
 225 #if GEN_IS_HASWELL
 226    const unsigned push_constant_kb = devinfo->gt == 3 ? 32 : 16;
 227 #else
 228    const unsigned push_constant_kb = GEN_GEN >= 8 ? 32 : 16;
 229 #endif
 230
 231    const unsigned urb_size_kb = gen_get_l3_config_urb_size(devinfo, l3_config);
 232
 233    unsigned entries[4];
 234    unsigned start[4];
 235    gen_get_urb_config(devinfo,
 236                       1024 * push_constant_kb, 1024 * urb_size_kb,
 237                       active_stages &
 238                          VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
 239                       active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
 240                       entry_size, entries, start);
 241
 242 #if GEN_GEN == 7 && !GEN_IS_HASWELL
 243    /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
 244     *
 245     *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
 246     *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
 247     *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
 248     *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
 249     *    needs to be sent before any combination of VS associated 3DSTATE."
 250     */
 251    anv_batch_emit(batch, GEN7_PIPE_CONTROL, pc) {
 252       pc.DepthStallEnable  = true;
 253       pc.PostSyncOperation = WriteImmediateData;
 254       pc.Address           = (struct anv_address) { &device->workaround_bo, 0 };
 255    }
 256 #endif
 257
 258    for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
 259       anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
 260          urb._3DCommandSubOpcode      += i;
 261          urb.VSURBStartingAddress      = start[i];
 262          urb.VSURBEntryAllocationSize  = entry_size[i] - 1;
 263          urb.VSNumberofURBEntries      = entries[i];
 264       }
 265    }
 266 }
 267
 268 static inline void
 269 emit_urb_setup(struct anv_pipeline *pipeline)
 270 {
 271    unsigned entry_size[4];
 272    for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
 273       const struct brw_vue_prog_data *prog_data =
 274          !anv_pipeline_has_stage(pipeline, i) ? NULL :
 275          (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;
 276
 277       entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
 278    }
 279
 280    genX(emit_urb_setup)(pipeline->device, &pipeline->batch,
 281                         pipeline->urb.l3_config,
 282                         pipeline->active_stages, entry_size);
 283 }
 284
 285 static void
 286 emit_3dstate_sbe(struct anv_pipeline *pipeline)
 287 {
 288    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 289
 290    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
 291       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SBE), sbe);
 292 #if GEN_GEN >= 8
 293       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SBE_SWIZ), sbe);
 294 #endif
 295       return;
 296    }
 297
 298    const struct brw_vue_map *fs_input_map =
 299       &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
 300
 301    struct GENX(3DSTATE_SBE) sbe = {
 302       GENX(3DSTATE_SBE_header),
 303       .AttributeSwizzleEnable = true,
 304       .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
 305       .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
 306       .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
 307    };
 308
 309 #if GEN_GEN >= 9
 310    for (unsigned i = 0; i < 32; i++)
 311       sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
 312 #endif
 313
 314 #if GEN_GEN >= 8
 315    /* On Broadwell, they broke 3DSTATE_SBE into two packets */
 316    struct GENX(3DSTATE_SBE_SWIZ) swiz = {
 317       GENX(3DSTATE_SBE_SWIZ_header),
 318    };
 319 #else
 320 #  define swiz sbe
 321 #endif
 322
 323    /* Skip the VUE header and position slots by default */
 324    unsigned urb_entry_read_offset = 1;
 325    int max_source_attr = 0;
 326    for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
 327       int input_index = wm_prog_data->urb_setup[attr];
 328
 329       if (input_index < 0)
 330          continue;
 331
 332       /* gl_Layer is stored in the VUE header */
 333       if (attr == VARYING_SLOT_LAYER) {
 334          urb_entry_read_offset = 0;
 335          continue;
 336       }
 337
 338       if (attr == VARYING_SLOT_PNTC) {
 339          sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
 340          continue;
 341       }
 342
 343       const int slot = fs_input_map->varying_to_slot[attr];
 344
 345       if (input_index >= 16)
 346          continue;
 347
 348       if (slot == -1) {
 349          /* This attribute does not exist in the VUE--that means that the
 350           * vertex shader did not write to it.  It could be that it's a
 351           * regular varying read by the fragment shader but not written by
 352           * the vertex shader or it's gl_PrimitiveID. In the first case the
 353           * value is undefined, in the second it needs to be
 354           * gl_PrimitiveID.
 355           */
 356          swiz.Attribute[input_index].ConstantSource = PRIM_ID;
 357          swiz.Attribute[input_index].ComponentOverrideX = true;
 358          swiz.Attribute[input_index].ComponentOverrideY = true;
 359          swiz.Attribute[input_index].ComponentOverrideZ = true;
 360          swiz.Attribute[input_index].ComponentOverrideW = true;
 361       } else {
 362          /* We have to subtract two slots to accout for the URB entry output
 363           * read offset in the VS and GS stages.
 364           */
 365          assert(slot >= 2);
 366          const int source_attr = slot - 2 * urb_entry_read_offset;
 367          max_source_attr = MAX2(max_source_attr, source_attr);
 368          swiz.Attribute[input_index].SourceAttribute = source_attr;
 369       }
 370    }
 371
 372    sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
 373    sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
 374 #if GEN_GEN >= 8
 375    sbe.ForceVertexURBEntryReadOffset = true;
 376    sbe.ForceVertexURBEntryReadLength = true;
 377 #endif
 378
 379    uint32_t *dw = anv_batch_emit_dwords(&pipeline->batch,
 380                                         GENX(3DSTATE_SBE_length));
 381    GENX(3DSTATE_SBE_pack)(&pipeline->batch, dw, &sbe);
 382
 383 #if GEN_GEN >= 8
 384    dw = anv_batch_emit_dwords(&pipeline->batch, GENX(3DSTATE_SBE_SWIZ_length));
 385    GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->batch, dw, &swiz);
 386 #endif
 387 }
 388
 389 static const uint32_t vk_to_gen_cullmode[] = {
 390    [VK_CULL_MODE_NONE]                       = CULLMODE_NONE,
 391    [VK_CULL_MODE_FRONT_BIT]                  = CULLMODE_FRONT,
 392    [VK_CULL_MODE_BACK_BIT]                   = CULLMODE_BACK,
 393    [VK_CULL_MODE_FRONT_AND_BACK]             = CULLMODE_BOTH
 394 };
 395
 396 static const uint32_t vk_to_gen_fillmode[] = {
 397    [VK_POLYGON_MODE_FILL]                    = FILL_MODE_SOLID,
 398    [VK_POLYGON_MODE_LINE]                    = FILL_MODE_WIREFRAME,
 399    [VK_POLYGON_MODE_POINT]                   = FILL_MODE_POINT,
 400 };
 401
 402 static const uint32_t vk_to_gen_front_face[] = {
 403    [VK_FRONT_FACE_COUNTER_CLOCKWISE]         = 1,
 404    [VK_FRONT_FACE_CLOCKWISE]                 = 0
 405 };
 406
 407 static void
 408 emit_rs_state(struct anv_pipeline *pipeline,
 409               const VkPipelineRasterizationStateCreateInfo *rs_info,
 410               const VkPipelineMultisampleStateCreateInfo *ms_info,
 411               const struct anv_render_pass *pass,
 412               const struct anv_subpass *subpass)
 413 {
 414    struct GENX(3DSTATE_SF) sf = {
 415       GENX(3DSTATE_SF_header),
 416    };
 417
 418    sf.ViewportTransformEnable = true;
 419    sf.StatisticsEnable = true;
 420    sf.TriangleStripListProvokingVertexSelect = 0;
 421    sf.LineStripListProvokingVertexSelect = 0;
 422    sf.TriangleFanProvokingVertexSelect = 1;
 423
 424    const struct brw_vue_prog_data *last_vue_prog_data =
 425       anv_pipeline_get_last_vue_prog_data(pipeline);
 426
 427    if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
 428       sf.PointWidthSource = Vertex;
 429    } else {
 430       sf.PointWidthSource = State;
 431       sf.PointWidth = 1.0;
 432    }
 433
 434 #if GEN_GEN >= 8
 435    struct GENX(3DSTATE_RASTER) raster = {
 436       GENX(3DSTATE_RASTER_header),
 437    };
 438 #else
 439 #  define raster sf
 440 #endif
 441
 442    /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
 443     * "Multisample Modes State".
 444     */
 445 #if GEN_GEN >= 8
 446    raster.DXMultisampleRasterizationEnable = true;
 447    raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
 448    raster.ForceMultisampling = false;
 449 #else
 450    raster.MultisampleRasterizationMode =
 451       (ms_info && ms_info->rasterizationSamples > 1) ?
 452       MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
 453 #endif
 454
 455    raster.FrontWinding = vk_to_gen_front_face[rs_info->frontFace];
 456    raster.CullMode = vk_to_gen_cullmode[rs_info->cullMode];
 457    raster.FrontFaceFillMode = vk_to_gen_fillmode[rs_info->polygonMode];
 458    raster.BackFaceFillMode = vk_to_gen_fillmode[rs_info->polygonMode];
 459    raster.ScissorRectangleEnable = true;
 460
 461 #if GEN_GEN >= 9
 462    /* GEN9+ splits ViewportZClipTestEnable into near and far enable bits */
 463    raster.ViewportZFarClipTestEnable = !pipeline->depth_clamp_enable;
 464    raster.ViewportZNearClipTestEnable = !pipeline->depth_clamp_enable;
 465 #elif GEN_GEN >= 8
 466    raster.ViewportZClipTestEnable = !pipeline->depth_clamp_enable;
 467 #endif
 468
 469    raster.GlobalDepthOffsetEnableSolid = rs_info->depthBiasEnable;
 470    raster.GlobalDepthOffsetEnableWireframe = rs_info->depthBiasEnable;
 471    raster.GlobalDepthOffsetEnablePoint = rs_info->depthBiasEnable;
 472
 473 #if GEN_GEN == 7
 474    /* Gen7 requires that we provide the depth format in 3DSTATE_SF so that it
 475     * can get the depth offsets correct.
 476     */
 477    if (subpass->depth_stencil_attachment < pass->attachment_count) {
 478       VkFormat vk_format =
 479          pass->attachments[subpass->depth_stencil_attachment].format;
 480       assert(vk_format_is_depth_or_stencil(vk_format));
 481       if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
 482          enum isl_format isl_format =
 483             anv_get_isl_format(&pipeline->device->info, vk_format,
 484                                VK_IMAGE_ASPECT_DEPTH_BIT,
 485                                VK_IMAGE_TILING_OPTIMAL);
 486          sf.DepthBufferSurfaceFormat =
 487             isl_format_get_depth_format(isl_format, false);
 488       }
 489    }
 490 #endif
 491
 492 #if GEN_GEN >= 8
 493    GENX(3DSTATE_SF_pack)(NULL, pipeline->gen8.sf, &sf);
 494    GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gen8.raster, &raster);
 495 #else
 496 #  undef raster
 497    GENX(3DSTATE_SF_pack)(NULL, &pipeline->gen7.sf, &sf);
 498 #endif
 499 }
 500
 501 static void
 502 emit_ms_state(struct anv_pipeline *pipeline,
 503               const VkPipelineMultisampleStateCreateInfo *info)
 504 {
 505    uint32_t samples = 1;
 506    uint32_t log2_samples = 0;
 507
 508    /* From the Vulkan 1.0 spec:
 509     *    If pSampleMask is NULL, it is treated as if the mask has all bits
 510     *    enabled, i.e. no coverage is removed from fragments.
 511     *
 512     * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
 513     */
 514 #if GEN_GEN >= 8
 515    uint32_t sample_mask = 0xffff;
 516 #else
 517    uint32_t sample_mask = 0xff;
 518 #endif
 519
 520    if (info) {
 521       samples = info->rasterizationSamples;
 522       log2_samples = __builtin_ffs(samples) - 1;
 523    }
 524
 525    if (info && info->pSampleMask)
 526       sample_mask &= info->pSampleMask[0];
 527
 528    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_MULTISAMPLE), ms) {
 529       ms.NumberofMultisamples       = log2_samples;
 530
 531 #if GEN_GEN >= 8
 532       /* The PRM says that this bit is valid only for DX9:
 533        *
 534        *    SW can choose to set this bit only for DX9 API. DX10/OGL API's
 535        *    should not have any effect by setting or not setting this bit.
 536        */
 537       ms.PixelPositionOffsetEnable  = false;
 538       ms.PixelLocation              = CENTER;
 539 #else
 540       ms.PixelLocation              = PIXLOC_CENTER;
 541
 542       switch (samples) {
 543       case 1:
 544          GEN_SAMPLE_POS_1X(ms.Sample);
 545          break;
 546       case 2:
 547          GEN_SAMPLE_POS_2X(ms.Sample);
 548          break;
 549       case 4:
 550          GEN_SAMPLE_POS_4X(ms.Sample);
 551          break;
 552       case 8:
 553          GEN_SAMPLE_POS_8X(ms.Sample);
 554          break;
 555       default:
 556          break;
 557       }
 558 #endif
 559    }
 560
 561    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
 562       sm.SampleMask = sample_mask;
 563    }
 564 }
 565
 566 static const uint32_t vk_to_gen_logic_op[] = {
 567    [VK_LOGIC_OP_COPY]                        = LOGICOP_COPY,
 568    [VK_LOGIC_OP_CLEAR]                       = LOGICOP_CLEAR,
 569    [VK_LOGIC_OP_AND]                         = LOGICOP_AND,
 570    [VK_LOGIC_OP_AND_REVERSE]                 = LOGICOP_AND_REVERSE,
 571    [VK_LOGIC_OP_AND_INVERTED]                = LOGICOP_AND_INVERTED,
 572    [VK_LOGIC_OP_NO_OP]                       = LOGICOP_NOOP,
 573    [VK_LOGIC_OP_XOR]                         = LOGICOP_XOR,
 574    [VK_LOGIC_OP_OR]                          = LOGICOP_OR,
 575    [VK_LOGIC_OP_NOR]                         = LOGICOP_NOR,
 576    [VK_LOGIC_OP_EQUIVALENT]                  = LOGICOP_EQUIV,
 577    [VK_LOGIC_OP_INVERT]                      = LOGICOP_INVERT,
 578    [VK_LOGIC_OP_OR_REVERSE]                  = LOGICOP_OR_REVERSE,
 579    [VK_LOGIC_OP_COPY_INVERTED]               = LOGICOP_COPY_INVERTED,
 580    [VK_LOGIC_OP_OR_INVERTED]                 = LOGICOP_OR_INVERTED,
 581    [VK_LOGIC_OP_NAND]                        = LOGICOP_NAND,
 582    [VK_LOGIC_OP_SET]                         = LOGICOP_SET,
 583 };
 584
 585 static const uint32_t vk_to_gen_blend[] = {
 586    [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
 587    [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
 588    [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
 589    [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
 590    [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
 591    [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
 592    [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
 593    [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
 594    [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
 595    [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
 596    [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
 597    [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
 598    [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
 599    [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
 600    [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
 601    [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
 602    [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
 603    [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
 604    [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
 605 };
 606
 607 static const uint32_t vk_to_gen_blend_op[] = {
 608    [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
 609    [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
 610    [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
 611    [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
 612    [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
 613 };
 614
 615 static const uint32_t vk_to_gen_compare_op[] = {
 616    [VK_COMPARE_OP_NEVER]                        = PREFILTEROPNEVER,
 617    [VK_COMPARE_OP_LESS]                         = PREFILTEROPLESS,
 618    [VK_COMPARE_OP_EQUAL]                        = PREFILTEROPEQUAL,
 619    [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROPLEQUAL,
 620    [VK_COMPARE_OP_GREATER]                      = PREFILTEROPGREATER,
 621    [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROPNOTEQUAL,
 622    [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROPGEQUAL,
 623    [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROPALWAYS,
 624 };
 625
 626 static const uint32_t vk_to_gen_stencil_op[] = {
 627    [VK_STENCIL_OP_KEEP]                         = STENCILOP_KEEP,
 628    [VK_STENCIL_OP_ZERO]                         = STENCILOP_ZERO,
 629    [VK_STENCIL_OP_REPLACE]                      = STENCILOP_REPLACE,
 630    [VK_STENCIL_OP_INCREMENT_AND_CLAMP]          = STENCILOP_INCRSAT,
 631    [VK_STENCIL_OP_DECREMENT_AND_CLAMP]          = STENCILOP_DECRSAT,
 632    [VK_STENCIL_OP_INVERT]                       = STENCILOP_INVERT,
 633    [VK_STENCIL_OP_INCREMENT_AND_WRAP]           = STENCILOP_INCR,
 634    [VK_STENCIL_OP_DECREMENT_AND_WRAP]           = STENCILOP_DECR,
 635 };
 636
 637 static void
 638 emit_ds_state(struct anv_pipeline *pipeline,
 639               const VkPipelineDepthStencilStateCreateInfo *info,
 640               const struct anv_render_pass *pass,
 641               const struct anv_subpass *subpass)
 642 {
 643 #if GEN_GEN == 7
 644 #  define depth_stencil_dw pipeline->gen7.depth_stencil_state
 645 #elif GEN_GEN == 8
 646 #  define depth_stencil_dw pipeline->gen8.wm_depth_stencil
 647 #else
 648 #  define depth_stencil_dw pipeline->gen9.wm_depth_stencil
 649 #endif
 650
 651    if (info == NULL) {
 652       /* We're going to OR this together with the dynamic state.  We need
 653        * to make sure it's initialized to something useful.
 654        */
 655       memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
 656       return;
 657    }
 658
 659    /* VkBool32 depthBoundsTestEnable; // optional (depth_bounds_test) */
 660
 661 #if GEN_GEN <= 7
 662    struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
 663 #else
 664    struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {
 665 #endif
 666       .DepthTestEnable = info->depthTestEnable,
 667       .DepthBufferWriteEnable = info->depthWriteEnable,
 668       .DepthTestFunction = vk_to_gen_compare_op[info->depthCompareOp],
 669       .DoubleSidedStencilEnable = true,
 670
 671       .StencilTestEnable = info->stencilTestEnable,
 672       .StencilBufferWriteEnable = info->stencilTestEnable,
 673       .StencilFailOp = vk_to_gen_stencil_op[info->front.failOp],
 674       .StencilPassDepthPassOp = vk_to_gen_stencil_op[info->front.passOp],
 675       .StencilPassDepthFailOp = vk_to_gen_stencil_op[info->front.depthFailOp],
 676       .StencilTestFunction = vk_to_gen_compare_op[info->front.compareOp],
 677       .BackfaceStencilFailOp = vk_to_gen_stencil_op[info->back.failOp],
 678       .BackfaceStencilPassDepthPassOp = vk_to_gen_stencil_op[info->back.passOp],
 679       .BackfaceStencilPassDepthFailOp =vk_to_gen_stencil_op[info->back.depthFailOp],
 680       .BackfaceStencilTestFunction = vk_to_gen_compare_op[info->back.compareOp],
 681    };
 682
 683    VkImageAspectFlags aspects = 0;
 684    if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
 685       VkFormat depth_stencil_format =
 686          pass->attachments[subpass->depth_stencil_attachment].format;
 687       aspects = vk_format_aspects(depth_stencil_format);
 688    }
 689
 690    /* The Vulkan spec requires that if either depth or stencil is not present,
 691     * the pipeline is to act as if the test silently passes.
 692     */
 693    if (!(aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
 694       depth_stencil.DepthBufferWriteEnable = false;
 695       depth_stencil.DepthTestFunction = PREFILTEROPALWAYS;
 696    }
 697
 698    if (!(aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
 699       depth_stencil.StencilBufferWriteEnable = false;
 700       depth_stencil.StencilTestFunction = PREFILTEROPALWAYS;
 701       depth_stencil.BackfaceStencilTestFunction = PREFILTEROPALWAYS;
 702    }
 703
 704    /* From the Broadwell PRM:
 705     *
 706     *    "If Depth_Test_Enable = 1 AND Depth_Test_func = EQUAL, the
 707     *    Depth_Write_Enable must be set to 0."
 708     */
 709    if (info->depthTestEnable && info->depthCompareOp == VK_COMPARE_OP_EQUAL)
 710       depth_stencil.DepthBufferWriteEnable = false;
 711
 712 #if GEN_GEN <= 7
 713    GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
 714 #else
 715    GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
 716 #endif
 717 }
 718
 719 static void
 720 emit_cb_state(struct anv_pipeline *pipeline,
 721               const VkPipelineColorBlendStateCreateInfo *info,
 722               const VkPipelineMultisampleStateCreateInfo *ms_info)
 723 {
 724    struct anv_device *device = pipeline->device;
 725
 726    const uint32_t num_dwords = GENX(BLEND_STATE_length);
 727    pipeline->blend_state =
 728       anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
 729
 730    struct GENX(BLEND_STATE) blend_state = {
 731 #if GEN_GEN >= 8
 732       .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
 733       .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
 734 #else
 735       /* Make sure it gets zeroed */
 736       .Entry = { { 0, }, },
 737 #endif
 738    };
 739
 740    /* Default everything to disabled */
 741    for (uint32_t i = 0; i < 8; i++) {
 742       blend_state.Entry[i].WriteDisableAlpha = true;
 743       blend_state.Entry[i].WriteDisableRed = true;
 744       blend_state.Entry[i].WriteDisableGreen = true;
 745       blend_state.Entry[i].WriteDisableBlue = true;
 746    }
 747
 748    uint32_t surface_count = 0;
 749    struct anv_pipeline_bind_map *map;
 750    if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
 751       map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
 752       surface_count = map->surface_count;
 753    }
 754
 755    bool has_writeable_rt = false;
 756    for (unsigned i = 0; i < surface_count; i++) {
 757       struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
 758
 759       /* All color attachments are at the beginning of the binding table */
 760       if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
 761          break;
 762
 763       /* We can have at most 8 attachments */
 764       assert(i < 8);
 765
 766       if (binding->index >= info->attachmentCount)
 767          continue;
 768
 769       assert(binding->binding == 0);
 770       const VkPipelineColorBlendAttachmentState *a =
 771          &info->pAttachments[binding->index];
 772
 773       blend_state.Entry[i] = (struct GENX(BLEND_STATE_ENTRY)) {
 774 #if GEN_GEN < 8
 775          .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
 776          .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
 777 #endif
 778          .LogicOpEnable = info->logicOpEnable,
 779          .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
 780          .ColorBufferBlendEnable = a->blendEnable,
 781          .ColorClampRange = COLORCLAMP_RTFORMAT,
 782          .PreBlendColorClampEnable = true,
 783          .PostBlendColorClampEnable = true,
 784          .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
 785          .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
 786          .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
 787          .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
 788          .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
 789          .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
 790          .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
 791          .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
 792          .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
 793          .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
 794       };
 795
 796       if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
 797           a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
 798           a->colorBlendOp != a->alphaBlendOp) {
 799 #if GEN_GEN >= 8
 800          blend_state.IndependentAlphaBlendEnable = true;
 801 #else
 802          blend_state.Entry[i].IndependentAlphaBlendEnable = true;
 803 #endif
 804       }
 805
 806       if (a->colorWriteMask != 0)
 807          has_writeable_rt = true;
 808
 809       /* Our hardware applies the blend factor prior to the blend function
 810        * regardless of what function is used.  Technically, this means the
 811        * hardware can do MORE than GL or Vulkan specify.  However, it also
 812        * means that, for MIN and MAX, we have to stomp the blend factor to
 813        * ONE to make it a no-op.
 814        */
 815       if (a->colorBlendOp == VK_BLEND_OP_MIN ||
 816           a->colorBlendOp == VK_BLEND_OP_MAX) {
 817          blend_state.Entry[i].SourceBlendFactor = BLENDFACTOR_ONE;
 818          blend_state.Entry[i].DestinationBlendFactor = BLENDFACTOR_ONE;
 819       }
 820       if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
 821           a->alphaBlendOp == VK_BLEND_OP_MAX) {
 822          blend_state.Entry[i].SourceAlphaBlendFactor = BLENDFACTOR_ONE;
 823          blend_state.Entry[i].DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
 824       }
 825    }
 826
 827 #if GEN_GEN >= 8
 828    struct GENX(BLEND_STATE_ENTRY) *bs0 = &blend_state.Entry[0];
 829    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_BLEND), blend) {
 830       blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
 831       blend.HasWriteableRT                = has_writeable_rt;
 832       blend.ColorBufferBlendEnable        = bs0->ColorBufferBlendEnable;
 833       blend.SourceAlphaBlendFactor        = bs0->SourceAlphaBlendFactor;
 834       blend.DestinationAlphaBlendFactor   = bs0->DestinationAlphaBlendFactor;
 835       blend.SourceBlendFactor             = bs0->SourceBlendFactor;
 836       blend.DestinationBlendFactor        = bs0->DestinationBlendFactor;
 837       blend.AlphaTestEnable               = false;
 838       blend.IndependentAlphaBlendEnable   =
 839          blend_state.IndependentAlphaBlendEnable;
 840    }
 841 #else
 842    (void)has_writeable_rt;
 843 #endif
 844
 845    GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend_state);
 846    if (!device->info.has_llc)
 847       anv_state_clflush(pipeline->blend_state);
 848
 849    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
 850       bsp.BlendStatePointer      = pipeline->blend_state.offset;
 851 #if GEN_GEN >= 8
 852       bsp.BlendStatePointerValid = true;
 853 #endif
 854    }
 855 }
 856
 857 static void
 858 emit_3dstate_clip(struct anv_pipeline *pipeline,
 859                   const VkPipelineViewportStateCreateInfo *vp_info,
 860                   const VkPipelineRasterizationStateCreateInfo *rs_info)
 861 {
 862    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
 863    (void) wm_prog_data;
 864    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_CLIP), clip) {
 865       clip.ClipEnable               = true;
 866       clip.EarlyCullEnable          = true;
 867       clip.APIMode                  = APIMODE_D3D,
 868       clip.ViewportXYClipTestEnable = true;
 869
 870       clip.ClipMode = CLIPMODE_NORMAL;
 871
 872       clip.TriangleStripListProvokingVertexSelect = 0;
 873       clip.LineStripListProvokingVertexSelect     = 0;
 874       clip.TriangleFanProvokingVertexSelect       = 1;
 875
 876       clip.MinimumPointWidth = 0.125;
 877       clip.MaximumPointWidth = 255.875;
 878       clip.MaximumVPIndex    = (vp_info ? vp_info->viewportCount : 1) - 1;
 879
 880 #if GEN_GEN == 7
 881       clip.FrontWinding            = vk_to_gen_front_face[rs_info->frontFace];
 882       clip.CullMode                = vk_to_gen_cullmode[rs_info->cullMode];
 883       clip.ViewportZClipTestEnable = !pipeline->depth_clamp_enable;
 884       const struct brw_vue_prog_data *last =
 885          anv_pipeline_get_last_vue_prog_data(pipeline);
 886       if (last) {
 887          clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
 888          clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
 889       }
 890 #else
 891       clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
 892          (wm_prog_data->barycentric_interp_modes & 0x38) != 0 : 0;
 893 #endif
 894    }
 895 }
 896
 897 static void
 898 emit_3dstate_streamout(struct anv_pipeline *pipeline,
 899                        const VkPipelineRasterizationStateCreateInfo *rs_info)
 900 {
 901    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_STREAMOUT), so) {
 902       so.RenderingDisable = rs_info->rasterizerDiscardEnable;
 903    }
 904 }
 905
 906 static inline uint32_t
 907 get_sampler_count(const struct anv_shader_bin *bin)
 908 {
 909    return DIV_ROUND_UP(bin->bind_map.sampler_count, 4);
 910 }
 911
 912 static inline uint32_t
 913 get_binding_table_entry_count(const struct anv_shader_bin *bin)
 914 {
 915    return DIV_ROUND_UP(bin->bind_map.surface_count, 32);
 916 }
 917
 918 static inline struct anv_address
 919 get_scratch_address(struct anv_pipeline *pipeline,
 920                     gl_shader_stage stage,
 921                     const struct anv_shader_bin *bin)
 922 {
 923    return (struct anv_address) {
 924       .bo = anv_scratch_pool_alloc(pipeline->device,
 925                                    &pipeline->device->scratch_pool,
 926                                    stage, bin->prog_data->total_scratch),
 927       .offset = 0,
 928    };
 929 }
 930
 931 static inline uint32_t
 932 get_scratch_space(const struct anv_shader_bin *bin)
 933 {
 934    return ffs(bin->prog_data->total_scratch / 2048);
 935 }
 936
 937 static inline uint32_t
 938 get_urb_output_offset()
 939 {
 940    /* Skip the VUE header and position slots */
 941    return 1;
 942 }
 943
 944 static inline uint32_t
 945 get_urb_output_length(const struct anv_shader_bin *bin)
 946 {
 947    const struct brw_vue_prog_data *prog_data =
 948       (const struct brw_vue_prog_data *)bin->prog_data;
 949
 950    return (prog_data->vue_map.num_slots + 1) / 2 - get_urb_output_offset();
 951 }
 952
 953 static void
 954 emit_3dstate_vs(struct anv_pipeline *pipeline)
 955 {
 956    const struct gen_device_info *devinfo = &pipeline->device->info;
 957    const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
 958    const struct anv_shader_bin *vs_bin =
 959       pipeline->shaders[MESA_SHADER_VERTEX];
 960
 961    assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
 962
 963    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VS), vs) {
 964       vs.FunctionEnable       = true;
 965       vs.StatisticsEnable     = true;
 966       vs.KernelStartPointer   = vs_bin->kernel.offset;
 967 #if GEN_GEN >= 8
 968       vs.SIMD8DispatchEnable  =
 969          vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
 970 #endif
 971
 972       assert(!vs_prog_data->base.base.use_alt_mode);
 973       vs.SingleVertexDispatch       = false;
 974       vs.VectorMaskEnable           = false;
 975       vs.SamplerCount               = get_sampler_count(vs_bin);
 976       vs.BindingTableEntryCount     = get_binding_table_entry_count(vs_bin);
 977       vs.FloatingPointMode          = IEEE754;
 978       vs.IllegalOpcodeExceptionEnable = false;
 979       vs.SoftwareExceptionEnable    = false;
 980       vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;
 981       vs.VertexCacheDisable         = false;
 982
 983       vs.VertexURBEntryReadLength      = vs_prog_data->base.urb_read_length;
 984       vs.VertexURBEntryReadOffset      = 0;
 985       vs.DispatchGRFStartRegisterForURBData =
 986          vs_prog_data->base.base.dispatch_grf_start_reg;
 987
 988 #if GEN_GEN >= 8
 989       vs.VertexURBEntryOutputReadOffset = get_urb_output_offset();
 990       vs.VertexURBEntryOutputLength     = get_urb_output_length(vs_bin);
 991
 992       vs.UserClipDistanceClipTestEnableBitmask =
 993          vs_prog_data->base.clip_distance_mask;
 994       vs.UserClipDistanceCullTestEnableBitmask =
 995          vs_prog_data->base.cull_distance_mask;
 996 #endif
 997
 998       vs.PerThreadScratchSpace   = get_scratch_space(vs_bin);
 999       vs.ScratchSpaceBasePointer =
1000          get_scratch_address(pipeline, MESA_SHADER_VERTEX, vs_bin);
1001    }
1002 }
1003
1004 static void
1005 emit_3dstate_hs_te_ds(struct anv_pipeline *pipeline)
1006 {
1007    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1008       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_HS), hs);
1009       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_TE), te);
1010       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_DS), ds);
1011       return;
1012    }
1013
1014    const struct gen_device_info *devinfo = &pipeline->device->info;
1015    const struct anv_shader_bin *tcs_bin =
1016       pipeline->shaders[MESA_SHADER_TESS_CTRL];
1017    const struct anv_shader_bin *tes_bin =
1018       pipeline->shaders[MESA_SHADER_TESS_EVAL];
1019
1020    const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
1021    const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
1022
1023    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_HS), hs) {
1024       hs.FunctionEnable = true;
1025       hs.StatisticsEnable = true;
1026       hs.KernelStartPointer = tcs_bin->kernel.offset;
1027
1028       hs.SamplerCount = get_sampler_count(tcs_bin);
1029       hs.BindingTableEntryCount = get_binding_table_entry_count(tcs_bin);
1030       hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
1031       hs.IncludeVertexHandles = true;
1032       hs.InstanceCount = tcs_prog_data->instances - 1;
1033
1034       hs.VertexURBEntryReadLength = 0;
1035       hs.VertexURBEntryReadOffset = 0;
1036       hs.DispatchGRFStartRegisterForURBData =
1037          tcs_prog_data->base.base.dispatch_grf_start_reg;
1038
1039       hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
1040       hs.ScratchSpaceBasePointer =
1041          get_scratch_address(pipeline, MESA_SHADER_TESS_CTRL, tcs_bin);
1042    }
1043
1044    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_TE), te) {
1045       te.Partitioning = tes_prog_data->partitioning;
1046       te.OutputTopology = tes_prog_data->output_topology;
1047       te.TEDomain = tes_prog_data->domain;
1048       te.TEEnable = true;
1049       te.MaximumTessellationFactorOdd = 63.0;
1050       te.MaximumTessellationFactorNotOdd = 64.0;
1051    }
1052
1053    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_DS), ds) {
1054       ds.FunctionEnable = true;
1055       ds.StatisticsEnable = true;
1056       ds.KernelStartPointer = tes_bin->kernel.offset;
1057
1058       ds.SamplerCount = get_sampler_count(tes_bin);
1059       ds.BindingTableEntryCount = get_binding_table_entry_count(tes_bin);
1060       ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
1061
1062       ds.ComputeWCoordinateEnable =
1063          tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
1064
1065       ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
1066       ds.PatchURBEntryReadOffset = 0;
1067       ds.DispatchGRFStartRegisterForURBData =
1068          tes_prog_data->base.base.dispatch_grf_start_reg;
1069
1070 #if GEN_GEN >= 8
1071       ds.VertexURBEntryOutputReadOffset = 1;
1072       ds.VertexURBEntryOutputLength =
1073          (tes_prog_data->base.vue_map.num_slots + 1) / 2 - 1;
1074
1075       ds.DispatchMode =
1076          tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
1077             DISPATCH_MODE_SIMD8_SINGLE_PATCH :
1078             DISPATCH_MODE_SIMD4X2;
1079
1080       ds.UserClipDistanceClipTestEnableBitmask =
1081          tes_prog_data->base.clip_distance_mask;
1082       ds.UserClipDistanceCullTestEnableBitmask =
1083          tes_prog_data->base.cull_distance_mask;
1084 #endif
1085
1086       ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
1087       ds.ScratchSpaceBasePointer =
1088          get_scratch_address(pipeline, MESA_SHADER_TESS_EVAL, tes_bin);
1089    }
1090 }
1091
1092 static void
1093 emit_3dstate_gs(struct anv_pipeline *pipeline)
1094 {
1095    const struct gen_device_info *devinfo = &pipeline->device->info;
1096    const struct anv_shader_bin *gs_bin =
1097       pipeline->shaders[MESA_SHADER_GEOMETRY];
1098
1099    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1100       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS), gs);
1101       return;
1102    }
1103
1104    const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
1105
1106    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS), gs) {
1107       gs.FunctionEnable          = true;
1108       gs.StatisticsEnable        = true;
1109       gs.KernelStartPointer      = gs_bin->kernel.offset;
1110       gs.DispatchMode            = gs_prog_data->base.dispatch_mode;
1111
1112       gs.SingleProgramFlow       = false;
1113       gs.VectorMaskEnable        = false;
1114       gs.SamplerCount            = get_sampler_count(gs_bin);
1115       gs.BindingTableEntryCount  = get_binding_table_entry_count(gs_bin);
1116       gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
1117       gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;
1118
1119       if (GEN_GEN == 8) {
1120          /* Broadwell is weird.  It needs us to divide by 2. */
1121          gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;
1122       } else {
1123          gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
1124       }
1125
1126       gs.OutputVertexSize        = gs_prog_data->output_vertex_size_hwords * 2 - 1;
1127       gs.OutputTopology          = gs_prog_data->output_topology;
1128       gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
1129       gs.ControlDataFormat       = gs_prog_data->control_data_format;
1130       gs.ControlDataHeaderSize   = gs_prog_data->control_data_header_size_hwords;
1131       gs.InstanceControl         = MAX2(gs_prog_data->invocations, 1) - 1;
1132 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1133       gs.ReorderMode             = TRAILING;
1134 #else
1135       gs.ReorderEnable           = true;
1136 #endif
1137
1138 #if GEN_GEN >= 8
1139       gs.ExpectedVertexCount     = gs_prog_data->vertices_in;
1140       gs.StaticOutput            = gs_prog_data->static_vertex_count >= 0;
1141       gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
1142                                    gs_prog_data->static_vertex_count : 0;
1143 #endif
1144
1145       gs.VertexURBEntryReadOffset = 0;
1146       gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
1147       gs.DispatchGRFStartRegisterForURBData =
1148          gs_prog_data->base.base.dispatch_grf_start_reg;
1149
1150 #if GEN_GEN >= 8
1151       gs.VertexURBEntryOutputReadOffset = get_urb_output_offset();
1152       gs.VertexURBEntryOutputLength     = get_urb_output_length(gs_bin);
1153
1154       gs.UserClipDistanceClipTestEnableBitmask =
1155          gs_prog_data->base.clip_distance_mask;
1156       gs.UserClipDistanceCullTestEnableBitmask =
1157          gs_prog_data->base.cull_distance_mask;
1158 #endif
1159
1160       gs.PerThreadScratchSpace   = get_scratch_space(gs_bin);
1161       gs.ScratchSpaceBasePointer =
1162          get_scratch_address(pipeline, MESA_SHADER_GEOMETRY, gs_bin);
1163    }
1164 }
1165
1166 static void
1167 emit_3dstate_wm(struct anv_pipeline *pipeline, struct anv_subpass *subpass,
1168                 const VkPipelineMultisampleStateCreateInfo *multisample)
1169 {
1170    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1171
1172    MAYBE_UNUSED uint32_t samples =
1173       multisample ? multisample->rasterizationSamples : 1;
1174
1175    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_WM), wm) {
1176       wm.StatisticsEnable                    = true;
1177       wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
1178       wm.LineAntialiasingRegionWidth         = _10pixels;
1179       wm.PointRasterizationRule              = RASTRULE_UPPER_RIGHT;
1180
1181       if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1182          if (wm_prog_data->early_fragment_tests) {
1183             wm.EarlyDepthStencilControl         = EDSC_PREPS;
1184          } else if (wm_prog_data->has_side_effects) {
1185             wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
1186          } else {
1187             wm.EarlyDepthStencilControl         = EDSC_NORMAL;
1188          }
1189
1190          wm.BarycentricInterpolationMode =
1191             wm_prog_data->barycentric_interp_modes;
1192
1193 #if GEN_GEN < 8
1194          /* FIXME: This needs a lot more work, cf gen7 upload_wm_state(). */
1195          wm.ThreadDispatchEnable          = true;
1196
1197          wm.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
1198          wm.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
1199          wm.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
1200          wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
1201
1202          /* If the subpass has a depth or stencil self-dependency, then we
1203           * need to force the hardware to do the depth/stencil write *after*
1204           * fragment shader execution.  Otherwise, the writes may hit memory
1205           * before we get around to fetching from the input attachment and we
1206           * may get the depth or stencil value from the current draw rather
1207           * than the previous one.
1208           */
1209          wm.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
1210                                             wm_prog_data->uses_kill;
1211
1212          if (samples > 1) {
1213             wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1214             if (wm_prog_data->persample_dispatch) {
1215                wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1216             } else {
1217                wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
1218             }
1219          } else {
1220             wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1221             wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1222          }
1223 #endif
1224       }
1225    }
1226 }
1227
1228 static inline bool
1229 is_dual_src_blend_factor(VkBlendFactor factor)
1230 {
1231    return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
1232           factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
1233           factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
1234           factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
1235 }
1236
1237 static void
1238 emit_3dstate_ps(struct anv_pipeline *pipeline,
1239                 const VkPipelineColorBlendStateCreateInfo *blend)
1240 {
1241    MAYBE_UNUSED const struct gen_device_info *devinfo = &pipeline->device->info;
1242    const struct anv_shader_bin *fs_bin =
1243       pipeline->shaders[MESA_SHADER_FRAGMENT];
1244
1245    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1246       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) {
1247 #if GEN_GEN == 7
1248          /* Even if no fragments are ever dispatched, gen7 hardware hangs if
1249           * we don't at least set the maximum number of threads.
1250           */
1251          ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
1252 #endif
1253       }
1254       return;
1255    }
1256
1257    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1258
1259 #if GEN_GEN < 8
1260    /* The hardware wedges if you have this bit set but don't turn on any dual
1261     * source blend factors.
1262     */
1263    bool dual_src_blend = false;
1264    if (wm_prog_data->dual_src_blend) {
1265       for (uint32_t i = 0; i < blend->attachmentCount; i++) {
1266          const VkPipelineColorBlendAttachmentState *bstate =
1267             &blend->pAttachments[i];
1268
1269          if (bstate->blendEnable &&
1270              (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
1271               is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
1272               is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
1273               is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
1274             dual_src_blend = true;
1275             break;
1276          }
1277       }
1278    }
1279 #endif
1280
1281    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) {
1282       ps.KernelStartPointer0        = fs_bin->kernel.offset;
1283       ps.KernelStartPointer1        = 0;
1284       ps.KernelStartPointer2        = fs_bin->kernel.offset +
1285                                       wm_prog_data->prog_offset_2;
1286       ps._8PixelDispatchEnable      = wm_prog_data->dispatch_8;
1287       ps._16PixelDispatchEnable     = wm_prog_data->dispatch_16;
1288       ps._32PixelDispatchEnable     = false;
1289
1290       ps.SingleProgramFlow          = false;
1291       ps.VectorMaskEnable           = true;
1292       ps.SamplerCount               = get_sampler_count(fs_bin);
1293       ps.BindingTableEntryCount     = get_binding_table_entry_count(fs_bin);
1294       ps.PushConstantEnable         = wm_prog_data->base.nr_params > 0;
1295       ps.PositionXYOffsetSelect     = wm_prog_data->uses_pos_offset ?
1296                                       POSOFFSET_SAMPLE: POSOFFSET_NONE;
1297 #if GEN_GEN < 8
1298       ps.AttributeEnable            = wm_prog_data->num_varying_inputs > 0;
1299       ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1300       ps.DualSourceBlendEnable      = dual_src_blend;
1301 #endif
1302
1303 #if GEN_IS_HASWELL
1304       /* Haswell requires the sample mask to be set in this packet as well
1305        * as in 3DSTATE_SAMPLE_MASK; the values should match.
1306        */
1307       ps.SampleMask                 = 0xff;
1308 #endif
1309
1310 #if GEN_GEN >= 9
1311       ps.MaximumNumberofThreadsPerPSD  = 64 - 1;
1312 #elif GEN_GEN >= 8
1313       ps.MaximumNumberofThreadsPerPSD  = 64 - 2;
1314 #else
1315       ps.MaximumNumberofThreads        = devinfo->max_wm_threads - 1;
1316 #endif
1317
1318       ps.DispatchGRFStartRegisterForConstantSetupData0 =
1319          wm_prog_data->base.dispatch_grf_start_reg;
1320       ps.DispatchGRFStartRegisterForConstantSetupData1 = 0;
1321       ps.DispatchGRFStartRegisterForConstantSetupData2 =
1322          wm_prog_data->dispatch_grf_start_reg_2;
1323
1324       ps.PerThreadScratchSpace   = get_scratch_space(fs_bin);
1325       ps.ScratchSpaceBasePointer =
1326          get_scratch_address(pipeline, MESA_SHADER_FRAGMENT, fs_bin);
1327    }
1328 }
1329
1330 #if GEN_GEN >= 8
1331 static void
1332 emit_3dstate_ps_extra(struct anv_pipeline *pipeline,
1333                       struct anv_subpass *subpass)
1334 {
1335    const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1336
1337    if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1338       anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_EXTRA), ps);
1339       return;
1340    }
1341
1342    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_EXTRA), ps) {
1343       ps.PixelShaderValid              = true;
1344       ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
1345       ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
1346       ps.PixelShaderIsPerSample        = wm_prog_data->persample_dispatch;
1347       ps.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
1348       ps.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
1349       ps.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
1350
1351       /* If the subpass has a depth or stencil self-dependency, then we need
1352        * to force the hardware to do the depth/stencil write *after* fragment
1353        * shader execution.  Otherwise, the writes may hit memory before we get
1354        * around to fetching from the input attachment and we may get the depth
1355        * or stencil value from the current draw rather than the previous one.
1356        */
1357       ps.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
1358                                          wm_prog_data->uses_kill;
1359
1360 #if GEN_GEN >= 9
1361       ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
1362       ps.InputCoverageMaskState  = wm_prog_data->uses_sample_mask ?
1363                                    ICMS_INNER_CONSERVATIVE : ICMS_NONE;
1364 #else
1365       ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
1366 #endif
1367    }
1368 }
1369
1370 static void
1371 emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
1372 {
1373    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
1374       vft.PrimitiveTopologyType = pipeline->topology;
1375    }
1376 }
1377 #endif
1378
1379 static VkResult
1380 genX(graphics_pipeline_create)(
1381     VkDevice                                    _device,
1382     struct anv_pipeline_cache *                 cache,
1383     const VkGraphicsPipelineCreateInfo*         pCreateInfo,
1384     const VkAllocationCallbacks*                pAllocator,
1385     VkPipeline*                                 pPipeline)
1386 {
1387    ANV_FROM_HANDLE(anv_device, device, _device);
1388    ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
1389    struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
1390    struct anv_pipeline *pipeline;
1391    VkResult result;
1392
1393    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
1394
1395    pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
1396                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1397    if (pipeline == NULL)
1398       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
1399
1400    result = anv_pipeline_init(pipeline, device, cache,
1401                               pCreateInfo, pAllocator);
1402    if (result != VK_SUCCESS) {
1403       vk_free2(&device->alloc, pAllocator, pipeline);
1404       return result;
1405    }
1406
1407    assert(pCreateInfo->pVertexInputState);
1408    emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
1409    assert(pCreateInfo->pRasterizationState);
1410    emit_rs_state(pipeline, pCreateInfo->pRasterizationState,
1411                  pCreateInfo->pMultisampleState, pass, subpass);
1412    emit_ms_state(pipeline, pCreateInfo->pMultisampleState);
1413    emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
1414    emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
1415                            pCreateInfo->pMultisampleState);
1416
1417    emit_urb_setup(pipeline);
1418
1419    emit_3dstate_clip(pipeline, pCreateInfo->pViewportState,
1420                      pCreateInfo->pRasterizationState);
1421    emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState);
1422
1423 #if 0
1424    /* From gen7_vs_state.c */
1425
1426    /**
1427     * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
1428     * Geometry > Geometry Shader > State:
1429     *
1430     *     "Note: Because of corruption in IVB:GT2, software needs to flush the
1431     *     whole fixed function pipeline when the GS enable changes value in
1432     *     the 3DSTATE_GS."
1433     *
1434     * The hardware architects have clarified that in this context "flush the
1435     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
1436     * Stall" bit set.
1437     */
1438    if (!brw->is_haswell && !brw->is_baytrail)
1439       gen7_emit_vs_workaround_flush(brw);
1440 #endif
1441
1442    emit_3dstate_vs(pipeline);
1443    emit_3dstate_hs_te_ds(pipeline);
1444    emit_3dstate_gs(pipeline);
1445    emit_3dstate_sbe(pipeline);
1446    emit_3dstate_wm(pipeline, subpass, pCreateInfo->pMultisampleState);
1447    emit_3dstate_ps(pipeline, pCreateInfo->pColorBlendState);
1448 #if GEN_GEN >= 8
1449    emit_3dstate_ps_extra(pipeline, subpass);
1450    emit_3dstate_vf_topology(pipeline);
1451 #endif
1452
1453    *pPipeline = anv_pipeline_to_handle(pipeline);
1454
1455    return VK_SUCCESS;
1456 }
1457
1458 static VkResult
1459 compute_pipeline_create(
1460     VkDevice                                    _device,
1461     struct anv_pipeline_cache *                 cache,
1462     const VkComputePipelineCreateInfo*          pCreateInfo,
1463     const VkAllocationCallbacks*                pAllocator,
1464     VkPipeline*                                 pPipeline)
1465 {
1466    ANV_FROM_HANDLE(anv_device, device, _device);
1467    const struct anv_physical_device *physical_device =
1468       &device->instance->physicalDevice;
1469    const struct gen_device_info *devinfo = &physical_device->info;
1470    struct anv_pipeline *pipeline;
1471    VkResult result;
1472
1473    assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
1474
1475    pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
1476                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1477    if (pipeline == NULL)
1478       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
1479
1480    pipeline->device = device;
1481    pipeline->layout = anv_pipeline_layout_from_handle(pCreateInfo->layout);
1482
1483    pipeline->blend_state.map = NULL;
1484
1485    result = anv_reloc_list_init(&pipeline->batch_relocs,
1486                                 pAllocator ? pAllocator : &device->alloc);
1487    if (result != VK_SUCCESS) {
1488       vk_free2(&device->alloc, pAllocator, pipeline);
1489       return result;
1490    }
1491    pipeline->batch.next = pipeline->batch.start = pipeline->batch_data;
1492    pipeline->batch.end = pipeline->batch.start + sizeof(pipeline->batch_data);
1493    pipeline->batch.relocs = &pipeline->batch_relocs;
1494
1495    /* When we free the pipeline, we detect stages based on the NULL status
1496     * of various prog_data pointers.  Make them NULL by default.
1497     */
1498    memset(pipeline->shaders, 0, sizeof(pipeline->shaders));
1499
1500    pipeline->active_stages = 0;
1501
1502    pipeline->needs_data_cache = false;
1503
1504    assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
1505    ANV_FROM_HANDLE(anv_shader_module, module,  pCreateInfo->stage.module);
1506    result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
1507                                     pCreateInfo->stage.pName,
1508                                     pCreateInfo->stage.pSpecializationInfo);
1509    if (result != VK_SUCCESS) {
1510       vk_free2(&device->alloc, pAllocator, pipeline);
1511       return result;
1512    }
1513
1514    const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
1515
1516    anv_pipeline_setup_l3_config(pipeline, cs_prog_data->base.total_shared > 0);
1517
1518    uint32_t group_size = cs_prog_data->local_size[0] *
1519       cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
1520    uint32_t remainder = group_size & (cs_prog_data->simd_size - 1);
1521
1522    if (remainder > 0)
1523       pipeline->cs_right_mask = ~0u >> (32 - remainder);
1524    else
1525       pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size);
1526
1527    const uint32_t vfe_curbe_allocation =
1528       ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
1529             cs_prog_data->push.cross_thread.regs, 2);
1530
1531    const uint32_t subslices = MAX2(physical_device->subslice_total, 1);
1532
1533    const struct anv_shader_bin *cs_bin =
1534       pipeline->shaders[MESA_SHADER_COMPUTE];
1535
1536    anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE), vfe) {
1537 #if GEN_GEN > 7
1538       vfe.StackSize              = 0;
1539 #else
1540       vfe.GPGPUMode              = true;
1541 #endif
1542       vfe.MaximumNumberofThreads =
1543          devinfo->max_cs_threads * subslices - 1;
1544       vfe.NumberofURBEntries     = GEN_GEN <= 7 ? 0 : 2;
1545       vfe.ResetGatewayTimer      = true;
1546 #if GEN_GEN <= 8
1547       vfe.BypassGatewayControl   = true;
1548 #endif
1549       vfe.URBEntryAllocationSize = GEN_GEN <= 7 ? 0 : 2;
1550       vfe.CURBEAllocationSize    = vfe_curbe_allocation;
1551
1552       vfe.PerThreadScratchSpace = get_scratch_space(cs_bin);
1553       vfe.ScratchSpaceBasePointer =
1554          get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin);
1555    }
1556
1557    struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
1558       .KernelStartPointer     = cs_bin->kernel.offset,
1559
1560       .SamplerCount           = get_sampler_count(cs_bin),
1561       .BindingTableEntryCount = get_binding_table_entry_count(cs_bin),
1562       .BarrierEnable          = cs_prog_data->uses_barrier,
1563       .SharedLocalMemorySize  =
1564          encode_slm_size(GEN_GEN, cs_prog_data->base.total_shared),
1565
1566 #if !GEN_IS_HASWELL
1567       .ConstantURBEntryReadOffset = 0,
1568 #endif
1569       .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
1570 #if GEN_GEN >= 8 || GEN_IS_HASWELL
1571       .CrossThreadConstantDataReadLength =
1572          cs_prog_data->push.cross_thread.regs,
1573 #endif
1574
1575       .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
1576    };
1577    GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
1578                                         pipeline->interface_descriptor_data,
1579                                         &desc);
1580
1581    *pPipeline = anv_pipeline_to_handle(pipeline);
1582
1583    return VK_SUCCESS;
1584 }
1585
1586 VkResult genX(CreateGraphicsPipelines)(
1587     VkDevice                                    _device,
1588     VkPipelineCache                             pipelineCache,
1589     uint32_t                                    count,
1590     const VkGraphicsPipelineCreateInfo*         pCreateInfos,
1591     const VkAllocationCallbacks*                pAllocator,
1592     VkPipeline*                                 pPipelines)
1593 {
1594    ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
1595
1596    VkResult result = VK_SUCCESS;
1597
1598    unsigned i;
1599    for (i = 0; i < count; i++) {
1600       result = genX(graphics_pipeline_create)(_device,
1601                                               pipeline_cache,
1602                                               &pCreateInfos[i],
1603                                               pAllocator, &pPipelines[i]);
1604
1605       /* Bail out on the first error as it is not obvious what error should be
1606        * report upon 2 different failures. */
1607       if (result != VK_SUCCESS)
1608          break;
1609    }
1610
1611    for (; i < count; i++)
1612       pPipelines[i] = VK_NULL_HANDLE;
1613
1614    return result;
1615 }
1616
1617 VkResult genX(CreateComputePipelines)(
1618     VkDevice                                    _device,
1619     VkPipelineCache                             pipelineCache,
1620     uint32_t                                    count,
1621     const VkComputePipelineCreateInfo*          pCreateInfos,
1622     const VkAllocationCallbacks*                pAllocator,
1623     VkPipeline*                                 pPipelines)
1624 {
1625    ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
1626
1627    VkResult result = VK_SUCCESS;
1628
1629    unsigned i;
1630    for (i = 0; i < count; i++) {
1631       result = compute_pipeline_create(_device, pipeline_cache,
1632                                        &pCreateInfos[i],
1633                                        pAllocator, &pPipelines[i]);
1634
1635       /* Bail out on the first error as it is not obvious what error should be
1636        * report upon 2 different failures. */
1637       if (result != VK_SUCCESS)
1638          break;
1639    }
1640
1641    for (; i < count; i++)
1642       pPipelines[i] = VK_NULL_HANDLE;
1643
1644    return result;
1645 }