src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp

   1 /*
   2  * Copyright © 2013 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  21  * DEALINGS IN THE SOFTWARE.
  22  */
  23
  24 /**
  25  * \file brw_vec4_gs_visitor.cpp
  26  *
  27  * Geometry-shader-specific code derived from the vec4_visitor class.
  28  */
  29
  30 #include "brw_vec4_gs_visitor.h"
  31
/* Upper bound on the number of input vertices handled by this file: it is
 * asserted against the program's VerticesIn in setup_varying_inputs() and
 * sizes the attribute map built in setup_payload().
 */
const unsigned MAX_GS_INPUT_VERTICES = 6;
  33
  34 namespace brw {
  35
  36 vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw,
  37                                  struct brw_gs_compile *c,
  38                                  struct gl_shader_program *prog,
  39                                  struct brw_shader *shader,
  40                                  void *mem_ctx,
  41                                  bool no_spills)
  42    : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base,
  43                   &c->prog_data.base, prog, shader, mem_ctx,
  44                   INTEL_DEBUG & DEBUG_GS, no_spills),
  45      c(c)
  46 {
  47 }
  48
  49
  50 dst_reg *
  51 vec4_gs_visitor::make_reg_for_system_value(ir_variable *ir)
  52 {
  53    /* Geometry shaders don't use any system values. */
  54    assert(!"Unreached");
  55    return NULL;
  56 }
  57
  58
  59 int
  60 vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
  61                                       int attributes_per_reg)
  62 {
  63    /* For geometry shaders there are N copies of the input attributes, where N
  64     * is the number of input vertices.  attribute_map[BRW_VARYING_SLOT_COUNT *
  65     * i + j] represents attribute j for vertex i.
  66     *
  67     * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
  68     * so the total number of input slots that will be delivered to the GS (and
  69     * thus the stride of the input arrays) is urb_read_length * 2.
  70     */
  71    const unsigned num_input_vertices = c->gp->program.VerticesIn;
  72    assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
  73    unsigned input_array_stride = c->prog_data.base.urb_read_length * 2;
  74
  75    for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
  76       int varying = c->input_vue_map.slot_to_varying[slot];
  77       for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
  78          attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
  79             attributes_per_reg * payload_reg + input_array_stride * vertex +
  80             slot;
  81       }
  82    }
  83
  84    int regs_used = ALIGN(input_array_stride * num_input_vertices,
  85                          attributes_per_reg) / attributes_per_reg;
  86    return payload_reg + regs_used;
  87 }
  88
  89
  90 void
  91 vec4_gs_visitor::setup_payload()
  92 {
  93    int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
  94
  95    /* If we are in dual instanced mode, then attributes are going to be
  96     * interleaved, so one register contains two attribute slots.
  97     */
  98    int attributes_per_reg = c->prog_data.dual_instanced_dispatch ? 2 : 1;
  99
 100    /* If a geometry shader tries to read from an input that wasn't written by
 101     * the vertex shader, that produces undefined results, but it shouldn't
 102     * crash anything.  So initialize attribute_map to zeros--that ensures that
 103     * these undefined results are read from r0.
 104     */
 105    memset(attribute_map, 0, sizeof(attribute_map));
 106
 107    int reg = 0;
 108
 109    /* The payload always contains important data in r0, which contains
 110     * the URB handles that are passed on to the URB write at the end
 111     * of the thread.
 112     */
 113    reg++;
 114
 115    /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
 116    if (c->prog_data.include_primitive_id)
 117       attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
 118
 119    reg = setup_uniforms(reg);
 120
 121    reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
 122
 123    lower_attributes_to_hw_regs(attribute_map,
 124                                c->prog_data.dual_instanced_dispatch);
 125
 126    this->first_non_payload_grf = reg;
 127 }
 128
 129
 130 void
 131 vec4_gs_visitor::emit_prolog()
 132 {
 133    /* In vertex shaders, r0.2 is guaranteed to be initialized to zero.  In
 134     * geometry shaders, it isn't (it contains a bunch of information we don't
 135     * need, like the input primitive type).  We need r0.2 to be zero in order
 136     * to build scratch read/write messages correctly (otherwise this value
 137     * will be interpreted as a global offset, causing us to do our scratch
 138     * reads/writes to garbage memory).  So just set it to zero at the top of
 139     * the shader.
 140     */
 141    this->current_annotation = "clear r0.2";
 142    dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
 143    vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2_IMMED, r0, 0u);
 144    inst->force_writemask_all = true;
 145
 146    /* Create a virtual register to hold the vertex count */
 147    this->vertex_count = src_reg(this, glsl_type::uint_type);
 148
 149    /* Initialize the vertex_count register to 0 */
 150    this->current_annotation = "initialize vertex_count";
 151    inst = emit(MOV(dst_reg(this->vertex_count), 0u));
 152    inst->force_writemask_all = true;
 153
 154    if (c->control_data_header_size_bits > 0) {
 155       /* Create a virtual register to hold the current set of control data
 156        * bits.
 157        */
 158       this->control_data_bits = src_reg(this, glsl_type::uint_type);
 159
 160       /* If we're outputting more than 32 control data bits, then EmitVertex()
 161        * will set control_data_bits to 0 after emitting the first vertex.
 162        * Otherwise, we need to initialize it to 0 here.
 163        */
 164       if (c->control_data_header_size_bits <= 32) {
 165          this->current_annotation = "initialize control data bits";
 166          inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
 167          inst->force_writemask_all = true;
 168       }
 169    }
 170
 171    /* If the geometry shader uses the gl_PointSize input, we need to fix it up
 172     * to account for the fact that the vertex shader stored it in the w
 173     * component of VARYING_SLOT_PSIZ.
 174     */
 175    if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
 176       this->current_annotation = "swizzle gl_PointSize input";
 177       for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
 178          dst_reg dst(ATTR,
 179                      BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
 180          dst.type = BRW_REGISTER_TYPE_F;
 181          src_reg src(dst);
 182          dst.writemask = WRITEMASK_X;
 183          src.swizzle = BRW_SWIZZLE_WWWW;
 184          inst = emit(MOV(dst, src));
 185
 186          /* In dual instanced dispatch mode, dst has a width of 4, so we need
 187           * to make sure the MOV happens regardless of which channels are
 188           * enabled.
 189           */
 190          inst->force_writemask_all = true;
 191       }
 192    }
 193
 194    this->current_annotation = NULL;
 195 }
 196
 197
 198 void
 199 vec4_gs_visitor::emit_program_code()
 200 {
 201    /* We don't support NV_geometry_program4. */
 202    assert(!"Unreached");
 203 }
 204
 205
 206 void
 207 vec4_gs_visitor::emit_thread_end()
 208 {
 209    if (c->control_data_header_size_bits > 0) {
 210       /* During shader execution, we only ever call emit_control_data_bits()
 211        * just prior to outputting a vertex.  Therefore, the control data bits
 212        * corresponding to the most recently output vertex still need to be
 213        * emitted.
 214        */
 215       current_annotation = "thread end: emit control data bits";
 216       emit_control_data_bits();
 217    }
 218
 219    /* MRF 0 is reserved for the debugger, so start with message header
 220     * in MRF 1.
 221     */
 222    int base_mrf = 1;
 223
 224    current_annotation = "thread end";
 225    dst_reg mrf_reg(MRF, base_mrf);
 226    src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 227    vec4_instruction *inst = emit(MOV(mrf_reg, r0));
 228    inst->force_writemask_all = true;
 229    emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
 230    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
 231       emit_shader_time_end();
 232    inst = emit(GS_OPCODE_THREAD_END);
 233    inst->base_mrf = base_mrf;
 234    inst->mlen = 1;
 235 }
 236
 237
 238 void
 239 vec4_gs_visitor::emit_urb_write_header(int mrf)
 240 {
 241    /* The SEND instruction that writes the vertex data to the VUE will use
 242     * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
 243     * header specify an offset (in multiples of 256 bits) into the URB entry
 244     * at which the write should take place.
 245     *
 246     * So we have to prepare a message header with the appropriate offset
 247     * values.
 248     */
 249    dst_reg mrf_reg(MRF, mrf);
 250    src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 251    this->current_annotation = "URB write header";
 252    vec4_instruction *inst = emit(MOV(mrf_reg, r0));
 253    inst->force_writemask_all = true;
 254    emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
 255         (uint32_t) c->prog_data.output_vertex_size_hwords);
 256 }
 257
 258
 259 vec4_instruction *
 260 vec4_gs_visitor::emit_urb_write_opcode(bool complete)
 261 {
 262    /* We don't care whether the vertex is complete, because in general
 263     * geometry shaders output multiple vertices, and we don't terminate the
 264     * thread until all vertices are complete.
 265     */
 266    (void) complete;
 267
 268    vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
 269    inst->offset = c->prog_data.control_data_header_size_hwords;
 270    inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
 271    return inst;
 272 }
 273
 274
 275 int
 276 vec4_gs_visitor::compute_array_stride(ir_dereference_array *ir)
 277 {
 278    /* Geometry shader inputs are arrays, but they use an unusual array layout:
 279     * instead of all array elements for a given geometry shader input being
 280     * stored consecutively, all geometry shader inputs are interleaved into
 281     * one giant array.  At this stage of compilation, we assume that the
 282     * stride of the array is BRW_VARYING_SLOT_COUNT.  Later,
 283     * setup_attributes() will remap our accesses to the actual input array.
 284     */
 285    ir_dereference_variable *deref_var = ir->array->as_dereference_variable();
 286    if (deref_var && deref_var->var->data.mode == ir_var_shader_in)
 287       return BRW_VARYING_SLOT_COUNT;
 288    else
 289       return vec4_visitor::compute_array_stride(ir);
 290 }
 291
 292
 293 /**
 294  * Write out a batch of 32 control data bits from the control_data_bits
 295  * register to the URB.
 296  *
 297  * The current value of the vertex_count register determines which DWORD in
 298  * the URB receives the control data bits.  The control_data_bits register is
 299  * assumed to contain the correct data for the vertex that was most recently
 300  * output, and all previous vertices that share the same DWORD.
 301  *
 302  * This function takes care of ensuring that if no vertices have been output
 303  * yet, no control bits are emitted.
 304  */
 305 void
 306 vec4_gs_visitor::emit_control_data_bits()
 307 {
 308    assert(c->control_data_bits_per_vertex != 0);
 309
 310    /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
 311     * granularity, we need to use two tricks to ensure that the batch of 32
 312     * control data bits is written to the appropriate DWORD in the URB.  To
 313     * select which vec4 we are writing to, we use the "slot {0,1} offset"
 314     * fields of the message header.  To select which DWORD in the vec4 we are
 315     * writing to, we use the channel mask fields of the message header.  To
 316     * avoid penalizing geometry shaders that emit a small number of vertices
 317     * with extra bookkeeping, we only do each of these tricks when
 318     * c->prog_data.control_data_header_size_bits is large enough to make it
 319     * necessary.
 320     *
 321     * Note: this means that if we're outputting just a single DWORD of control
 322     * data bits, we'll actually replicate it four times since we won't do any
 323     * channel masking.  But that's not a problem since in this case the
 324     * hardware only pays attention to the first DWORD.
 325     */
 326    enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
 327    if (c->control_data_header_size_bits > 32)
 328       urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
 329    if (c->control_data_header_size_bits > 128)
 330       urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
 331
 332    /* If vertex_count is 0, then no control data bits have been accumulated
 333     * yet, so we should do nothing.
 334     */
 335    emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
 336    emit(IF(BRW_PREDICATE_NORMAL));
 337    {
 338       /* If we are using either channel masks or a per-slot offset, then we
 339        * need to figure out which DWORD we are trying to write to, using the
 340        * formula:
 341        *
 342        *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
 343        *
 344        * Since bits_per_vertex is a power of two, and is known at compile
 345        * time, this can be optimized to:
 346        *
 347        *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
 348        */
 349       src_reg dword_index(this, glsl_type::uint_type);
 350       if (urb_write_flags) {
 351          src_reg prev_count(this, glsl_type::uint_type);
 352          emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
 353          unsigned log2_bits_per_vertex =
 354             _mesa_fls(c->control_data_bits_per_vertex);
 355          emit(SHR(dst_reg(dword_index), prev_count,
 356                   (uint32_t) (6 - log2_bits_per_vertex)));
 357       }
 358
 359       /* Start building the URB write message.  The first MRF gets a copy of
 360        * R0.
 361        */
 362       int base_mrf = 1;
 363       dst_reg mrf_reg(MRF, base_mrf);
 364       src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
 365       vec4_instruction *inst = emit(MOV(mrf_reg, r0));
 366       inst->force_writemask_all = true;
 367
 368       if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
 369          /* Set the per-slot offset to dword_index / 4, to that we'll write to
 370           * the appropriate OWORD within the control data header.
 371           */
 372          src_reg per_slot_offset(this, glsl_type::uint_type);
 373          emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
 374          emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
 375       }
 376
 377       if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
 378          /* Set the channel masks to 1 << (dword_index % 4), so that we'll
 379           * write to the appropriate DWORD within the OWORD.  We need to do
 380           * this computation with force_writemask_all, otherwise garbage data
 381           * from invocation 0 might clobber the mask for invocation 1 when
 382           * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
 383           * together.
 384           */
 385          src_reg channel(this, glsl_type::uint_type);
 386          inst = emit(AND(dst_reg(channel), dword_index, 3u));
 387          inst->force_writemask_all = true;
 388          src_reg one(this, glsl_type::uint_type);
 389          inst = emit(MOV(dst_reg(one), 1u));
 390          inst->force_writemask_all = true;
 391          src_reg channel_mask(this, glsl_type::uint_type);
 392          inst = emit(SHL(dst_reg(channel_mask), one, channel));
 393          inst->force_writemask_all = true;
 394          emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask));
 395          emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
 396       }
 397
 398       /* Store the control data bits in the message payload and send it. */
 399       dst_reg mrf_reg2(MRF, base_mrf + 1);
 400       inst = emit(MOV(mrf_reg2, this->control_data_bits));
 401       inst->force_writemask_all = true;
 402       inst = emit(GS_OPCODE_URB_WRITE);
 403       inst->urb_write_flags = urb_write_flags;
 404       inst->base_mrf = base_mrf;
 405       inst->mlen = 2;
 406    }
 407    emit(BRW_OPCODE_ENDIF);
 408 }
 409
 410
 411 void
 412 vec4_gs_visitor::visit(ir_emit_vertex *)
 413 {
 414    this->current_annotation = "emit vertex: safety check";
 415
 416    /* To ensure that we don't output more vertices than the shader specified
 417     * using max_vertices, do the logic inside a conditional of the form "if
 418     * (vertex_count < MAX)"
 419     */
 420    unsigned num_output_vertices = c->gp->program.VerticesOut;
 421    emit(CMP(dst_null_d(), this->vertex_count,
 422             src_reg(num_output_vertices), BRW_CONDITIONAL_L));
 423    emit(IF(BRW_PREDICATE_NORMAL));
 424    {
 425       /* If we're outputting 32 control data bits or less, then we can wait
 426        * until the shader is over to output them all.  Otherwise we need to
 427        * output them as we go.  Now is the time to do it, since we're about to
 428        * output the vertex_count'th vertex, so it's guaranteed that the
 429        * control data bits associated with the (vertex_count - 1)th vertex are
 430        * correct.
 431        */
 432       if (c->control_data_header_size_bits > 32) {
 433          this->current_annotation = "emit vertex: emit control data bits";
 434          /* Only emit control data bits if we've finished accumulating a batch
 435           * of 32 bits.  This is the case when:
 436           *
 437           *     (vertex_count * bits_per_vertex) % 32 == 0
 438           *
 439           * (in other words, when the last 5 bits of vertex_count *
 440           * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
 441           * integer n (which is always the case, since bits_per_vertex is
 442           * always 1 or 2), this is equivalent to requiring that the last 5-n
 443           * bits of vertex_count are 0:
 444           *
 445           *     vertex_count & (2^(5-n) - 1) == 0
 446           *
 447           * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
 448           * equivalent to:
 449           *
 450           *     vertex_count & (32 / bits_per_vertex - 1) == 0
 451           */
 452          vec4_instruction *inst =
 453             emit(AND(dst_null_d(), this->vertex_count,
 454                      (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
 455          inst->conditional_mod = BRW_CONDITIONAL_Z;
 456          emit(IF(BRW_PREDICATE_NORMAL));
 457          {
 458             emit_control_data_bits();
 459
 460             /* Reset control_data_bits to 0 so we can start accumulating a new
 461              * batch.
 462              *
 463              * Note: in the case where vertex_count == 0, this neutralizes the
 464              * effect of any call to EndPrimitive() that the shader may have
 465              * made before outputting its first vertex.
 466              */
 467             inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
 468             inst->force_writemask_all = true;
 469          }
 470          emit(BRW_OPCODE_ENDIF);
 471       }
 472
 473       this->current_annotation = "emit vertex: vertex data";
 474       emit_vertex();
 475
 476       this->current_annotation = "emit vertex: increment vertex count";
 477       emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
 478                src_reg(1u)));
 479    }
 480    emit(BRW_OPCODE_ENDIF);
 481
 482    this->current_annotation = NULL;
 483 }
 484
 485 void
 486 vec4_gs_visitor::visit(ir_end_primitive *)
 487 {
 488    /* We can only do EndPrimitive() functionality when the control data
 489     * consists of cut bits.  Fortunately, the only time it isn't is when the
 490     * output type is points, in which case EndPrimitive() is a no-op.
 491     */
 492    if (c->prog_data.control_data_format !=
 493        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
 494       return;
 495    }
 496
 497    /* Cut bits use one bit per vertex. */
 498    assert(c->control_data_bits_per_vertex == 1);
 499
 500    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
 501     * vertex n, 0 otherwise.  So all we need to do here is mark bit
 502     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
 503     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
 504     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
 505     *
 506     * Note that if EndPrimitve() is called before emitting any vertices, this
 507     * will cause us to set bit 31 of the control_data_bits register to 1.
 508     * That's fine because:
 509     *
 510     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
 511     *   output, so the hardware will ignore cut bit 31.
 512     *
 513     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
 514     *   last vertex, so setting cut bit 31 has no effect (since the primitive
 515     *   is automatically ended when the GS terminates).
 516     *
 517     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
 518     *   control_data_bits register to 0 when the first vertex is emitted.
 519     */
 520
 521    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
 522    src_reg one(this, glsl_type::uint_type);
 523    emit(MOV(dst_reg(one), 1u));
 524    src_reg prev_count(this, glsl_type::uint_type);
 525    emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
 526    src_reg mask(this, glsl_type::uint_type);
 527    /* Note: we're relying on the fact that the GEN SHL instruction only pays
 528     * attention to the lower 5 bits of its second source argument, so on this
 529     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
 530     * ((vertex_count - 1) % 32).
 531     */
 532    emit(SHL(dst_reg(mask), one, prev_count));
 533    emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
 534 }
 535
 536
 537 extern "C" const unsigned *
 538 brw_gs_emit(struct brw_context *brw,
 539             struct gl_shader_program *prog,
 540             struct brw_gs_compile *c,
 541             void *mem_ctx,
 542             unsigned *final_assembly_size)
 543 {
 544    struct brw_shader *shader =
 545       (brw_shader *) prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
 546
 547    if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
 548       printf("GLSL IR for native geometry shader %d:\n", prog->Name);
 549       _mesa_print_ir(shader->ir, NULL);
 550       printf("\n\n");
 551    }
 552
 553    /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
 554     * so without spilling.
 555     */
 556    if (likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
 557       c->prog_data.dual_instanced_dispatch = false;
 558
 559       vec4_gs_visitor v(brw, c, prog, shader, mem_ctx, true /* no_spills */);
 560       if (v.run()) {
 561          vec4_generator g(brw, prog, &c->gp->program.Base, &c->prog_data.base,
 562                           mem_ctx, INTEL_DEBUG & DEBUG_GS);
 563          const unsigned *generated =
 564             g.generate_assembly(&v.instructions, final_assembly_size);
 565
 566          return generated;
 567       }
 568    }
 569
 570    /* Either we failed to compile in DUAL_OBJECT mode (probably because it
 571     * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
 572     * back to DUAL_INSTANCED mode, which consumes fewer registers.
 573     *
 574     * FIXME: In an ideal world we'd fall back to SINGLE mode, which would
 575     * allow us to interleave general purpose registers (resulting in even less
 576     * likelihood of spilling).  But at the moment, the vec4 generator and
 577     * visitor classes don't have the infrastructure to interleave general
 578     * purpose registers, so DUAL_INSTANCED is the best we can do.
 579     */
 580    c->prog_data.dual_instanced_dispatch = true;
 581
 582    vec4_gs_visitor v(brw, c, prog, shader, mem_ctx, false /* no_spills */);
 583    if (!v.run()) {
 584       prog->LinkStatus = false;
 585       ralloc_strcat(&prog->InfoLog, v.fail_msg);
 586       return NULL;
 587    }
 588
 589    vec4_generator g(brw, prog, &c->gp->program.Base, &c->prog_data.base,
 590                     mem_ctx, INTEL_DEBUG & DEBUG_GS);
 591    const unsigned *generated =
 592       g.generate_assembly(&v.instructions, final_assembly_size);
 593
 594    return generated;
 595 }
 596
 597
 598 } /* namespace brw */