src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp

   1 /*
   2  * Copyright © 2014 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * This code is based on original work by Ilia Mirkin.
  24  */
  25
  26 /**
  27  * \file gen6_gs_visitor.cpp
  28  *
  29  * Gen6 geometry shader implementation
  30  */
  31
  32 #include "gen6_gs_visitor.h"
  33
  34 namespace brw {
  35
  36 void
  37 gen6_gs_visitor::emit_prolog()
  38 {
  39    vec4_gs_visitor::emit_prolog();
  40
  41    /* Gen6 geometry shaders require to allocate an initial VUE handle via
  42     * FF_SYNC message, however the documentation remarks that only one thread
  43     * can write to the URB simultaneously and the FF_SYNC message provides the
  44     * synchronization mechanism for this, so using this message effectively
  45     * stalls the thread until it is its turn to write to the URB. Because of
  46     * this, the best way to implement geometry shader algorithms in gen6 is to
  47     * execute the algorithm before the FF_SYNC message to maximize parallelism.
  48     *
  49     * To achieve this we buffer the geometry shader outputs for each emitted
  50     * vertex in vertex_output during operation. Then, when we have processed
  51     * the last vertex (that is, at thread end time), we send the FF_SYNC
  52     * message to allocate the initial VUE handle and write all buffered vertex
  53     * data to the URB in one go.
  54     *
  55     * For each emitted vertex, vertex_output will hold vue_map.num_slots
  56     * data items plus one additional item to hold required flags
  57     * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
  58     * which come right after the data items for that vertex. Vertex data and
  59     * flags for the next vertex come right after the data items and flags for
  60     * the previous vertex.
  61     */
  62    this->current_annotation = "gen6 prolog";
  63    this->vertex_output = src_reg(this,
  64                                  glsl_type::uint_type,
  65                                  (prog_data->vue_map.num_slots + 1) *
  66                                  c->gp->program.VerticesOut);
  67    this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
  68    emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
  69
  70    /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
  71     * so initialize it once to R0.
  72     */
  73    vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
  74                                      retype(brw_vec8_grf(0, 0),
  75                                             BRW_REGISTER_TYPE_UD)));
  76    inst->force_writemask_all = true;
  77
  78    /* This will be used as a temporary to store writeback data of FF_SYNC
  79     * and URB_WRITE messages.
  80     */
  81    this->temp = src_reg(this, glsl_type::uint_type);
  82
  83    /* This will be used to know when we are processing the first vertex of
  84     * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
  85     * that we are processing the first vertex in the primitive and to zero
  86     * otherwise. This way we can use its value directly in the URB write
  87     * headers.
  88     */
  89    this->first_vertex = src_reg(this, glsl_type::uint_type);
  90    emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
  91
  92    /* The FF_SYNC message requires to know the number of primitives generated,
  93     * so keep a counter for this.
  94     */
  95    this->prim_count = src_reg(this, glsl_type::uint_type);
  96    emit(MOV(dst_reg(this->prim_count), 0u));
  97 }
  98
  99 void
 100 gen6_gs_visitor::visit(ir_emit_vertex *)
 101 {
 102    this->current_annotation = "gen6 emit vertex";
 103    /* Honor max_vertex layout indication in geometry shader by ignoring any
 104     * vertices coming after c->gp->program.VerticesOut.
 105     */
 106    unsigned num_output_vertices = c->gp->program.VerticesOut;
 107    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
 108             BRW_CONDITIONAL_L));
 109    emit(IF(BRW_PREDICATE_NORMAL));
 110    {
 111       /* Buffer all output slots for this vertex in vertex_output */
 112       for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
 113          /* We will handle PSIZ for each vertex at thread end time since it
 114           * is not computed by the GS algorithm and requires specific handling.
 115           */
 116          int varying = prog_data->vue_map.slot_to_varying[slot];
 117          if (varying != VARYING_SLOT_PSIZ) {
 118             dst_reg dst(this->vertex_output);
 119             dst.reladdr = ralloc(mem_ctx, src_reg);
 120             memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 121             emit_urb_slot(dst, varying);
 122          }
 123          emit(ADD(dst_reg(this->vertex_output_offset),
 124                   this->vertex_output_offset, 1u));
 125       }
 126
 127       /* Now buffer flags for this vertex */
 128       dst_reg dst(this->vertex_output);
 129       dst.reladdr = ralloc(mem_ctx, src_reg);
 130       memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 131       if (c->gp->program.OutputType == GL_POINTS) {
 132          /* If we are outputting points, then every vertex has PrimStart and
 133           * PrimEnd set.
 134           */
 135          emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
 136                   URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
 137          emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
 138       } else {
 139          /* Otherwise, we can only set the PrimStart flag, which we have stored
 140           * in the first_vertex register. We will have to wait until we execute
 141           * EndPrimitive() or we end the thread to set the PrimEnd flag on a
 142           * vertex.
 143           */
 144          emit(OR(dst, this->first_vertex,
 145                  (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
 146          emit(MOV(dst_reg(this->first_vertex), 0u));
 147       }
 148       emit(ADD(dst_reg(this->vertex_output_offset),
 149                this->vertex_output_offset, 1u));
 150
 151       /* Update vertex count */
 152       emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
 153    }
 154    emit(BRW_OPCODE_ENDIF);
 155 }
 156
 157 void
 158 gen6_gs_visitor::visit(ir_end_primitive *)
 159 {
 160    this->current_annotation = "gen6 end primitive";
 161    /* Calling EndPrimitive() is optional for point output. In this case we set
 162     * the PrimEnd flag when we process EmitVertex().
 163     */
 164    if (c->gp->program.OutputType == GL_POINTS)
 165       return;
 166
 167    /* Otherwise we know that the last vertex we have processed was the last
 168     * vertex in the primitive and we need to set its PrimEnd flag, so do this
 169     * unless we haven't emitted that vertex at all.
 170     *
 171     * Notice that we have already incremented vertex_count when we processed
 172     * the last emit_vertex, so we need to take that into account in the
 173     * comparison below (hence the num_output_vertices + 1 in the comparison
 174     * below).
 175     */
 176    unsigned num_output_vertices = c->gp->program.VerticesOut;
 177    emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
 178             BRW_CONDITIONAL_L));
 179    emit(IF(BRW_PREDICATE_NORMAL));
 180    {
 181       /* vertex_output_offset is already pointing at the first entry of the
 182        * next vertex. So subtract 1 to modify the flags for the previous
 183        * vertex.
 184        */
 185       src_reg offset(this, glsl_type::uint_type);
 186       emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
 187
 188       src_reg dst(this->vertex_output);
 189       dst.reladdr = ralloc(mem_ctx, src_reg);
 190       memcpy(dst.reladdr, &offset, sizeof(src_reg));
 191
 192       emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
 193       emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
 194
 195       /* Set the first vertex flag to indicate that the next vertex will start
 196        * a primitive.
 197        */
 198       emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
 199    }
 200    emit(BRW_OPCODE_ENDIF);
 201 }
 202
 203 void
 204 gen6_gs_visitor::emit_urb_write_header(int mrf)
 205 {
 206    this->current_annotation = "gen6 urb header";
 207    /* Compute offset of the flags for the current vertex in vertex_output and
 208     * write them in dw2 of the message header.
 209     *
 210     * Notice that by the time that emit_thread_end() calls here
 211     * vertex_output_offset should point to the first data item of the current
 212     * vertex in vertex_output, thus we only need to add the number of output
 213     * slots per vertex to that offset to obtain the flags data offset.
 214     */
 215    src_reg flags_offset(this, glsl_type::uint_type);
 216    emit(ADD(dst_reg(flags_offset),
 217             this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
 218
 219    src_reg flags_data(this->vertex_output);
 220    flags_data.reladdr = ralloc(mem_ctx, src_reg);
 221    memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
 222
 223    emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
 224 }
 225
 226 void
 227 gen6_gs_visitor::emit_urb_write_opcode(bool complete, src_reg vertex,
 228                                        int base_mrf, int mlen, int urb_offset)
 229 {
 230    vec4_instruction *inst = NULL;
 231
 232    /* If the vertex is not complete we don't have to do anything special */
 233    if (!complete) {
 234       inst = emit(GS_OPCODE_URB_WRITE);
 235       inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
 236       inst->base_mrf = base_mrf;
 237       inst->mlen = mlen;
 238       inst->offset = urb_offset;
 239       return;
 240    }
 241
 242    /* Otherwise, if this is not the last vertex we are going to write,
 243     * we have to request a new VUE handle for the next vertex.
 244     *
 245     * Notice that the vertex parameter has been pre-incremented in
 246     * emit_thread_end() to make this comparison easier.
 247     */
 248    emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_L));
 249    emit(IF(BRW_PREDICATE_NORMAL));
 250    {
 251       inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
 252       inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
 253       inst->base_mrf = base_mrf;
 254       inst->mlen = mlen;
 255       inst->offset = urb_offset;
 256       inst->dst = dst_reg(MRF, base_mrf);
 257       inst->src[0] = this->temp;
 258    }
 259    emit(BRW_OPCODE_ELSE);
 260    {
 261       inst = emit(GS_OPCODE_URB_WRITE);
 262       inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
 263       inst->base_mrf = base_mrf;
 264       inst->mlen = mlen;
 265       inst->offset = urb_offset;
 266    }
 267    emit(BRW_OPCODE_ENDIF);
 268 }
 269
 270 void
 271 gen6_gs_visitor::emit_thread_end()
 272 {
 273    /* Here we have to:
 274     * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
 275     * 2) Loop over all buffered vertex data and write it to corresponding
 276     *    URB entries.
 277     * 3) Allocate new VUE handles for all vertices other than the first.
 278     * 4) Send a final EOT message.
 279     */
 280
 281    /* MRF 0 is reserved for the debugger, so start with message header
 282     * in MRF 1.
 283     */
 284    int base_mrf = 1;
 285
 286    /* In the process of generating our URB write message contents, we
 287     * may need to unspill a register or load from an array.  Those
 288     * reads would use MRFs 14-15.
 289     */
 290    int max_usable_mrf = 13;
 291
 292    /* Issue the FF_SYNC message and obtain the initial VUE handle. */
 293    this->current_annotation = "gen6 thread end: ff_sync";
 294    vec4_instruction *inst =
 295       emit(GS_OPCODE_FF_SYNC, dst_reg(this->temp), this->prim_count);
 296    inst->base_mrf = base_mrf;
 297
 298    /* Loop over all buffered vertices and emit URB write messages */
 299    this->current_annotation = "gen6 thread end: urb writes init";
 300    src_reg vertex(this, glsl_type::uint_type);
 301    emit(MOV(dst_reg(vertex), 0u));
 302    emit(MOV(dst_reg(this->vertex_output_offset), 0u));
 303
 304    this->current_annotation = "gen6 thread end: urb writes";
 305    emit(BRW_OPCODE_DO);
 306    {
 307       emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
 308       inst = emit(BRW_OPCODE_BREAK);
 309       inst->predicate = BRW_PREDICATE_NORMAL;
 310
 311       /* First we prepare the message header */
 312       emit_urb_write_header(base_mrf);
 313
 314       /* Then add vertex data to the message in interleaved fashion */
 315       int slot = 0;
 316       bool complete = false;
 317       do {
 318          int mrf = base_mrf + 1;
 319
 320          /* URB offset is in URB row increments, and each of our MRFs is half
 321           * of one of those, since we're doing interleaved writes.
 322           */
 323          int urb_offset = slot / 2;
 324
 325          for (; slot < prog_data->vue_map.num_slots; ++slot) {
 326             int varying = prog_data->vue_map.slot_to_varying[slot];
 327             current_annotation = output_reg_annotation[varying];
 328
 329             /* Compute offset of this slot for the current vertex
 330              * in vertex_output
 331              */
 332             src_reg data(this->vertex_output);
 333             data.reladdr = ralloc(mem_ctx, src_reg);
 334             memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
 335
 336             if (varying == VARYING_SLOT_PSIZ) {
 337                /* We did not buffer PSIZ, emit it directly here */
 338                emit_urb_slot(dst_reg(MRF, mrf), varying);
 339             } else {
 340                /* Copy this slot to the appropriate message register */
 341                dst_reg reg = dst_reg(MRF, mrf);
 342                reg.type = output_reg[varying].type;
 343                data.type = reg.type;
 344                vec4_instruction *inst = emit(MOV(reg, data));
 345                inst->force_writemask_all = true;
 346             }
 347
 348             mrf++;
 349             emit(ADD(dst_reg(this->vertex_output_offset),
 350                      this->vertex_output_offset, 1u));
 351
 352             /* If this was max_usable_mrf, we can't fit anything more into this
 353              * URB WRITE.
 354              */
 355             if (mrf > max_usable_mrf) {
 356                slot++;
 357                break;
 358             }
 359          }
 360
 361          complete = slot >= prog_data->vue_map.num_slots;
 362
 363          /* When we emit the URB_WRITE below we need to do different things
 364           * depending on whether this is the last vertex we are going to
 365           * write. That means that we will need to check if
 366           * vertex >= vertex_count - 1. However, by increasing vertex early
 367           * we transform that comparison into vertex >= vertex_count, which
 368           * is more convenient.
 369           */
 370          if (complete)
 371             emit(ADD(dst_reg(vertex), vertex, 1u));
 372
 373          /* URB data written (does not include the message header reg) must
 374           * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
 375           * section 5.4.3.2.2: URB_INTERLEAVED.
 376           */
 377          int mlen = mrf - base_mrf;
 378          if ((mlen % 2) != 1)
 379             mlen++;
 380          emit_urb_write_opcode(complete, vertex, base_mrf, mlen, urb_offset);
 381       } while (!complete);
 382
 383       /* Skip over the flags data item so that vertex_output_offset points to
 384        * the first data item of the next vertex, so that we can start writing
 385        * the next vertex.
 386        */
 387        emit(ADD(dst_reg(this->vertex_output_offset),
 388                 this->vertex_output_offset, 1u));
 389    }
 390    emit(BRW_OPCODE_WHILE);
 391
 392    /* Finally, emit EOT message.
 393     *
 394     * In gen6 it looks like we have to set the complete flag too, otherwise
 395     * the GPU hangs.
 396     */
 397    this->current_annotation = "gen6 thread end: EOT";
 398    inst = emit(GS_OPCODE_THREAD_END);
 399    inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
 400    inst->base_mrf = base_mrf;
 401    inst->mlen = 1;
 402 }
 403
 404 } /* namespace brw */