i965/gen7: Add support for transform feedback.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_gs_emit.c
/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */


#include "main/glheader.h"
#include "main/macros.h"
#include "main/enums.h"

#include "program/program.h"
#include "intel_batchbuffer.h"

#include "brw_defines.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_gs.h"

/**
 * Allocate registers for GS.
 *
 * If svbi_payload_enable is true, then the thread will be spawned with the
 * "SVBI Payload Enable" bit set, so GRF 1 needs to be set aside to hold the
 * streamed vertex buffer indices.
 */
static void brw_gs_alloc_regs( struct brw_gs_compile *c,
                               GLuint nr_verts,
                               bool svbi_payload_enable )
{
   GLuint i = 0, j;

   /* Register usage is static, precompute here:
    */
   c->reg.R0 = retype(brw_vec8_grf(i, 0), BRW_REGISTER_TYPE_UD); i++;

   /* Streamed vertex buffer indices */
   if (svbi_payload_enable)
      c->reg.SVBI = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);

   /* Payload vertices plus space for more generated vertices:
    */
   for (j = 0; j < nr_verts; j++) {
      c->reg.vertex[j] = brw_vec4_grf(i, 0);
      i += c->nr_regs;
   }

   c->reg.header = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);
   c->reg.temp = retype(brw_vec8_grf(i++, 0), BRW_REGISTER_TYPE_UD);

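   /* Each vertex read from the URB occupies c->nr_regs GRFs, so that is the
    * per-vertex URB read length; i now counts every GRF this thread uses.
    */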
   c->prog_data.urb_read_length = c->nr_regs;
   c->prog_data.total_grf = i;
}


/**
 * Set up the initial value of c->reg.header register based on c->reg.R0.
 *
 * The following information is passed to the GS thread in R0, and needs to be
 * included in the first URB_WRITE or FF_SYNC message sent by the GS:
 *
 * - DWORD 0 [31:0] handle info (Gen4 only)
 * - DWORD 5 [7:0] FFTID
 * - DWORD 6 [31:0] Debug info
 * - DWORD 7 [31:0] Debug info
 *
 * This function sets up the above data by copying the contents of R0 to the
 * header register.
 */
static void brw_gs_initialize_header(struct brw_gs_compile *c)
{
   struct brw_compile *p = &c->func;
   brw_MOV(p, c->reg.header, c->reg.R0);
}

/**
 * Overwrite DWORD 2 of c->reg.header with the given immediate unsigned value.
 *
 * In URB_WRITE messages, DWORD 2 contains the fields PrimType, PrimStart,
 * PrimEnd, Increment CL_INVOCATIONS, and SONumPrimsWritten, many of which we
 * need to be able to update on a per-vertex basis.
 */
static void brw_gs_overwrite_header_dw2(struct brw_gs_compile *c,
                                        unsigned dw2)
{
   struct brw_compile *p = &c->func;
   brw_MOV(p, get_element_ud(c->reg.header, 2), brw_imm_ud(dw2));
}

/**
 * Overwrite DWORD 2 of c->reg.header with the primitive type from c->reg.R0.
 *
 * When the thread is spawned, GRF 0 contains the primitive type in bits 4:0
 * of DWORD 2.  URB_WRITE messages need the primitive type in bits 6:2 of
 * DWORD 2.  So this function extracts the primitive type field, bitshifts it
 * appropriately, and stores it in c->reg.header.
 */
static void brw_gs_overwrite_header_dw2_from_r0(struct brw_gs_compile *c)
{
   struct brw_compile *p = &c->func;
   brw_AND(p, get_element_ud(c->reg.header, 2), get_element_ud(c->reg.R0, 2),
           brw_imm_ud(0x1f));
   brw_SHL(p, get_element_ud(c->reg.header, 2),
           get_element_ud(c->reg.header, 2), brw_imm_ud(2));
}

/**
 * Apply an additive offset to DWORD 2 of c->reg.header.
 *
 * This is used to set/unset the "PrimStart" and "PrimEnd" flags appropriately
 * for each vertex.
 */
static void brw_gs_offset_header_dw2(struct brw_gs_compile *c, int offset)
{
   struct brw_compile *p = &c->func;
   brw_ADD(p, get_element_d(c->reg.header, 2), get_element_d(c->reg.header, 2),
           brw_imm_d(offset));
}


/**
 * Emit a vertex using the URB_WRITE message.  Use the contents of
 * c->reg.header for the message header, and the registers starting at \c vert
 * for the vertex data.
 *
 * If \c last is true, then this is the last vertex, so no further URB space
 * should be allocated, and this message should end the thread.
 *
 * If \c last is false, then a new URB entry will be allocated, and its handle
 * will be stored in DWORD 0 of c->reg.header for use in the next URB_WRITE
 * message.
 */
static void brw_gs_emit_vue(struct brw_gs_compile *c,
                            struct brw_reg vert,
                            bool last)
{
   struct brw_compile *p = &c->func;
   bool allocate = !last;

   /* Copy the vertex from vertn into m1..mN+1:
    */
   brw_copy8(p, brw_message_reg(1), vert, c->nr_regs);

   /* Send each vertex as a separate write to the urb.  This is
    * different from the concept in brw_sf_emit.c, where subsequent
    * writes are used to build up a single urb entry.  Each of these
    * writes instantiates a separate urb entry, and a new one must be
    * allocated each time.
    */
   brw_urb_WRITE(p,
                 allocate ? c->reg.temp
                          : retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                 0,
                 c->reg.header,
                 allocate,
                 1,                 /* used */
                 c->nr_regs + 1,    /* msg length */
                 allocate ? 1 : 0,  /* response length */
                 allocate ? 0 : 1,  /* eot */
                 1,                 /* writes_complete */
                 0,                 /* urb offset */
                 BRW_URB_SWIZZLE_NONE);

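   /* When a new URB entry was allocated, the write above returns its handle
    * in c->reg.temp; move it into DWORD 0 of the header so that the next
    * URB_WRITE targets that entry.
    */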
   if (allocate) {
      brw_MOV(p, get_element_ud(c->reg.header, 0),
              get_element_ud(c->reg.temp, 0));
   }
}

/**
 * De-allocate the URB entry that was previously allocated to this thread
 * (without writing any vertex data to it), and terminate the thread.  This is
 * used to implement RASTERIZER_DISCARD functionality.
 */
static void brw_gs_terminate(struct brw_gs_compile *c)
{
   struct brw_compile *p = &c->func;
   brw_urb_WRITE(p,
                 retype(brw_null_reg(), BRW_REGISTER_TYPE_UD), /* dest */
                 0,                   /* msg_reg_nr */
                 c->reg.header,       /* src0 */
                 false,               /* allocate */
                 false,               /* used */
                 1,                   /* msg_length */
                 0,                   /* response_length */
                 true,                /* eot */
                 true,                /* writes_complete */
                 0,                   /* offset */
                 BRW_URB_SWIZZLE_NONE);
}

/**
 * Send an FF_SYNC message to ensure that all previously spawned GS threads
 * have finished sending primitives down the pipeline, and to allocate a URB
 * entry for the first output vertex.  Only needed when intel->needs_ff_sync
 * is true.
 *
 * This function modifies c->reg.header: in DWORD 1, it stores num_prim (which
 * is needed by the FF_SYNC message), and in DWORD 0, it stores the handle to
 * the allocated URB entry (which will be needed by the URB_WRITE message that
 * follows).
 */
static void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, get_element_ud(c->reg.header, 1), brw_imm_ud(num_prim));
   brw_ff_sync(p,
               c->reg.temp,
               0,
               c->reg.header,
               1, /* allocate */
               1, /* response length */
               0 /* eot */);
   brw_MOV(p, get_element_ud(c->reg.header, 0),
           get_element_ud(c->reg.temp, 0));
}


void brw_gs_quads( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
{
   struct intel_context *intel = &c->func.brw->intel;

   brw_gs_alloc_regs(c, 4, false);
   brw_gs_initialize_header(c);
   /* Use polygons for correct edgeflag behaviour.  Note that vertex 3
    * is the PV for quads, but vertex 0 for polygons:
    */
   if (intel->needs_ff_sync)
      brw_gs_ff_sync(c, 1);
   brw_gs_overwrite_header_dw2(
      c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
          | URB_WRITE_PRIM_START));
   if (key->pv_first) {
      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
      brw_gs_overwrite_header_dw2(
         c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
      brw_gs_emit_vue(c, c->reg.vertex[1], 0);
      brw_gs_emit_vue(c, c->reg.vertex[2], 0);
      brw_gs_overwrite_header_dw2(
         c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
             | URB_WRITE_PRIM_END));
      brw_gs_emit_vue(c, c->reg.vertex[3], 1);
   }
   else {
      brw_gs_emit_vue(c, c->reg.vertex[3], 0);
      brw_gs_overwrite_header_dw2(
         c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
      brw_gs_emit_vue(c, c->reg.vertex[1], 0);
      brw_gs_overwrite_header_dw2(
         c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
             | URB_WRITE_PRIM_END));
      brw_gs_emit_vue(c, c->reg.vertex[2], 1);
   }
}

void brw_gs_quad_strip( struct brw_gs_compile *c, struct brw_gs_prog_key *key )
{
   struct intel_context *intel = &c->func.brw->intel;

   brw_gs_alloc_regs(c, 4, false);
   brw_gs_initialize_header(c);

   if (intel->needs_ff_sync)
      brw_gs_ff_sync(c, 1);
   brw_gs_overwrite_header_dw2(
      c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
          | URB_WRITE_PRIM_START));
   if (key->pv_first) {
      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
      brw_gs_overwrite_header_dw2(
         c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
      brw_gs_emit_vue(c, c->reg.vertex[1], 0);
      brw_gs_emit_vue(c, c->reg.vertex[2], 0);
      brw_gs_overwrite_header_dw2(
         c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
             | URB_WRITE_PRIM_END));
      brw_gs_emit_vue(c, c->reg.vertex[3], 1);
   }
   else {
      brw_gs_emit_vue(c, c->reg.vertex[2], 0);
      brw_gs_overwrite_header_dw2(
         c, _3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT);
      brw_gs_emit_vue(c, c->reg.vertex[3], 0);
      brw_gs_emit_vue(c, c->reg.vertex[0], 0);
      brw_gs_overwrite_header_dw2(
         c, ((_3DPRIM_POLYGON << URB_WRITE_PRIM_TYPE_SHIFT)
             | URB_WRITE_PRIM_END));
      brw_gs_emit_vue(c, c->reg.vertex[1], 1);
   }
}

void brw_gs_lines( struct brw_gs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;

   brw_gs_alloc_regs(c, 2, false);
   brw_gs_initialize_header(c);

   if (intel->needs_ff_sync)
      brw_gs_ff_sync(c, 1);
   brw_gs_overwrite_header_dw2(
      c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
          | URB_WRITE_PRIM_START));
   brw_gs_emit_vue(c, c->reg.vertex[0], 0);
   brw_gs_overwrite_header_dw2(
      c, ((_3DPRIM_LINESTRIP << URB_WRITE_PRIM_TYPE_SHIFT)
          | URB_WRITE_PRIM_END));
   brw_gs_emit_vue(c, c->reg.vertex[1], 1);
}

/**
 * Generate the geometry shader program used on Gen6 to perform stream output
 * (transform feedback).
 */
void
gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
                 unsigned num_verts, bool check_edge_flags)
{
   struct brw_compile *p = &c->func;
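   /* Presumably programmed as the hardware's SVBI post-increment value, so
    * that SVBI 0 advances by one primitive's worth of vertices between GS
    * invocations.
    */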
   c->prog_data.svbi_postincrement_value = num_verts;

   brw_gs_alloc_regs(c, num_verts, true);
   brw_gs_initialize_header(c);

   if (key->num_transform_feedback_bindings > 0) {
      unsigned vertex, binding;
      /* Note: since we use the binding table to keep track of buffer offsets
       * and stride, the GS doesn't need to keep track of a separate pointer
       * into each buffer; it uses a single pointer which increments by 1 for
       * each vertex.  So we use SVBI0 for this pointer, regardless of whether
       * transform feedback is in interleaved or separate attribs mode.
       */
      brw_MOV(p, get_element_ud(c->reg.header, 5),
              get_element_ud(c->reg.SVBI, 0));

      /* Make sure that the buffers have enough room for all the vertices. */
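      /* Element 0 of c->reg.SVBI is the current streamed vertex buffer index;
       * element 4 holds the maximum index the buffers can accommodate, so the
       * writes are skipped entirely if they would run past the end.
       */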
      brw_ADD(p, get_element_ud(c->reg.temp, 0),
              get_element_ud(c->reg.SVBI, 0), brw_imm_ud(num_verts));
      brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
              get_element_ud(c->reg.temp, 0),
              get_element_ud(c->reg.SVBI, 4));
      brw_IF(p, BRW_EXECUTE_1);

      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (vertex = 0; vertex < num_verts; ++vertex) {
         for (binding = 0; binding < key->num_transform_feedback_bindings;
              ++binding) {
            unsigned char vert_result =
               key->transform_feedback_bindings[binding];
            unsigned char slot = c->vue_map.vert_result_to_slot[vert_result];
            /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
             *
             *   "Prior to End of Thread with a URB_WRITE, the kernel must
             *   ensure that all writes are complete by sending the final
             *   write as a committed write."
             */
            bool final_write =
               binding == key->num_transform_feedback_bindings - 1 &&
               vertex == num_verts - 1;
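            /* Each GRF holds two 4-DWORD VUE slots, so slot / 2 picks the
             * register within the vertex and slot % 2 picks which half of it.
             */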
            struct brw_reg vertex_slot = c->reg.vertex[vertex];
            vertex_slot.nr += slot / 2;
            vertex_slot.subnr = (slot % 2) * 16;
            brw_MOV(p, stride(c->reg.header, 4, 4, 1),
                    retype(vertex_slot, BRW_REGISTER_TYPE_UD));
            brw_svb_write(p,
                          final_write ? c->reg.temp : brw_null_reg(), /* dest */
                          1, /* msg_reg_nr */
                          c->reg.header, /* src0 */
                          SURF_INDEX_SOL_BINDING(binding), /* binding_table_index */
                          final_write); /* send_commit_msg */
         }

         /* If there are more vertices to output, increment the pointer so
          * that we will start outputting to the next location in the
          * transform feedback buffers.
          */
         if (vertex != num_verts - 1) {
            brw_ADD(p, get_element_ud(c->reg.header, 5),
                    get_element_ud(c->reg.header, 5), brw_imm_ud(1));
         }
      }
      brw_ENDIF(p);

      /* Now, reinitialize the header register from R0 to restore the parts of
       * the register that we overwrote while streaming out transform feedback
       * data.
       */
      brw_gs_initialize_header(c);

      /* Finally, wait for the write commit to occur so that we can proceed to
       * other things safely.
       *
       * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
       *
       *   The write commit does not modify the destination register, but
       *   merely clears the dependency associated with the destination
       *   register. Thus, a simple “mov” instruction using the register as a
       *   source is sufficient to wait for the write commit to occur.
       */
      brw_MOV(p, c->reg.temp, c->reg.temp);
   }

   brw_gs_ff_sync(c, 1);

   /* If RASTERIZER_DISCARD is enabled, we have nothing further to do, so
    * release the URB that was just allocated, and terminate the thread.
    */
   if (key->rasterizer_discard) {
      brw_gs_terminate(c);
      return;
   }

   brw_gs_overwrite_header_dw2_from_r0(c);
   switch (num_verts) {
   case 1:
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
      brw_gs_emit_vue(c, c->reg.vertex[0], true);
      break;
   case 2:
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[0], false);
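      /* DWORD 2 currently has PrimStart set; adding (PRIM_END - PRIM_START)
       * clears PrimStart and sets PrimEnd for the final vertex.
       */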
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[1], true);
      break;
   case 3:
      if (check_edge_flags) {
         /* Only emit vertices 0 and 1 if this is the first triangle of the
          * polygon.  Otherwise they are redundant.
          */
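         /* The AND has a null destination; it exists only to set the flag
          * register (nonzero when the first edge-flag bit of R0 DWORD 2 is
          * set), which the following IF then tests.
          */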
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
         brw_IF(p, BRW_EXECUTE_1);
      }
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[0], false);
      brw_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[1], false);
      if (check_edge_flags) {
         brw_ENDIF(p);
         /* Only emit vertex 2 in PRIM_END mode if this is the last triangle
          * of the polygon.  Otherwise leave the primitive incomplete because
          * there are more polygon vertices coming.
          */
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
         brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
      }
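      /* When edge flags are being checked, the ADD below is predicated so
       * that PRIM_END is only set for the closing triangle of the polygon;
       * the predicate is then cleared so the vertex itself is always emitted.
       */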
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      brw_gs_emit_vue(c, c->reg.vertex[2], true);
      break;
   }
}