/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * This code is based on original work by Ilia Mirkin.
 */

/**
 * \file gen6_gs_visitor.cpp
 *
 * Gen6 geometry shader implementation
 */

#include "gen6_gs_visitor.h"

namespace brw {

void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders must allocate an initial VUE handle via the
    * FF_SYNC message. However, the documentation remarks that only one thread
    * can write to the URB at a time, and the FF_SYNC message provides the
    * synchronization mechanism for this, so using this message effectively
    * stalls the thread until it is its turn to write to the URB. Because of
    * this, the best way to implement geometry shader algorithms in gen6 is to
    * execute the algorithm before the FF_SYNC message to maximize parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each emitted
    * vertex in vertex_output during operation. Then, when we have processed
    * the last vertex (that is, at thread end time), we send the FF_SYNC
    * message to allocate the initial VUE handle and write all buffered vertex
    * data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold the required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message),
    * which come right after the data items for that vertex. Vertex data and
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
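   /* For illustration, with vue_map.num_slots == N the buffer layout is:
    *
    *    vertex 0: data item 0 .. data item N-1, flags   (offsets 0 .. N)
    *    vertex 1: data item 0 .. data item N-1, flags   (offsets N+1 .. 2N+1)
    *    ...
    *
    * so vertex i's slot s lives at offset i * (N + 1) + s, which is the
    * stride used by get_vertex_output_offset_for_varying() below.
    */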
   this->current_annotation = "gen6 prolog";
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 nir->info.gs.vertices_out);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
    * that we are processing the first vertex in the primitive and to zero
    * otherwise. This way we can use its value directly in the URB write
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));

   /* The FF_SYNC message requires the number of primitives generated,
    * so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), 0u));

   if (gs_prog_data->gen6_xfb_enabled) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold the number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold the maximum values of SVBI */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
    * needs it we have to move it to a separate register where we can map
    * the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we need to
    * map all input attributes to hardware registers in setup_payload(),
    * which happens before virtual registers are mapped to hardware registers.
    * We could work around that issue if we were able to compute the first
    * non-payload register here and move the PrimitiveID information to that
    * register, but we can't because at this point we don't know the final
    * number of uniforms that will be included in the payload.
    *
    * So, what we do is place the PrimitiveID information in r1, which is
    * always delivered as part of the payload, but it is only populated with
    * data relevant for transform feedback when we set
    * GEN6_GS_SVBI_PAYLOAD_ENABLE in the 3DSTATE_GS state packet. That
    * information can be obtained by other means though, so we can safely use
    * r1 for this purpose.
    */
   if (gs_prog_data->include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}

void
gen6_gs_visitor::gs_emit_vertex(int stream_id)
{
   this->current_annotation = "gen6 emit vertex";

   /* Buffer all output slots for this vertex in vertex_output */
   for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
      int varying = prog_data->vue_map.slot_to_varying[slot];
      if (varying != VARYING_SLOT_PSIZ) {
         dst_reg dst(this->vertex_output);
         dst.reladdr = ralloc(mem_ctx, src_reg);
         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         emit_urb_slot(dst, varying);
      } else {
         /* The PSIZ slot can pack multiple varyings in different channels
          * and emit_urb_slot() will produce a MOV instruction for each of
          * them. Since we are writing to an array, that will translate to
          * possibly multiple MOV instructions with an array destination and
          * each will generate a scratch write with the same offset into
          * scratch space (thus, each one overwriting the previous). This is
          * not what we want. What we will do instead is emit PSIZ to a
          * regular temporary register, then move that register into the
          * array. This way we only have one instruction with an array
          * destination and we only produce a single scratch write.
          */
         dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
         emit_urb_slot(tmp, varying);
         dst_reg dst(this->vertex_output);
         dst.reladdr = ralloc(mem_ctx, src_reg);
         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
         inst->force_writemask_all = true;
      }

      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, 1u));
   }

   /* Now buffer flags for this vertex */
   dst_reg dst(this->vertex_output);
   dst.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
   if (nir->info.gs.output_primitive == GL_POINTS) {
      /* If we are outputting points, then every vertex has PrimStart and
       * PrimEnd set.
       */
      emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
               URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
   } else {
      /* Otherwise, we can only set the PrimStart flag, which we have stored
       * in the first_vertex register. We will have to wait until we execute
       * EndPrimitive() or we end the thread to set the PrimEnd flag on a
       * vertex.
       */
      emit(OR(dst, this->first_vertex,
              (gs_prog_data->output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
      emit(MOV(dst_reg(this->first_vertex), 0u));
   }
   emit(ADD(dst_reg(this->vertex_output_offset),
            this->vertex_output_offset, 1u));
}

void
gen6_gs_visitor::gs_end_primitive()
{
   this->current_annotation = "gen6 end primitive";
   /* Calling EndPrimitive() is optional for point output. In this case we set
    * the PrimEnd flag when we process EmitVertex().
    */
   if (nir->info.gs.output_primitive == GL_POINTS)
      return;

   /* Otherwise we know that the last vertex we have processed was the last
    * vertex in the primitive, and we need to set its PrimEnd flag, so do this
    * unless we haven't emitted any vertices at all (i.e. only when
    * vertex_count != 0).
    *
    * Notice that we have already incremented vertex_count when we processed
    * the last emit_vertex, so we need to take that into account in the
    * comparison below (hence the num_output_vertices + 1).
    */
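   /* Together, the two predicated CMPs below implement the combined condition
    * 0 < vertex_count <= num_output_vertices, so the IF block only runs when
    * at least one vertex has actually been emitted.
    */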
   unsigned num_output_vertices = nir->info.gs.vertices_out;
   emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
            BRW_CONDITIONAL_L));
   vec4_instruction *inst = emit(CMP(dst_null_d(),
                                     this->vertex_count, 0u,
                                     BRW_CONDITIONAL_NEQ));
   inst->predicate = BRW_PREDICATE_NORMAL;
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* vertex_output_offset is already pointing at the first entry of the
       * next vertex. So subtract 1 to modify the flags for the previous
       * vertex.
       */
      src_reg offset(this, glsl_type::uint_type);
      emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));

      src_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &offset, sizeof(src_reg));

      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));

      /* Set the first vertex flag to indicate that the next vertex will start
       * a primitive.
       */
      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
   }
   emit(BRW_OPCODE_ENDIF);
}

void
gen6_gs_visitor::emit_urb_write_header(int mrf)
{
   this->current_annotation = "gen6 urb header";
   /* Compute the offset of the flags for the current vertex in vertex_output
    * and write them in dw2 of the message header.
    *
    * Notice that by the time emit_thread_end() calls this function,
    * vertex_output_offset should point to the first data item of the current
    * vertex in vertex_output, thus we only need to add the number of output
    * slots per vertex to that offset to obtain the flags data offset.
    */
   src_reg flags_offset(this, glsl_type::uint_type);
   emit(ADD(dst_reg(flags_offset),
            this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));

   src_reg flags_data(this->vertex_output);
   flags_data.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));

   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
}

static int
align_interleaved_urb_mlen(int mlen)
{
   /* URB data written (does not include the message header reg) must
    * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
    * section 5.4.3.2.2: URB_INTERLEAVED.
    */
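   /* mlen counts the message header register too, so the data payload is
    * mlen - 1 registers and must therefore be even, i.e. mlen must be odd.
    * For example, a header plus 3 data registers (mlen == 4) is padded to
    * mlen == 5.
    */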
   if ((mlen % 2) != 1)
      mlen++;
   return mlen;
}

void
gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
                                       int last_mrf, int urb_offset)
{
   vec4_instruction *inst = NULL;

   if (!complete) {
      /* If the vertex is not complete we don't have to do anything special */
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   } else {
      /* Otherwise we always request the allocation of a new VUE handle. If
       * this is the last write before the EOT message and the new handle
       * never gets used it will be dereferenced when we send the EOT message.
       * This is necessary to avoid different setups for the EOT message (one
       * for the case when there is no output and another for the case when
       * there is), which would require ending the program with an
       * IF/ELSE/ENDIF block, something we do not want.
       */
      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
      inst->dst = dst_reg(MRF, base_mrf);
      inst->src[0] = this->temp;
   }

   inst->base_mrf = base_mrf;
   inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
   inst->offset = urb_offset;
}

void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is not zero. This is only relevant for outputs other than
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (nir->info.gs.output_primitive != GL_POINTS) {
      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      gs_end_primitive();
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to the corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with the message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array. Those
    * reads would use MRFs 21..23.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (gs_prog_data->gen6_xfb_enabled) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, src_reg(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), 0u));
      emit(MOV(dst_reg(this->vertex_output_offset), 0u));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is
             * half of one of those, since we're doing interleaved writes.
             */
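            /* For example, slots 0 and 1 go to URB row 0, slots 2 and 3 to
             * row 1, and so on.
             */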
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, 1u));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE. Same if we reached the max. message length.
                */
               if (mrf > max_usable_mrf ||
                   align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));

         emit(ADD(dst_reg(vertex), vertex, 1u));
      }
      emit(BRW_OPCODE_WHILE);

      if (gs_prog_data->gen6_xfb_enabled)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit the EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we
    * have emitted at least one vertex or not. In case we did, the EOT message
    * must always include the COMPLETE flag or else the GPU hangs. If we have
    * not produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid. So instead we always request a new VUE handle
    * every time we do a URB WRITE, even for the last vertex we emit. With
    * this we make sure that whether we have emitted at least one vertex or
    * none at all, we always finish the thread without writing to the URB,
    * which works for both cases by setting the COMPLETE and UNUSED flags in
    * the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (gs_prog_data->gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
      emit(SHL(dst_reg(data), data, src_reg(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}

void
gen6_gs_visitor::setup_payload()
{
   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];

   /* Attributes are going to be interleaved, so one register contains two
    * attribute slots.
    */
   int attributes_per_reg = 2;

   /* If a geometry shader tries to read from an input that wasn't written by
    * the vertex shader, that produces undefined results, but it shouldn't
    * crash anything. So initialize attribute_map to zeros--that ensures that
    * these undefined results are read from r0.
    */
   memset(attribute_map, 0, sizeof(attribute_map));

   int reg = 0;

   /* The payload always contains important data in r0. */
   reg++;

   /* r1 is always part of the payload and it holds information relevant
    * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
    * information (and move the original value to a virtual register if
    * necessary).
    */
   if (gs_prog_data->include_primitive_id)
      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);

   lower_attributes_to_hw_regs(attribute_map, true);

   this->first_non_payload_grf = reg;
}

void
gen6_gs_visitor::xfb_setup()
{
   static const unsigned swizzle_for_offset[4] = {
      BRW_SWIZZLE4(0, 1, 2, 3),
      BRW_SWIZZLE4(1, 2, 3, 3),
      BRW_SWIZZLE4(2, 3, 3, 3),
      BRW_SWIZZLE4(3, 3, 3, 3)
   };
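   /* swizzle_for_offset[i] starts reading at component i and replicates the
    * last component to fill the remaining channels, e.g. a ComponentOffset of
    * 2 maps to the swizzle (z, w, w, w).
    */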

   const struct gl_transform_feedback_info *linked_xfb_info =
      &this->shader_prog->LinkedTransformFeedback;
   int i;

   /* Make sure that the VUE slots won't overflow the unsigned chars in
    * prog_data->transform_feedback_bindings[].
    */
   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);

   /* Make sure that we don't need more binding table entries than we've
    * set aside for use in transform feedback. (We shouldn't, since we
    * set aside enough binding table entries to have one per component).
    */
   assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);

   gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
   for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
      gs_prog_data->transform_feedback_bindings[i] =
         linked_xfb_info->Outputs[i].OutputRegister;
      gs_prog_data->transform_feedback_swizzles[i] =
         swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
   }
}

void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;

   if (!gs_prog_data->num_transform_feedback_bindings)
      return;

   switch (gs_prog_data->output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
   emit(MOV(dst_reg(this->sol_prim_written), 0u));

   /* Check that at least one primitive can be written
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex. So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));

   /* Compare the calculated SVBI value with the maximum value, which is
    * stored in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      src_reg destination_indices_uw =
         retype(destination_indices, BRW_REGISTER_TYPE_UW);

      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
      inst->force_writemask_all = true;

      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
      emit(MOV(dst_reg(sol_temp), i));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}

void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   unsigned binding;
   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices). Otherwise, avoid writing any vertices for it.
    */
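   /* That is, only emit the writes when
    * svbi + (sol_prim_written + 1) * num_verts <= max_svbi.
    */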
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            gs_prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *   "Prior to End of Thread with a URB_WRITE, the kernel must
          *    ensure that all writes are complete by sending the final
          *    write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute the offset of this varying for the current vertex
          * in vertex_output
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), offset));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying].type;

         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
          * same slot, so make sure we write the appropriate channel
          */
         if (varying == VARYING_SLOT_PSIZ)
            data.swizzle = BRW_SWIZZLE_WWWW;
         else if (varying == VARYING_SLOT_LAYER)
            data.swizzle = BRW_SWIZZLE_YYYY;
         else if (varying == VARYING_SLOT_VIEWPORT)
            data.swizzle = BRW_SWIZZLE_ZZZZ;
         else
            data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* If this is the last vertex of the primitive, then increment
             * the SO primitive counter and the destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     src_reg(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, 1u));
         }

      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}

int
gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
{
   /* Find the output slot assigned to this varying.
    *
    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
    * as VARYING_SLOT_PSIZ.
    */
   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
      varying = VARYING_SLOT_PSIZ;
   int slot = prog_data->vue_map.varying_to_slot[varying];

   if (slot < 0) {
      /* This varying does not exist in the VUE so we are not writing to it
       * and its value is undefined. We still want to return a valid offset
       * into vertex_output though, to prevent any out-of-bounds accesses into
       * the vertex_output array. Since the value for this varying is undefined
       * we don't really care which value we assign to it, so any offset
       * within the limits of vertex_output will do.
       */
      slot = 0;
   }

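   /* Each vertex occupies num_slots data items plus one flags dword (see
    * emit_prolog()), hence the stride of num_slots + 1 below.
    */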
   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
}

} /* namespace brw */