i965/gen6/gs: Implement geometry shaders for outputs other than points.
[mesa.git] / src / mesa / drivers / dri / i965 / gen6_gs_visitor.cpp
1 /*
2 * Copyright © 2014 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * This code is based on original work by Ilia Mirkin.
24 */
25
26 /**
27 * \file gen6_gs_visitor.cpp
28 *
29 * Gen6 geometry shader implementation
30 */
31
32 #include "gen6_gs_visitor.h"
33
34 namespace brw {
35
36 void
37 gen6_gs_visitor::emit_prolog()
38 {
39 vec4_gs_visitor::emit_prolog();
40
41 /* Gen6 geometry shaders require to allocate an initial VUE handle via
42 * FF_SYNC message, however the documentation remarks that only one thread
43 * can write to the URB simultaneously and the FF_SYNC message provides the
44 * synchronization mechanism for this, so using this message effectively
45 * stalls the thread until it is its turn to write to the URB. Because of
46 * this, the best way to implement geometry shader algorithms in gen6 is to
47 * execute the algorithm before the FF_SYNC message to maximize parallelism.
48 *
49 * To achieve this we buffer the geometry shader outputs for each emitted
50 * vertex in vertex_output during operation. Then, when we have processed
51 * the last vertex (that is, at thread end time), we send the FF_SYNC
52 * message to allocate the initial VUE handle and write all buffered vertex
53 * data to the URB in one go.
54 *
55 * For each emitted vertex, vertex_output will hold vue_map.num_slots
56 * data items plus one additional item to hold required flags
57 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
58 * which come right after the data items for that vertex. Vertex data and
59 * flags for the next vertex come right after the data items and flags for
60 * the previous vertex.
61 */
62 this->current_annotation = "gen6 prolog";
63 this->vertex_output = src_reg(this,
64 glsl_type::uint_type,
65 (prog_data->vue_map.num_slots + 1) *
66 c->gp->program.VerticesOut);
67 this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
68 emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));
69
70 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
71 * so initialize it once to R0.
72 */
73 vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
74 retype(brw_vec8_grf(0, 0),
75 BRW_REGISTER_TYPE_UD)));
76 inst->force_writemask_all = true;
77
78 /* This will be used as a temporary to store writeback data of FF_SYNC
79 * and URB_WRITE messages.
80 */
81 this->temp = src_reg(this, glsl_type::uint_type);
82
83 /* This will be used to know when we are processing the first vertex of
84 * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
85 * that we are processing the first vertex in the primitive and to zero
86 * otherwise. This way we can use its value directly in the URB write
87 * headers.
88 */
89 this->first_vertex = src_reg(this, glsl_type::uint_type);
90 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
91
92 /* The FF_SYNC message requires to know the number of primitives generated,
93 * so keep a counter for this.
94 */
95 this->prim_count = src_reg(this, glsl_type::uint_type);
96 emit(MOV(dst_reg(this->prim_count), 0u));
97 }
98
99 void
100 gen6_gs_visitor::visit(ir_emit_vertex *)
101 {
102 this->current_annotation = "gen6 emit vertex";
103 /* Honor max_vertex layout indication in geometry shader by ignoring any
104 * vertices coming after c->gp->program.VerticesOut.
105 */
106 unsigned num_output_vertices = c->gp->program.VerticesOut;
107 emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices),
108 BRW_CONDITIONAL_L));
109 emit(IF(BRW_PREDICATE_NORMAL));
110 {
111 /* Buffer all output slots for this vertex in vertex_output */
112 for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
113 /* We will handle PSIZ for each vertex at thread end time since it
114 * is not computed by the GS algorithm and requires specific handling.
115 */
116 int varying = prog_data->vue_map.slot_to_varying[slot];
117 if (varying != VARYING_SLOT_PSIZ) {
118 dst_reg dst(this->vertex_output);
119 dst.reladdr = ralloc(mem_ctx, src_reg);
120 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
121 emit_urb_slot(dst, varying);
122 }
123 emit(ADD(dst_reg(this->vertex_output_offset),
124 this->vertex_output_offset, 1u));
125 }
126
127 /* Now buffer flags for this vertex */
128 dst_reg dst(this->vertex_output);
129 dst.reladdr = ralloc(mem_ctx, src_reg);
130 memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
131 if (c->gp->program.OutputType == GL_POINTS) {
132 /* If we are outputting points, then every vertex has PrimStart and
133 * PrimEnd set.
134 */
135 emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
136 URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
137 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
138 } else {
139 /* Otherwise, we can only set the PrimStart flag, which we have stored
140 * in the first_vertex register. We will have to wait until we execute
141 * EndPrimitive() or we end the thread to set the PrimEnd flag on a
142 * vertex.
143 */
144 emit(OR(dst, this->first_vertex,
145 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
146 emit(MOV(dst_reg(this->first_vertex), 0u));
147 }
148 emit(ADD(dst_reg(this->vertex_output_offset),
149 this->vertex_output_offset, 1u));
150
151 /* Update vertex count */
152 emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
153 }
154 emit(BRW_OPCODE_ENDIF);
155 }
156
157 void
158 gen6_gs_visitor::visit(ir_end_primitive *)
159 {
160 this->current_annotation = "gen6 end primitive";
161 /* Calling EndPrimitive() is optional for point output. In this case we set
162 * the PrimEnd flag when we process EmitVertex().
163 */
164 if (c->gp->program.OutputType == GL_POINTS)
165 return;
166
167 /* Otherwise we know that the last vertex we have processed was the last
168 * vertex in the primitive and we need to set its PrimEnd flag, so do this
169 * unless we haven't emitted that vertex at all.
170 *
171 * Notice that we have already incremented vertex_count when we processed
172 * the last emit_vertex, so we need to take that into account in the
173 * comparison below (hence the num_output_vertices + 1 in the comparison
174 * below).
175 */
176 unsigned num_output_vertices = c->gp->program.VerticesOut;
177 emit(CMP(dst_null_d(), this->vertex_count, src_reg(num_output_vertices + 1),
178 BRW_CONDITIONAL_L));
179 emit(IF(BRW_PREDICATE_NORMAL));
180 {
181 /* vertex_output_offset is already pointing at the first entry of the
182 * next vertex. So subtract 1 to modify the flags for the previous
183 * vertex.
184 */
185 src_reg offset(this, glsl_type::uint_type);
186 emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
187
188 src_reg dst(this->vertex_output);
189 dst.reladdr = ralloc(mem_ctx, src_reg);
190 memcpy(dst.reladdr, &offset, sizeof(src_reg));
191
192 emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
193 emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
194
195 /* Set the first vertex flag to indicate that the next vertex will start
196 * a primitive.
197 */
198 emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
199 }
200 emit(BRW_OPCODE_ENDIF);
201 }
202
203 void
204 gen6_gs_visitor::emit_urb_write_header(int mrf)
205 {
206 this->current_annotation = "gen6 urb header";
207 /* Compute offset of the flags for the current vertex in vertex_output and
208 * write them in dw2 of the message header.
209 *
210 * Notice that by the time that emit_thread_end() calls here
211 * vertex_output_offset should point to the first data item of the current
212 * vertex in vertex_output, thus we only need to add the number of output
213 * slots per vertex to that offset to obtain the flags data offset.
214 */
215 src_reg flags_offset(this, glsl_type::uint_type);
216 emit(ADD(dst_reg(flags_offset),
217 this->vertex_output_offset, src_reg(prog_data->vue_map.num_slots)));
218
219 src_reg flags_data(this->vertex_output);
220 flags_data.reladdr = ralloc(mem_ctx, src_reg);
221 memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
222
223 emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
224 }
225
226 void
227 gen6_gs_visitor::emit_urb_write_opcode(bool complete, src_reg vertex,
228 int base_mrf, int mlen, int urb_offset)
229 {
230 vec4_instruction *inst = NULL;
231
232 /* If the vertex is not complete we don't have to do anything special */
233 if (!complete) {
234 inst = emit(GS_OPCODE_URB_WRITE);
235 inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
236 inst->base_mrf = base_mrf;
237 inst->mlen = mlen;
238 inst->offset = urb_offset;
239 return;
240 }
241
242 /* Otherwise, if this is not the last vertex we are going to write,
243 * we have to request a new VUE handle for the next vertex.
244 *
245 * Notice that the vertex parameter has been pre-incremented in
246 * emit_thread_end() to make this comparison easier.
247 */
248 emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_L));
249 emit(IF(BRW_PREDICATE_NORMAL));
250 {
251 inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
252 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
253 inst->base_mrf = base_mrf;
254 inst->mlen = mlen;
255 inst->offset = urb_offset;
256 inst->dst = dst_reg(MRF, base_mrf);
257 inst->src[0] = this->temp;
258 }
259 emit(BRW_OPCODE_ELSE);
260 {
261 inst = emit(GS_OPCODE_URB_WRITE);
262 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
263 inst->base_mrf = base_mrf;
264 inst->mlen = mlen;
265 inst->offset = urb_offset;
266 }
267 emit(BRW_OPCODE_ENDIF);
268 }
269
270 void
271 gen6_gs_visitor::emit_thread_end()
272 {
273 /* Here we have to:
274 * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
275 * 2) Loop over all buffered vertex data and write it to corresponding
276 * URB entries.
277 * 3) Allocate new VUE handles for all vertices other than the first.
278 * 4) Send a final EOT message.
279 */
280
281 /* MRF 0 is reserved for the debugger, so start with message header
282 * in MRF 1.
283 */
284 int base_mrf = 1;
285
286 /* In the process of generating our URB write message contents, we
287 * may need to unspill a register or load from an array. Those
288 * reads would use MRFs 14-15.
289 */
290 int max_usable_mrf = 13;
291
292 /* Issue the FF_SYNC message and obtain the initial VUE handle. */
293 this->current_annotation = "gen6 thread end: ff_sync";
294 vec4_instruction *inst =
295 emit(GS_OPCODE_FF_SYNC, dst_reg(this->temp), this->prim_count);
296 inst->base_mrf = base_mrf;
297
298 /* Loop over all buffered vertices and emit URB write messages */
299 this->current_annotation = "gen6 thread end: urb writes init";
300 src_reg vertex(this, glsl_type::uint_type);
301 emit(MOV(dst_reg(vertex), 0u));
302 emit(MOV(dst_reg(this->vertex_output_offset), 0u));
303
304 this->current_annotation = "gen6 thread end: urb writes";
305 emit(BRW_OPCODE_DO);
306 {
307 emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
308 inst = emit(BRW_OPCODE_BREAK);
309 inst->predicate = BRW_PREDICATE_NORMAL;
310
311 /* First we prepare the message header */
312 emit_urb_write_header(base_mrf);
313
314 /* Then add vertex data to the message in interleaved fashion */
315 int slot = 0;
316 bool complete = false;
317 do {
318 int mrf = base_mrf + 1;
319
320 /* URB offset is in URB row increments, and each of our MRFs is half
321 * of one of those, since we're doing interleaved writes.
322 */
323 int urb_offset = slot / 2;
324
325 for (; slot < prog_data->vue_map.num_slots; ++slot) {
326 int varying = prog_data->vue_map.slot_to_varying[slot];
327 current_annotation = output_reg_annotation[varying];
328
329 /* Compute offset of this slot for the current vertex
330 * in vertex_output
331 */
332 src_reg data(this->vertex_output);
333 data.reladdr = ralloc(mem_ctx, src_reg);
334 memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
335
336 if (varying == VARYING_SLOT_PSIZ) {
337 /* We did not buffer PSIZ, emit it directly here */
338 emit_urb_slot(dst_reg(MRF, mrf), varying);
339 } else {
340 /* Copy this slot to the appropriate message register */
341 dst_reg reg = dst_reg(MRF, mrf);
342 reg.type = output_reg[varying].type;
343 data.type = reg.type;
344 vec4_instruction *inst = emit(MOV(reg, data));
345 inst->force_writemask_all = true;
346 }
347
348 mrf++;
349 emit(ADD(dst_reg(this->vertex_output_offset),
350 this->vertex_output_offset, 1u));
351
352 /* If this was max_usable_mrf, we can't fit anything more into this
353 * URB WRITE.
354 */
355 if (mrf > max_usable_mrf) {
356 slot++;
357 break;
358 }
359 }
360
361 complete = slot >= prog_data->vue_map.num_slots;
362
363 /* When we emit the URB_WRITE below we need to do different things
364 * depending on whether this is the last vertex we are going to
365 * write. That means that we will need to check if
366 * vertex >= vertex_count - 1. However, by increasing vertex early
367 * we transform that comparison into vertex >= vertex_count, which
368 * is more convenient.
369 */
370 if (complete)
371 emit(ADD(dst_reg(vertex), vertex, 1u));
372
373 /* URB data written (does not include the message header reg) must
374 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
375 * section 5.4.3.2.2: URB_INTERLEAVED.
376 */
377 int mlen = mrf - base_mrf;
378 if ((mlen % 2) != 1)
379 mlen++;
380 emit_urb_write_opcode(complete, vertex, base_mrf, mlen, urb_offset);
381 } while (!complete);
382
383 /* Skip over the flags data item so that vertex_output_offset points to
384 * the first data item of the next vertex, so that we can start writing
385 * the next vertex.
386 */
387 emit(ADD(dst_reg(this->vertex_output_offset),
388 this->vertex_output_offset, 1u));
389 }
390 emit(BRW_OPCODE_WHILE);
391
392 /* Finally, emit EOT message.
393 *
394 * In gen6 it looks like we have to set the complete flag too, otherwise
395 * the GPU hangs.
396 */
397 this->current_annotation = "gen6 thread end: EOT";
398 inst = emit(GS_OPCODE_THREAD_END);
399 inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
400 inst->base_mrf = base_mrf;
401 inst->mlen = 1;
402 }
403
404 } /* namespace brw */