/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * This code is based on original work by Ilia Mirkin.
 */

/**
 * \file gen6_gs_visitor.cpp
 *
 * Gen6 geometry shader implementation
 */

32 #include "gen6_gs_visitor.h"
37 gen6_gs_visitor::emit_prolog()
39 vec4_gs_visitor::emit_prolog();
41 /* Gen6 geometry shaders require to allocate an initial VUE handle via
42 * FF_SYNC message, however the documentation remarks that only one thread
43 * can write to the URB simultaneously and the FF_SYNC message provides the
44 * synchronization mechanism for this, so using this message effectively
45 * stalls the thread until it is its turn to write to the URB. Because of
46 * this, the best way to implement geometry shader algorithms in gen6 is to
47 * execute the algorithm before the FF_SYNC message to maximize parallelism.
49 * To achieve this we buffer the geometry shader outputs for each emitted
50 * vertex in vertex_output during operation. Then, when we have processed
51 * the last vertex (that is, at thread end time), we send the FF_SYNC
52 * message to allocate the initial VUE handle and write all buffered vertex
53 * data to the URB in one go.
55 * For each emitted vertex, vertex_output will hold vue_map.num_slots
56 * data items plus one additional item to hold required flags
57 * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
58 * which come right after the data items for that vertex. Vertex data and
59 * flags for the next vertex come right after the data items and flags for
60 * the previous vertex.
62 this->current_annotation
= "gen6 prolog";
63 this->vertex_output
= src_reg(this,
65 (prog_data
->vue_map
.num_slots
+ 1) *
66 c
->gp
->program
.VerticesOut
);
67 this->vertex_output_offset
= src_reg(this, glsl_type::uint_type
);
68 emit(MOV(dst_reg(this->vertex_output_offset
), src_reg(0u)));
70 /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
71 * so initialize it once to R0.
73 vec4_instruction
*inst
= emit(MOV(dst_reg(MRF
, 1),
74 retype(brw_vec8_grf(0, 0),
75 BRW_REGISTER_TYPE_UD
)));
76 inst
->force_writemask_all
= true;
78 /* This will be used as a temporary to store writeback data of FF_SYNC
79 * and URB_WRITE messages.
81 this->temp
= src_reg(this, glsl_type::uint_type
);
85 gen6_gs_visitor::visit(ir_emit_vertex
*)
87 this->current_annotation
= "gen6 emit vertex";
88 /* Honor max_vertex layout indication in geometry shader by ignoring any
89 * vertices coming after c->gp->program.VerticesOut.
91 unsigned num_output_vertices
= c
->gp
->program
.VerticesOut
;
92 emit(CMP(dst_null_d(), this->vertex_count
, src_reg(num_output_vertices
),
94 emit(IF(BRW_PREDICATE_NORMAL
));
96 /* Buffer all output slots for this vertex in vertex_output */
97 for (int slot
= 0; slot
< prog_data
->vue_map
.num_slots
; ++slot
) {
98 /* We will handle PSIZ for each vertex at thread end time since it
99 * is not computed by the GS algorithm and requires specific handling.
101 int varying
= prog_data
->vue_map
.slot_to_varying
[slot
];
102 if (varying
!= VARYING_SLOT_PSIZ
) {
103 dst_reg
dst(this->vertex_output
);
104 dst
.reladdr
= ralloc(mem_ctx
, src_reg
);
105 memcpy(dst
.reladdr
, &this->vertex_output_offset
, sizeof(src_reg
));
106 emit_urb_slot(dst
, varying
);
108 emit(ADD(dst_reg(this->vertex_output_offset
),
109 this->vertex_output_offset
, 1u));
112 /* Now buffer flags for this vertex (we only support point output
115 dst_reg
dst(this->vertex_output
);
116 dst
.reladdr
= ralloc(mem_ctx
, src_reg
);
117 memcpy(dst
.reladdr
, &this->vertex_output_offset
, sizeof(src_reg
));
118 /* If we are outputting points, then every vertex has PrimStart and
121 if (c
->gp
->program
.OutputType
== GL_POINTS
) {
122 emit(MOV(dst
, (_3DPRIM_POINTLIST
<< URB_WRITE_PRIM_TYPE_SHIFT
) |
123 URB_WRITE_PRIM_START
| URB_WRITE_PRIM_END
));
125 emit(ADD(dst_reg(this->vertex_output_offset
),
126 this->vertex_output_offset
, 1u));
128 /* Update vertex count */
129 emit(ADD(dst_reg(this->vertex_count
), this->vertex_count
, 1u));
131 emit(BRW_OPCODE_ENDIF
);
135 gen6_gs_visitor::visit(ir_end_primitive
*)
137 this->current_annotation
= "gen6 end primitive";
138 /* Calling EndPrimitive() is optional for point output. In this case we set
139 * the PrimEnd flag when we process EmitVertex().
141 if (c
->gp
->program
.OutputType
== GL_POINTS
)
146 gen6_gs_visitor::emit_urb_write_header(int mrf
)
148 this->current_annotation
= "gen6 urb header";
149 /* Compute offset of the flags for the current vertex in vertex_output and
150 * write them in dw2 of the message header.
152 * Notice that by the time that emit_thread_end() calls here
153 * vertex_output_offset should point to the first data item of the current
154 * vertex in vertex_output, thus we only need to add the number of output
155 * slots per vertex to that offset to obtain the flags data offset.
157 src_reg
flags_offset(this, glsl_type::uint_type
);
158 emit(ADD(dst_reg(flags_offset
),
159 this->vertex_output_offset
, src_reg(prog_data
->vue_map
.num_slots
)));
161 src_reg
flags_data(this->vertex_output
);
162 flags_data
.reladdr
= ralloc(mem_ctx
, src_reg
);
163 memcpy(flags_data
.reladdr
, &flags_offset
, sizeof(src_reg
));
165 emit(GS_OPCODE_SET_DWORD_2
, dst_reg(MRF
, mrf
), flags_data
);
169 gen6_gs_visitor::emit_urb_write_opcode(bool complete
, src_reg vertex
,
170 int base_mrf
, int mlen
, int urb_offset
)
172 vec4_instruction
*inst
= NULL
;
174 /* If the vertex is not complete we don't have to do anything special */
176 inst
= emit(GS_OPCODE_URB_WRITE
);
177 inst
->urb_write_flags
= BRW_URB_WRITE_NO_FLAGS
;
178 inst
->base_mrf
= base_mrf
;
180 inst
->offset
= urb_offset
;
184 /* Otherwise, if this is not the last vertex we are going to write,
185 * we have to request a new VUE handle for the next vertex.
187 * Notice that the vertex parameter has been pre-incremented in
188 * emit_thread_end() to make this comparison easier.
190 emit(CMP(dst_null_d(), vertex
, this->vertex_count
, BRW_CONDITIONAL_L
));
191 emit(IF(BRW_PREDICATE_NORMAL
));
193 inst
= emit(GS_OPCODE_URB_WRITE_ALLOCATE
);
194 inst
->urb_write_flags
= BRW_URB_WRITE_COMPLETE
;
195 inst
->base_mrf
= base_mrf
;
197 inst
->offset
= urb_offset
;
198 inst
->dst
= dst_reg(MRF
, base_mrf
);
199 inst
->src
[0] = this->temp
;
201 emit(BRW_OPCODE_ELSE
);
203 inst
= emit(GS_OPCODE_URB_WRITE
);
204 inst
->urb_write_flags
= BRW_URB_WRITE_COMPLETE
;
205 inst
->base_mrf
= base_mrf
;
207 inst
->offset
= urb_offset
;
209 emit(BRW_OPCODE_ENDIF
);
213 gen6_gs_visitor::emit_thread_end()
216 * 1) Emit an FF_SYNC messsage to obtain an initial VUE handle.
217 * 2) Loop over all buffered vertex data and write it to corresponding
219 * 3) Allocate new VUE handles for all vertices other than the first.
220 * 4) Send a final EOT message.
223 /* MRF 0 is reserved for the debugger, so start with message header
228 /* In the process of generating our URB write message contents, we
229 * may need to unspill a register or load from an array. Those
230 * reads would use MRFs 14-15.
232 int max_usable_mrf
= 13;
234 /* Issue the FF_SYNC message and obtain the initial VUE handle. */
235 this->current_annotation
= "gen6 thread end: ff_sync";
236 vec4_instruction
*inst
=
237 emit(GS_OPCODE_FF_SYNC
, dst_reg(this->temp
), this->vertex_count
);
238 inst
->base_mrf
= base_mrf
;
240 /* Loop over all buffered vertices and emit URB write messages */
241 this->current_annotation
= "gen6 thread end: urb writes init";
242 src_reg
vertex(this, glsl_type::uint_type
);
243 emit(MOV(dst_reg(vertex
), 0u));
244 emit(MOV(dst_reg(this->vertex_output_offset
), 0u));
246 this->current_annotation
= "gen6 thread end: urb writes";
249 emit(CMP(dst_null_d(), vertex
, this->vertex_count
, BRW_CONDITIONAL_GE
));
250 inst
= emit(BRW_OPCODE_BREAK
);
251 inst
->predicate
= BRW_PREDICATE_NORMAL
;
253 /* First we prepare the message header */
254 emit_urb_write_header(base_mrf
);
256 /* Then add vertex data to the message in interleaved fashion */
258 bool complete
= false;
260 int mrf
= base_mrf
+ 1;
262 /* URB offset is in URB row increments, and each of our MRFs is half
263 * of one of those, since we're doing interleaved writes.
265 int urb_offset
= slot
/ 2;
267 for (; slot
< prog_data
->vue_map
.num_slots
; ++slot
) {
268 int varying
= prog_data
->vue_map
.slot_to_varying
[slot
];
269 current_annotation
= output_reg_annotation
[varying
];
271 /* Compute offset of this slot for the current vertex
274 src_reg
data(this->vertex_output
);
275 data
.reladdr
= ralloc(mem_ctx
, src_reg
);
276 memcpy(data
.reladdr
, &this->vertex_output_offset
, sizeof(src_reg
));
278 if (varying
== VARYING_SLOT_PSIZ
) {
279 /* We did not buffer PSIZ, emit it directly here */
280 emit_urb_slot(dst_reg(MRF
, mrf
), varying
);
282 /* Copy this slot to the appropriate message register */
283 dst_reg reg
= dst_reg(MRF
, mrf
);
284 reg
.type
= output_reg
[varying
].type
;
285 data
.type
= reg
.type
;
286 vec4_instruction
*inst
= emit(MOV(reg
, data
));
287 inst
->force_writemask_all
= true;
291 emit(ADD(dst_reg(this->vertex_output_offset
),
292 this->vertex_output_offset
, 1u));
294 /* If this was max_usable_mrf, we can't fit anything more into this
297 if (mrf
> max_usable_mrf
) {
303 complete
= slot
>= prog_data
->vue_map
.num_slots
;
305 /* When we emit the URB_WRITE below we need to do different things
306 * depending on whether this is the last vertex we are going to
307 * write. That means that we will need to check if
308 * vertex >= vertex_count - 1. However, by increasing vertex early
309 * we transform that comparison into vertex >= vertex_count, which
310 * is more convenient.
313 emit(ADD(dst_reg(vertex
), vertex
, 1u));
315 /* URB data written (does not include the message header reg) must
316 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
317 * section 5.4.3.2.2: URB_INTERLEAVED.
319 int mlen
= mrf
- base_mrf
;
322 emit_urb_write_opcode(complete
, vertex
, base_mrf
, mlen
, urb_offset
);
325 /* Skip over the flags data item so that vertex_output_offset points to
326 * the first data item of the next vertex, so that we can start writing
329 emit(ADD(dst_reg(this->vertex_output_offset
),
330 this->vertex_output_offset
, 1u));
332 emit(BRW_OPCODE_WHILE
);
334 /* Finally, emit EOT message.
336 * In gen6 it looks like we have to set the complete flag too, otherwise
339 this->current_annotation
= "gen6 thread end: EOT";
340 inst
= emit(GS_OPCODE_THREAD_END
);
341 inst
->urb_write_flags
= BRW_URB_WRITE_COMPLETE
;
342 inst
->base_mrf
= base_mrf
;
} /* namespace brw */