1 #include "sfn_vertexstageexport.h"
3 #include "sfn_shaderio.h"
7 using std::priority_queue
;
// Constructor of VertexStageExportBase; takes the owning VertexStage `proc`
// by reference. The member-initializer list and body are not visible in
// this listing.
9 VertexStageExportBase::VertexStageExportBase(VertexStage
& proc
):
// Destructor of VertexStageExportBase; its body is not visible in this listing.
17 VertexStageExportBase::~VertexStageExportBase()
// Constructor of the exporter used when the vertex stage feeds the fragment
// shader. Forwards `proc` to the base class, starts with no "last" param/pos
// export recorded, clears the enabled stream-buffer mask and keeps the
// `pipe_shader` pointer.
// NOTE(review): the initializers for `so_info` and `key` are not visible in
// this listing; presumably they are stored in m_so_info / m_key -- confirm.
22 VertexStageExportForFS::VertexStageExportForFS(VertexStage
& proc
,
23 const pipe_stream_output_info
*so_info
,
24 r600_pipe_shader
*pipe_shader
, const r600_shader_key
&key
):
25 VertexStageExportBase(proc
),
26 m_last_param_export(nullptr),
27 m_last_pos_export(nullptr),
29 m_enabled_stream_buffers_mask(0),
31 m_pipe_shader(pipe_shader
),
// Registers one NIR output variable in the shader's output table.
//
// For the varying slots listed in the first condition, the r600_shader_io
// entry at the variable's driver_location is filled in (semantic name/sid,
// SPI sid, write mask), the output count is incremented and, for most slots,
// a parameter export index is assigned.
// NOTE(review): the return statements and the handling of slots that do not
// match are not visible in this listing.
36 bool VertexStageExportBase::do_process_outputs(nir_variable
*output
)
// Accepted slots: COL0/1, VAR0..VAR31, TEX0..TEX7, BFC0/1, CLIP_VERTEX,
// CLIP_DIST0/1, POS, PSIZ, FOGC, LAYER, EDGE and VIEWPORT.
38 if (output
->data
.location
== VARYING_SLOT_COL0
||
39 output
->data
.location
== VARYING_SLOT_COL1
||
40 (output
->data
.location
>= VARYING_SLOT_VAR0
&&
41 output
->data
.location
<= VARYING_SLOT_VAR31
) ||
42 (output
->data
.location
>= VARYING_SLOT_TEX0
&&
43 output
->data
.location
<= VARYING_SLOT_TEX7
) ||
44 output
->data
.location
== VARYING_SLOT_BFC0
||
45 output
->data
.location
== VARYING_SLOT_BFC1
||
46 output
->data
.location
== VARYING_SLOT_CLIP_VERTEX
||
47 output
->data
.location
== VARYING_SLOT_CLIP_DIST0
||
48 output
->data
.location
== VARYING_SLOT_CLIP_DIST1
||
49 output
->data
.location
== VARYING_SLOT_POS
||
50 output
->data
.location
== VARYING_SLOT_PSIZ
||
51 output
->data
.location
== VARYING_SLOT_FOGC
||
52 output
->data
.location
== VARYING_SLOT_LAYER
||
53 output
->data
.location
== VARYING_SLOT_EDGE
||
54 output
->data
.location
== VARYING_SLOT_VIEWPORT
// The output table is indexed by driver_location; name and sid come from
// the (first, second) pair returned by r600_get_varying_semantic().
57 r600_shader_io
& io
= m_proc
.sh_info().output
[output
->data
.driver_location
];
58 auto semantic
= r600_get_varying_semantic(output
->data
.location
);
59 io
.name
= semantic
.first
;
60 io
.sid
= semantic
.second
;
62 m_proc
.evaluate_spi_sid(io
);
// One bit per component of the variable's type, shifted up to the first
// component the variable occupies within its slot (location_frac).
63 io
.write_mask
= ((1 << glsl_get_components(output
->type
)) - 1)
64 << output
->data
.location_frac
;
65 ++m_proc
.sh_info().noutput
;
// NOTE(review): the body of this PSIZ/EDGE/LAYER condition is not visible in
// this listing.
67 if (output
->data
.location
== VARYING_SLOT_PSIZ
||
68 output
->data
.location
== VARYING_SLOT_EDGE
||
69 output
->data
.location
== VARYING_SLOT_LAYER
) // VIEWPORT?
// Every slot except POS, EDGE, PSIZ and CLIP_VERTEX gets the next free
// parameter export index recorded in m_param_map.
72 if (output
->data
.location
!= VARYING_SLOT_POS
&&
73 output
->data
.location
!= VARYING_SLOT_EDGE
&&
74 output
->data
.location
!= VARYING_SLOT_PSIZ
&&
75 output
->data
.location
!= VARYING_SLOT_CLIP_VERTEX
)
76 m_param_map
[output
->data
.location
] = m_cur_param
++;
// Emits the export(s) for a store to the output variable `out_var`, selecting
// position and/or parameter exports according to the variable's varying slot.
// `instr` is the NIR store intrinsic; it is passed on to the emit_* helpers.
// Returns the result of the helper(s) that were called.
// NOTE(review): lines between the cases (closing braces, fall-through or
// break statements, the label before the final emit_varying_param and the
// return after the fprintf) are not visible in this listing.
84 bool VertexStageExportForFS::store_deref(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
87 switch (out_var
->data
.location
) {
// PSIZ: record in the shader info that point size and the misc vector are
// written. The statement following these two assignments is not visible.
88 case VARYING_SLOT_PSIZ
:
89 m_proc
.sh_info().vs_out_point_size
= 1;
90 m_proc
.sh_info().vs_out_misc_write
= 1;
92 case VARYING_SLOT_POS
:
93 return emit_varying_pos(out_var
, instr
);
// EDGE: position export with swizzle {7, 0, 7, 7}, i.e. source component 0
// is placed in channel 1 (7 presumably marks an unused channel -- confirm).
94 case VARYING_SLOT_EDGE
: {
95 std::array
<uint32_t, 4> swizzle_override
= {7 ,0, 7, 7};
96 return emit_varying_pos(out_var
, instr
, &swizzle_override
);
// VIEWPORT: source component 0 goes to channel 3 of a position export, and
// the value is also exported as a parameter.
98 case VARYING_SLOT_VIEWPORT
: {
99 std::array
<uint32_t, 4> swizzle_override
= {7, 7, 7, 0};
100 return emit_varying_pos(out_var
, instr
, &swizzle_override
) &&
101 emit_varying_param(out_var
, instr
);
103 case VARYING_SLOT_CLIP_VERTEX
:
104 return emit_clip_vertices(out_var
, instr
);
// Clip distances: each slot adds four distances to the running count and is
// exported both as a parameter and as a position export.
105 case VARYING_SLOT_CLIP_DIST0
:
106 case VARYING_SLOT_CLIP_DIST1
:
107 m_num_clip_dist
+= 4;
108 return emit_varying_param(out_var
, instr
) && emit_varying_pos(out_var
, instr
);
// LAYER: source component 0 goes to channel 2 of a position export, plus a
// parameter export.
109 case VARYING_SLOT_LAYER
: {
110 m_proc
.sh_info().vs_out_misc_write
= 1;
111 m_proc
.sh_info().vs_out_layer
= 1;
112 std::array
<uint32_t, 4> swz
= {7,7,0,7};
113 return emit_varying_pos(out_var
, instr
, &swz
) &&
114 emit_varying_param(out_var
, instr
);
116 case VARYING_SLOT_VIEW_INDEX
:
117 return emit_varying_pos(out_var
, instr
) &&
118 emit_varying_param(out_var
, instr
);
// Plain parameter export; the case label(s) guarding this return are not
// visible in this listing.
121 return emit_varying_param(out_var
, instr
);
// Reached for slots without an implementation: report the slot number.
124 fprintf(stderr
, "r600-NIR: Unimplemented store_deref for %d\n",
125 out_var
->data
.location
);
// Emits a position-type export (ExportInstruction::et_pos) for `out_var`.
//
// out_var          - output variable being stored to
// instr            - NIR store intrinsic; src[1] is the value that is loaded
// swizzle_override - optional fixed channel swizzle; when null the swizzle
//                    is derived from the intrinsic's write mask
//
// NOTE(review): the declaration of `export_slot`, the `else`, the bodies of
// several cases, the break/return statements and the final return value are
// not visible in this listing.
129 bool VertexStageExportForFS::emit_varying_pos(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
,
130 std::array
<uint32_t, 4> *swizzle_override
)
132 std::array
<uint32_t,4> swizzle
;
133 uint32_t write_mask
= 0;
// With an override, the swizzle is copied and the write mask is built up
// channel by channel (the per-channel condition is not visible here).
135 if (swizzle_override
) {
136 swizzle
= *swizzle_override
;
137 for (int i
= 0; i
< 4; ++i
) {
139 write_mask
|= 1 << i
;
// Without an override: shift the intrinsic's write mask by location_frac;
// each enabled channel i reads source component (i - location_frac), and
// disabled channels get swizzle 7.
142 write_mask
= nir_intrinsic_write_mask(instr
) << out_var
->data
.location_frac
;
143 for (int i
= 0; i
< 4; ++i
)
144 swizzle
[i
] = ((1 << i
) & write_mask
) ? i
- out_var
->data
.location_frac
: 7;
147 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
= write_mask
;
// Load the stored value into a GPR vector and record its register selector
// as the location of this output.
149 GPRVector value
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], write_mask
, swizzle
);
150 m_proc
.set_output(out_var
->data
.driver_location
, value
.sel());
154 switch (out_var
->data
.location
) {
// EDGE: channel 1 of the value is moved onto itself with the clamp flag set,
// then converted from float to int in place; the output's write mask is
// forced to 0xf.
155 case VARYING_SLOT_EDGE
: {
156 m_proc
.sh_info().vs_out_misc_write
= 1;
157 m_proc
.sh_info().vs_out_edgeflag
= 1;
158 m_proc
.emit_instruction(op1_mov
, value
.reg_i(1), {value
.reg_i(1)}, {alu_write
, alu_dst_clamp
, alu_last_instr
});
159 m_proc
.emit_instruction(op1_flt_to_int
, value
.reg_i(1), {value
.reg_i(1)}, {alu_write
, alu_last_instr
});
160 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
= 0xf;
163 case VARYING_SLOT_PSIZ
:
164 case VARYING_SLOT_LAYER
:
167 case VARYING_SLOT_VIEWPORT
:
168 m_proc
.sh_info().vs_out_misc_write
= 1;
169 m_proc
.sh_info().vs_out_viewport
= 1;
172 case VARYING_SLOT_POS
:
// Clip distances take the next clip position slot.
174 case VARYING_SLOT_CLIP_DIST0
:
175 case VARYING_SLOT_CLIP_DIST1
:
176 export_slot
= m_cur_clip_pos
++;
179 sfn_log
<< SfnLog::err
<< __func__
<< "Unsupported location "
180 << out_var
->data
.location
<< "\n";
// Create and emit the position export, remember it as the most recent one,
// and register its GPR as the register backing this output.
184 m_last_pos_export
= new ExportInstruction(export_slot
, value
, ExportInstruction::et_pos
);
185 m_proc
.emit_export_instruction(m_last_pos_export
);
186 m_proc
.add_param_output_reg(out_var
->data
.driver_location
, m_last_pos_export
->gpr_ptr());
// Emits a parameter-type export (ExportInstruction::et_param) for `out_var`.
//
// out_var - output variable being stored to; its driver_location must be
//           below the current output count and its location must already
//           have an entry in m_param_map (both are asserted)
// instr   - NIR store intrinsic; src[1] is the value that is loaded
//
// NOTE(review): the return statement is not visible in this listing.
190 bool VertexStageExportForFS::emit_varying_param(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
192 assert(out_var
->data
.driver_location
< m_proc
.sh_info().noutput
);
193 sfn_log
<< SfnLog::io
<< __func__
<< ": emit DDL: " << out_var
->data
.driver_location
<< "\n";
// Shift the intrinsic's write mask by location_frac; each enabled channel i
// reads source component (i - location_frac), disabled channels get 7.
195 int write_mask
= nir_intrinsic_write_mask(instr
) << out_var
->data
.location_frac
;
196 std::array
<uint32_t,4> swizzle
;
197 for (int i
= 0; i
< 4; ++i
)
198 swizzle
[i
] = ((1 << i
) & write_mask
) ? i
- out_var
->data
.location_frac
: 7;
200 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
= write_mask
;
// Load the value (the trailing `true` is an extra argument compared with the
// position path) and store its register selector in the output table.
202 GPRVector value
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], write_mask
, swizzle
, true);
203 m_proc
.sh_info().output
[out_var
->data
.driver_location
].gpr
= value
.sel();
205 /* This should use the registers!! */
206 m_proc
.set_output(out_var
->data
.driver_location
, value
.sel());
// The parameter index was assigned to this location beforehand; a missing
// entry is treated as a programming error.
208 auto param_loc
= m_param_map
.find(out_var
->data
.location
);
209 assert(param_loc
!= m_param_map
.end());
// Create and emit the parameter export, remember it as the most recent one,
// and register its GPR as the register backing this output.
211 m_last_param_export
= new ExportInstruction(param_loc
->second
, value
, ExportInstruction::et_param
);
212 m_proc
.emit_export_instruction(m_last_param_export
);
213 m_proc
.add_param_output_reg(out_var
->data
.driver_location
, m_last_param_export
->gpr_ptr());
// Handles a store to the clip-vertex output: computes eight dot products of
// the clip vertex with uniform values and exports them as two position
// exports.
//
// out_var - the clip-vertex output variable
// instr   - NIR store intrinsic; src[1] is the clip vertex value
//
// NOTE(review): the definitions of `oreg` and `ochan`, the loop-closing
// braces and the return statement are not visible in this listing.
217 bool VertexStageExportForFS::emit_clip_vertices(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
// All eight bits are set in both the cc_dist mask and clip_dist_write.
219 m_proc
.sh_info().cc_dist_mask
= 0xff;
220 m_proc
.sh_info().clip_dist_write
= 0xff;
// Load all four components of the clip vertex with an identity swizzle and
// register the member vector as the register backing this output.
222 m_clip_vertex
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], 0xf, {0,1,2,3});
223 m_proc
.add_param_output_reg(out_var
->data
.driver_location
, &m_clip_vertex
);
// Sets the low four bits of the output's write mask.
225 for (int i
= 0; i
< 4; ++i
)
226 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
|= 1 << i
;
// Two temporary vec4 registers receive the eight results.
228 GPRVector clip_dist
[2] = { m_proc
.get_temp_vec4(), m_proc
.get_temp_vec4()};
// For each of the eight results, issue a four-instruction op2_dot4_ieee
// group over the clip vertex and UniformValue(512 + i, j,
// R600_BUFFER_INFO_CONST_BUFFER). Only the instruction with j == ochan gets
// the write flag; the last instruction of the group is flagged as last.
230 for (int i
= 0; i
< 8; i
++) {
233 AluInstruction
*ir
= nullptr;
234 for (int j
= 0; j
< 4; j
++) {
235 ir
= new AluInstruction(op2_dot4_ieee
, clip_dist
[oreg
].reg_i(j
), m_clip_vertex
.reg_i(j
),
236 PValue(new UniformValue(512 + i
, j
, R600_BUFFER_INFO_CONST_BUFFER
)),
237 (j
== ochan
) ? EmitInstruction::write
: EmitInstruction::empty
);
238 m_proc
.emit_instruction(ir
);
240 ir
->set_flag(alu_last_instr
);
// Export both result vectors as position exports in consecutive clip
// position slots: the first export post-increments m_cur_clip_pos, the
// second uses the incremented value without advancing it again.
243 m_last_pos_export
= new ExportInstruction(m_cur_clip_pos
++, clip_dist
[0], ExportInstruction::et_pos
);
244 m_proc
.emit_export_instruction(m_last_pos_export
);
246 m_last_pos_export
= new ExportInstruction(m_cur_clip_pos
, clip_dist
[1], ExportInstruction::et_pos
);
247 m_proc
.emit_export_instruction(m_last_pos_export
);
// Finishes the export sequence of the vertex stage: optionally adds a
// primitive-id parameter export, publishes the stream-buffer mask, makes sure
// at least one parameter and one position export exist, and marks the final
// export of each kind as last.
// NOTE(review): the declaration of `i`, some assignments to `io`, the body of
// the stream-output condition and the closing braces are not visible in this
// listing.
252 void VertexStageExportForFS::finalize_exports()
// When the shader key has vs.as_gs_a set, export a vector made of the
// primitive id followed by three copies of GPRValue(0, PIPE_SWIZZLE_0) as a
// parameter at the current parameter index, and append a matching entry to
// the output table.
254 if (m_key
.vs
.as_gs_a
) {
255 PValue
o(new GPRValue(0,PIPE_SWIZZLE_0
));
256 GPRVector
primid({m_proc
.primitive_id(), o
,o
,o
});
257 m_last_param_export
= new ExportInstruction(m_cur_param
, primid
, ExportInstruction::et_param
);
258 m_proc
.emit_export_instruction(m_last_param_export
);
260 i
= m_proc
.sh_info().noutput
++;
261 auto& io
= m_proc
.sh_info().output
[i
];
262 io
.name
= TGSI_SEMANTIC_PRIMID
;
265 io
.interpolate
= TGSI_INTERPOLATE_CONSTANT
;
267 io
.spi_sid
= m_key
.vs
.prim_id_out
;
268 m_proc
.sh_info().vs_as_gs_a
= 1;
// Stream output is only relevant when stream-output info with at least one
// output is present (the statement guarded by this condition is not visible;
// presumably it calls emit_stream -- confirm).
271 if (m_so_info
&& m_so_info
->num_outputs
)
274 m_pipe_shader
->enabled_stream_buffers_mask
= m_enabled_stream_buffers_mask
;
// If no parameter export was emitted, emit one at index 0 from register 0
// with swizzle {7,7,7,7} so that there is an export to mark as last.
276 if (!m_last_param_export
) {
277 GPRVector
value(0,{7,7,7,7});
278 m_last_param_export
= new ExportInstruction(0, value
, ExportInstruction::et_param
);
279 m_proc
.emit_export_instruction(m_last_param_export
);
281 m_last_param_export
->set_last();
// Same for position exports.
283 if (!m_last_pos_export
) {
284 GPRVector
value(0,{7,7,7,7});
285 m_last_pos_export
= new ExportInstruction(0, value
, ExportInstruction::et_pos
);
286 m_proc
.emit_export_instruction(m_last_pos_export
);
288 m_last_pos_export
->set_last();
// Emits the stream-output (transform feedback) writes described by m_so_info.
//
// stream - stream to emit; -1 disables the per-output stream filter in the
//          setup loop
//
// Dereferences m_so_info without a null check, so callers must only call it
// when stream-output info is present.
// NOTE(review): the statements taken after the two error reports, the body
// of the stream filter, the null check around the "doesn't correspond"
// message, the end of the lowering branch and the return statement are not
// visible in this listing.
291 bool VertexStageExportForFS::emit_stream(int stream
)
// Validate the number of outputs and the buffer index of each output.
294 if (m_so_info
->num_outputs
> PIPE_MAX_SO_OUTPUTS
) {
295 R600_ERR("Too many stream outputs: %d\n", m_so_info
->num_outputs
);
298 for (unsigned i
= 0; i
< m_so_info
->num_outputs
; i
++) {
299 if (m_so_info
->output
[i
].output_buffer
>= 4) {
300 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
301 m_so_info
->output
[i
].output_buffer
);
// Per-output source vector and first component, plus one temporary vector
// per output for values that have to be moved before they can be written.
305 const GPRVector
*so_gpr
[PIPE_MAX_SHADER_OUTPUTS
];
306 unsigned start_comp
[PIPE_MAX_SHADER_OUTPUTS
];
307 std::vector
<GPRVector
> tmp(m_so_info
->num_outputs
);
309 /* Initialize locations where the outputs are stored. */
310 for (unsigned i
= 0; i
< m_so_info
->num_outputs
; i
++) {
// Stream filter: applies only when a specific stream was requested (the
// guarded statement is not visible; presumably it skips the output).
311 if (stream
!= -1 && stream
!= m_so_info
->output
[i
].stream
)
314 sfn_log
<< SfnLog::instr
<< "Emit stream " << i
315 << " with register index " << m_so_info
->output
[i
].register_index
<< " so_gpr:";
// Look up the register vector that holds this shader output.
318 so_gpr
[i
] = m_proc
.output_register(m_so_info
->output
[i
].register_index
);
321 sfn_log
<< SfnLog::err
<< "\nERR: register index "
322 << m_so_info
->output
[i
].register_index
323 << " doesn't correspond to an output register\n";
326 start_comp
[i
] = m_so_info
->output
[i
].start_component
;
327 /* Lower outputs with dst_offset < start_component.
329 * We can only output 4D vectors with a write mask, e.g. we can
330 * only output the W component at offset 3, etc. If we want
331 * to store Y, Z, or W at buffer offset 0, we need to use MOV
332 * to move it to X and output X. */
333 if (m_so_info
->output
[i
].dst_offset
< m_so_info
->output
[i
].start_component
) {
// Copy components [sc, sc + num_components) of the source into channels
// 0..num_components-1 of a freshly allocated temporary register.
334 int tmp_index
= m_proc
.allocate_temp_register();
335 int sc
= m_so_info
->output
[i
].start_component
;
336 AluInstruction
*alu
= nullptr;
337 for (int j
= 0; j
< m_so_info
->output
[i
].num_components
; j
++) {
338 PValue
dst(new GPRValue(tmp_index
, j
));
339 alu
= new AluInstruction(op1_mov
, dst
, so_gpr
[i
]->reg_i(j
+ sc
), {alu_write
});
340 tmp
[i
].set_reg_i(j
, dst
);
341 m_proc
.emit_instruction(alu
);
344 alu
->set_flag(alu_last_instr
);
346 /* Fill the vector with masked values */
347 PValue
dst_blank(new GPRValue(tmp_index
, 7));
348 for (int j
= m_so_info
->output
[i
].num_components
; j
< 4; j
++)
349 tmp
[i
].set_reg_i(j
, dst_blank
);
354 sfn_log
<< SfnLog::instr
<< *so_gpr
[i
] << "\n";
357 /* Write outputs to buffers. */
358 for (unsigned i
= 0; i
< m_so_info
->num_outputs
; i
++) {
359 sfn_log
<< SfnLog::instr
<< "Write output buffer " << i
360 << " with register index " << m_so_info
->output
[i
].register_index
<< "\n";
// Arguments after the source vector: component count, dst_offset reduced by
// the first component, a mask with num_components bits starting at
// start_comp[i], the target buffer and the stream.
362 StreamOutIntruction
*out_stream
=
363 new StreamOutIntruction(*so_gpr
[i
],
364 m_so_info
->output
[i
].num_components
,
365 m_so_info
->output
[i
].dst_offset
- start_comp
[i
],
366 ((1 << m_so_info
->output
[i
].num_components
) - 1) << start_comp
[i
],
367 m_so_info
->output
[i
].output_buffer
,
368 m_so_info
->output
[i
].stream
);
369 m_proc
.emit_export_instruction(out_stream
);
// Each stream owns a group of four bits in the mask, one bit per buffer.
370 m_enabled_stream_buffers_mask
|= (1 << m_so_info
->output
[i
].output_buffer
) << m_so_info
->output
[i
].stream
* 4;
// Constructor of the exporter used when the vertex stage feeds a geometry
// shader. Forwards `proc` to the base class and keeps the pointer to the
// geometry shader, whose inputs store_deref() matches outputs against.
// NOTE(review): one initializer and the body are not visible in this listing.
376 VertexStageExportForGS::VertexStageExportForGS(VertexStage
&proc
,
377 const r600_shader
*gs_shader
):
378 VertexStageExportBase(proc
),
380 m_gs_shader(gs_shader
)
// Writes a vertex-shader output to the memory ring at the offset of the
// geometry-shader input that has the same semantic name and sid.
//
// out_var - output variable being stored to
// instr   - NIR store intrinsic; src[1] is the value, num_components gives
//           the number of components written
//
// NOTE(review): what follows a successful match (e.g. a break), the
// statement after the "not consumed" message and the return statements are
// not visible in this listing.
385 bool VertexStageExportForGS::store_deref(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
// -1 means "no matching geometry-shader input found".
388 int ring_offset
= -1;
389 const r600_shader_io
& out_io
= m_proc
.sh_info().output
[out_var
->data
.driver_location
];
391 sfn_log
<< SfnLog::io
<< "check output " << out_var
->data
.driver_location
392 << " name=" << out_io
.name
<< " sid=" << out_io
.sid
<< "\n";
// Search the geometry shader's inputs for one with the same name and sid and
// take its ring offset.
393 for (unsigned k
= 0; k
< m_gs_shader
->ninput
; ++k
) {
394 auto& in_io
= m_gs_shader
->input
[k
];
395 sfn_log
<< SfnLog::io
<< " against " << k
<< " name=" << in_io
.name
<< " sid=" << in_io
.sid
<< "\n";
397 if (in_io
.name
== out_io
.name
&&
398 in_io
.sid
== out_io
.sid
) {
399 ring_offset
= in_io
.ring_offset
;
// A viewport write is recorded in the shader info.
404 if (out_var
->data
.location
== VARYING_SLOT_VIEWPORT
) {
405 m_proc
.sh_info().vs_out_viewport
= 1;
406 m_proc
.sh_info().vs_out_misc_write
= 1;
// No consumer on the geometry-shader side: log it.
410 if (ring_offset
== -1) {
411 sfn_log
<< SfnLog::err
<< "VS defines output at "
412 << out_var
->data
.driver_location
<< "name=" << out_io
.name
413 << " sid=" << out_io
.sid
<< " that is not consumed as GS input\n";
// One mask bit per component written by the intrinsic.
417 uint32_t write_mask
= (1 << instr
->num_components
) - 1;
419 GPRVector value
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], write_mask
,
420 swizzle_from_comps(instr
->num_components
), true);
// Ring write of the value; the ring offset is passed divided by four.
422 auto ir
= new MemRingOutIntruction(cf_mem_ring
, mem_write
, value
,
423 ring_offset
>> 2, 4, PValue());
424 m_proc
.emit_export_instruction(ir
);
// Accumulate the written components and count clip distances, four per slot.
426 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
|= write_mask
;
427 if (out_var
->data
.location
== VARYING_SLOT_CLIP_DIST0
||
428 out_var
->data
.location
== VARYING_SLOT_CLIP_DIST1
)
429 m_num_clip_dist
+= 4;
// finalize_exports() for the geometry-shader variant; its body is not
// visible in this listing.
434 void VertexStageExportForGS::finalize_exports()
// Constructor of the ES variant of the exporter; it only forwards `proc` to
// the base class as far as this listing shows.
439 VertexStageExportForES::VertexStageExportForES(VertexStage
& proc
):
440 VertexStageExportBase(proc
)
// store_deref() for the ES variant; takes the output variable and the NIR
// store intrinsic like the other variants. Its body is not visible in this
// listing.
444 bool VertexStageExportForES::store_deref(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
// finalize_exports() for the ES variant; its body is not visible in this
// listing.
449 void VertexStageExportForES::finalize_exports()