1 #include "sfn_vertexstageexport.h"
3 #include "sfn_shaderio.h"
7 using std::priority_queue
;
/* Constructor of the export base class; takes the VertexStage `proc` by
 * reference. Presumably this is the object the m_proc member refers to
 * -- TODO confirm against the initializer list. */
9 VertexStageExportBase::VertexStageExportBase(VertexStage
& proc
):
/* Destructor of the export base class. */
17 VertexStageExportBase::~VertexStageExportBase()
/* Constructor of VertexStageExportForFS.
 *
 * Forwards `proc` to VertexStageExportBase and starts with no parameter or
 * position export recorded (both "last export" pointers are null) and an
 * empty mask of enabled stream-out buffers. `pipe_shader` is kept in
 * m_pipe_shader.
 * NOTE(review): `so_info` and `key` are presumably stored in m_so_info and
 * m_key, which other methods of this class read -- confirm. */
22 VertexStageExportForFS::VertexStageExportForFS(VertexStage
& proc
,
23 const pipe_stream_output_info
*so_info
,
24 r600_pipe_shader
*pipe_shader
, const r600_shader_key
&key
):
25 VertexStageExportBase(proc
),
26 m_last_param_export(nullptr),
27 m_last_pos_export(nullptr),
29 m_enabled_stream_buffers_mask(0),
31 m_pipe_shader(pipe_shader
),
/* Renumbers the entries of m_param_map: each location `loc` gets the next
 * value of an incrementing counter (next_param).
 * A min-first priority queue of ints (std::greater) is declared for this;
 * presumably the map's locations are pushed into it and popped in ascending
 * order so that parameter slots follow increasing location -- TODO confirm. */
36 void VertexStageExportBase::setup_paramn_map()
38 priority_queue
<int, std::vector
<int>, std::greater
<int>> q
;
39 for (auto a
: m_param_map
) {
47 m_param_map
[loc
] = next_param
++;
/* Records shader-IO information for one NIR output variable.
 *
 * The first condition accepts these varying slots: COL0/COL1, VAR0..VAR31,
 * TEX0..TEX7, BFC0/BFC1, CLIP_VERTEX, CLIP_DIST0/1, POS, PSIZ, FOGC, LAYER,
 * EDGE and VIEWPORT. For such an output the code
 *  - looks up the output record at output->data.driver_location,
 *  - sets its name/sid from r600_get_varying_semantic(),
 *  - calls evaluate_spi_sid() on the record,
 *  - computes the write mask from the component count and location_frac,
 *  - increments the shader's noutput counter,
 *  - assigns a parameter slot (m_cur_param++) in m_param_map unless the
 *    location is POS, EDGE, PSIZ or CLIP_VERTEX.
 *
 * NOTE(review): presumably returns whether the output was handled --
 * confirm against the return statements. */
51 bool VertexStageExportBase::do_process_outputs(nir_variable
*output
)
53 if (output
->data
.location
== VARYING_SLOT_COL0
||
54 output
->data
.location
== VARYING_SLOT_COL1
||
55 (output
->data
.location
>= VARYING_SLOT_VAR0
&&
56 output
->data
.location
<= VARYING_SLOT_VAR31
) ||
57 (output
->data
.location
>= VARYING_SLOT_TEX0
&&
58 output
->data
.location
<= VARYING_SLOT_TEX7
) ||
59 output
->data
.location
== VARYING_SLOT_BFC0
||
60 output
->data
.location
== VARYING_SLOT_BFC1
||
61 output
->data
.location
== VARYING_SLOT_CLIP_VERTEX
||
62 output
->data
.location
== VARYING_SLOT_CLIP_DIST0
||
63 output
->data
.location
== VARYING_SLOT_CLIP_DIST1
||
64 output
->data
.location
== VARYING_SLOT_POS
||
65 output
->data
.location
== VARYING_SLOT_PSIZ
||
66 output
->data
.location
== VARYING_SLOT_FOGC
||
67 output
->data
.location
== VARYING_SLOT_LAYER
||
68 output
->data
.location
== VARYING_SLOT_EDGE
||
69 output
->data
.location
== VARYING_SLOT_VIEWPORT
// Fill in the shader IO record that belongs to this output's driver location.
72 r600_shader_io
& io
= m_proc
.sh_info().output
[output
->data
.driver_location
];
73 auto semantic
= r600_get_varying_semantic(output
->data
.location
);
74 io
.name
= semantic
.first
;
75 io
.sid
= semantic
.second
;
77 m_proc
.evaluate_spi_sid(io
);
// One mask bit per component, shifted to the variable's first component.
78 io
.write_mask
= ((1 << glsl_get_components(output
->type
)) - 1)
79 << output
->data
.location_frac
;
80 ++m_proc
.sh_info().noutput
;
82 if (output
->data
.location
== VARYING_SLOT_PSIZ
||
83 output
->data
.location
== VARYING_SLOT_EDGE
||
84 output
->data
.location
== VARYING_SLOT_LAYER
) // VIEWPORT?
// Everything except POS, EDGE, PSIZ and CLIP_VERTEX gets the next parameter slot.
87 if (output
->data
.location
!= VARYING_SLOT_POS
&&
88 output
->data
.location
!= VARYING_SLOT_EDGE
&&
89 output
->data
.location
!= VARYING_SLOT_PSIZ
&&
90 output
->data
.location
!= VARYING_SLOT_CLIP_VERTEX
)
91 m_param_map
[output
->data
.location
] = m_cur_param
++;
/* Handles a store to the output variable `out_var` by dispatching on its
 * varying slot (out_var->data.location):
 *  - PSIZ: sets vs_out_point_size and vs_out_misc_write
 *    (NOTE(review): presumably continues into the POS case -- confirm),
 *  - POS: emit_varying_pos(),
 *  - EDGE: emit_varying_pos() with swizzle override {7,0,7,7},
 *  - VIEWPORT: emit_varying_pos() with override {7,7,7,0}, then
 *    emit_varying_param(),
 *  - CLIP_VERTEX: emit_clip_vertices(),
 *  - CLIP_DIST0/1: adds 4 to m_num_clip_dist, then emits a parameter and a
 *    position export,
 *  - LAYER: sets vs_out_misc_write and vs_out_layer, then emits a position
 *    export with override {7,7,0,7} and a parameter export,
 *  - VIEW_INDEX: emits a position and a parameter export.
 * A further path calls only emit_varying_param(), and an "Unimplemented
 * store_deref" message is printed to stderr for a location that is not
 * handled.
 *
 * In the combined cases the second emit call only runs if the first one
 * returned true (short-circuit &&). */
99 bool VertexStageExportForFS::store_deref(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
102 switch (out_var
->data
.location
) {
103 case VARYING_SLOT_PSIZ
:
104 m_proc
.sh_info().vs_out_point_size
= 1;
105 m_proc
.sh_info().vs_out_misc_write
= 1;
107 case VARYING_SLOT_POS
:
108 return emit_varying_pos(out_var
, instr
);
109 case VARYING_SLOT_EDGE
: {
110 std::array
<uint32_t, 4> swizzle_override
= {7 ,0, 7, 7};
111 return emit_varying_pos(out_var
, instr
, &swizzle_override
);
113 case VARYING_SLOT_VIEWPORT
: {
114 std::array
<uint32_t, 4> swizzle_override
= {7, 7, 7, 0};
115 return emit_varying_pos(out_var
, instr
, &swizzle_override
) &&
116 emit_varying_param(out_var
, instr
);
118 case VARYING_SLOT_CLIP_VERTEX
:
119 return emit_clip_vertices(out_var
, instr
);
120 case VARYING_SLOT_CLIP_DIST0
:
121 case VARYING_SLOT_CLIP_DIST1
:
122 m_num_clip_dist
+= 4;
123 return emit_varying_param(out_var
, instr
) && emit_varying_pos(out_var
, instr
);
124 case VARYING_SLOT_LAYER
: {
125 m_proc
.sh_info().vs_out_misc_write
= 1;
126 m_proc
.sh_info().vs_out_layer
= 1;
127 std::array
<uint32_t, 4> swz
= {7,7,0,7};
128 return emit_varying_pos(out_var
, instr
, &swz
) &&
129 emit_varying_param(out_var
, instr
);
131 case VARYING_SLOT_VIEW_INDEX
:
132 return emit_varying_pos(out_var
, instr
) &&
133 emit_varying_param(out_var
, instr
);
136 return emit_varying_param(out_var
, instr
);
139 fprintf(stderr
, "r600-NIR: Unimplemented store_deref for %d\n",
140 out_var
->data
.location
);
/* Emits a position-type export (ExportInstruction::et_pos) for `out_var`.
 *
 * Swizzle / write mask:
 *  - with `swizzle_override`, that array is used as the swizzle and the
 *    write mask is accumulated bit by bit over the four components
 *    (NOTE(review): presumably only for components whose override entry
 *    selects a real channel -- confirm);
 *  - otherwise the write mask is the intrinsic's write mask shifted by
 *    location_frac, and each written component i maps to source channel
 *    i - location_frac while unwritten components get the value 7.
 *
 * The write mask is stored in the output record, the stored value
 * (instr->src[1]) is loaded as a GPRVector and registered with set_output().
 *
 * Slot specific handling in the switch:
 *  - EDGE: sets vs_out_misc_write / vs_out_edgeflag, runs a MOV with
 *    alu_dst_clamp followed by flt_to_int on register 1 of the value, and
 *    sets the output write mask to 0xf,
 *  - VIEWPORT: sets vs_out_misc_write / vs_out_viewport,
 *  - CLIP_DIST0/1: takes the export slot from m_cur_clip_pos and
 *    post-increments it,
 *  - an "Unsupported location" error is logged for a location that is not
 *    handled.
 *
 * Finally the export instruction is created, remembered in
 * m_last_pos_export, emitted, and its GPR is registered through
 * add_param_output_reg(). */
144 bool VertexStageExportForFS::emit_varying_pos(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
,
145 std::array
<uint32_t, 4> *swizzle_override
)
147 std::array
<uint32_t,4> swizzle
;
148 uint32_t write_mask
= 0;
150 if (swizzle_override
) {
151 swizzle
= *swizzle_override
;
152 for (int i
= 0; i
< 4; ++i
) {
154 write_mask
|= 1 << i
;
157 write_mask
= nir_intrinsic_write_mask(instr
) << out_var
->data
.location_frac
;
158 for (int i
= 0; i
< 4; ++i
)
159 swizzle
[i
] = ((1 << i
) & write_mask
) ? i
- out_var
->data
.location_frac
: 7;
162 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
= write_mask
;
164 GPRVector value
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], write_mask
, swizzle
);
165 m_proc
.set_output(out_var
->data
.driver_location
, value
.sel());
169 switch (out_var
->data
.location
) {
170 case VARYING_SLOT_EDGE
: {
171 m_proc
.sh_info().vs_out_misc_write
= 1;
172 m_proc
.sh_info().vs_out_edgeflag
= 1;
173 m_proc
.emit_instruction(op1_mov
, value
.reg_i(1), {value
.reg_i(1)}, {alu_write
, alu_dst_clamp
, alu_last_instr
});
174 m_proc
.emit_instruction(op1_flt_to_int
, value
.reg_i(1), {value
.reg_i(1)}, {alu_write
, alu_last_instr
});
175 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
= 0xf;
178 case VARYING_SLOT_PSIZ
:
179 case VARYING_SLOT_LAYER
:
182 case VARYING_SLOT_VIEWPORT
:
183 m_proc
.sh_info().vs_out_misc_write
= 1;
184 m_proc
.sh_info().vs_out_viewport
= 1;
187 case VARYING_SLOT_POS
:
189 case VARYING_SLOT_CLIP_DIST0
:
190 case VARYING_SLOT_CLIP_DIST1
:
191 export_slot
= m_cur_clip_pos
++;
194 sfn_log
<< SfnLog::err
<< __func__
<< "Unsupported location "
195 << out_var
->data
.location
<< "\n";
199 m_last_pos_export
= new ExportInstruction(export_slot
, value
, ExportInstruction::et_pos
);
200 m_proc
.emit_export_instruction(m_last_pos_export
);
201 m_proc
.add_param_output_reg(out_var
->data
.driver_location
, m_last_pos_export
->gpr_ptr());
/* Emits a parameter-type export (ExportInstruction::et_param) for `out_var`.
 *
 * Steps visible here:
 *  - asserts that the variable's driver_location is below the number of
 *    registered outputs,
 *  - builds the write mask (intrinsic write mask shifted by location_frac)
 *    and a swizzle that maps each written component i to source channel
 *    i - location_frac and every unwritten component to 7,
 *  - stores the write mask and the value's register selector in the output
 *    record and registers the value with set_output(),
 *  - looks up the parameter slot for the variable's location in m_param_map
 *    (asserted to exist),
 *  - creates the export, remembers it in m_last_param_export, emits it and
 *    registers its GPR via add_param_output_reg(). */
205 bool VertexStageExportForFS::emit_varying_param(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
207 assert(out_var
->data
.driver_location
< m_proc
.sh_info().noutput
);
208 sfn_log
<< SfnLog::io
<< __func__
<< ": emit DDL: " << out_var
->data
.driver_location
<< "\n";
210 int write_mask
= nir_intrinsic_write_mask(instr
) << out_var
->data
.location_frac
;
211 std::array
<uint32_t,4> swizzle
;
212 for (int i
= 0; i
< 4; ++i
)
213 swizzle
[i
] = ((1 << i
) & write_mask
) ? i
- out_var
->data
.location_frac
: 7;
215 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
= write_mask
;
217 GPRVector value
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], write_mask
, swizzle
);
218 m_proc
.sh_info().output
[out_var
->data
.driver_location
].gpr
= value
.sel();
220 /* This should use the registers!! */
221 m_proc
.set_output(out_var
->data
.driver_location
, value
.sel());
// The parameter slot must have been assigned to this location beforehand.
223 auto param_loc
= m_param_map
.find(out_var
->data
.location
);
224 assert(param_loc
!= m_param_map
.end());
226 m_last_param_export
= new ExportInstruction(param_loc
->second
, value
, ExportInstruction::et_param
);
227 m_proc
.emit_export_instruction(m_last_param_export
);
228 m_proc
.add_param_output_reg(out_var
->data
.driver_location
, m_last_param_export
->gpr_ptr());
/* Handles a store to the clip vertex output by computing eight values and
 * exporting them as two position exports.
 *
 *  - Sets cc_dist_mask and clip_dist_write to 0xff.
 *  - Loads all four components of the stored value (instr->src[1]).
 *  - Sets write-mask bits 0..3 of the output record.
 *  - For i = 0..7 emits a group of four op2_dot4_ieee ALU instructions that
 *    combine the clip vertex with UniformValue(512 + i, j,
 *    R600_BUFFER_INFO_CONST_BUFFER); only the instruction with j == ochan
 *    has the write flag, and the last one of the group gets alu_last_instr.
 *    NOTE(review): oreg / ochan pick the destination vector and channel for
 *    value i, presumably i / 4 and i % 4 -- confirm.
 *  - Exports clip_dist[0] at slot m_cur_clip_pos (post-incremented) and
 *    clip_dist[1] at the following slot, both as et_pos. */
232 bool VertexStageExportForFS::emit_clip_vertices(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
234 m_proc
.sh_info().cc_dist_mask
= 0xff;
235 m_proc
.sh_info().clip_dist_write
= 0xff;
237 GPRVector clip_vertex
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], 0xf, {0,1,2,3});
239 for (int i
= 0; i
< 4; ++i
)
240 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
|= 1 << i
;
242 GPRVector clip_dist
[2] = { m_proc
.get_temp_vec4(), m_proc
.get_temp_vec4()};
244 for (int i
= 0; i
< 8; i
++) {
247 AluInstruction
*ir
= nullptr;
248 for (int j
= 0; j
< 4; j
++) {
249 ir
= new AluInstruction(op2_dot4_ieee
, clip_dist
[oreg
].reg_i(j
), clip_vertex
.reg_i(j
),
250 PValue(new UniformValue(512 + i
, j
, R600_BUFFER_INFO_CONST_BUFFER
)),
251 (j
== ochan
) ? EmitInstruction::write
: EmitInstruction::empty
);
252 m_proc
.emit_instruction(ir
);
254 ir
->set_flag(alu_last_instr
);
257 m_last_pos_export
= new ExportInstruction(m_cur_clip_pos
++, clip_dist
[0], ExportInstruction::et_pos
);
258 m_proc
.emit_export_instruction(m_last_pos_export
);
260 m_last_pos_export
= new ExportInstruction(m_cur_clip_pos
, clip_dist
[1], ExportInstruction::et_pos
);
261 m_proc
.emit_export_instruction(m_last_pos_export
);
/* Finishes the export sequence of the vertex stage.
 *
 *  - If the shader key has vs.as_gs_a set, a parameter export at slot
 *    m_cur_param is emitted with the primitive id in the first component
 *    and GPRValue(0, PIPE_SWIZZLE_0) in the other three. A new output
 *    record is appended (name TGSI_SEMANTIC_PRIMID, constant interpolation,
 *    spi_sid taken from key.vs.prim_id_out) and vs_as_gs_a is set.
 *  - Stream output is checked via m_so_info (non-null with num_outputs != 0),
 *    and the collected buffer mask is copied to
 *    m_pipe_shader->enabled_stream_buffers_mask.
 *    NOTE(review): presumably the stream-out instructions are emitted
 *    inside that check -- confirm.
 *  - If no parameter export was emitted so far, an export at slot 0 with
 *    GPRVector(0, {7,7,7,7}) is emitted; the last parameter export is then
 *    marked with set_last(). The same is done for position exports. */
266 void VertexStageExportForFS::finalize_exports()
268 if (m_key
.vs
.as_gs_a
) {
269 PValue
o(new GPRValue(0,PIPE_SWIZZLE_0
));
270 GPRVector
primid({m_proc
.primitive_id(), o
,o
,o
});
271 m_last_param_export
= new ExportInstruction(m_cur_param
, primid
, ExportInstruction::et_param
);
272 m_proc
.emit_export_instruction(m_last_param_export
);
274 i
= m_proc
.sh_info().noutput
++;
275 auto& io
= m_proc
.sh_info().output
[i
];
276 io
.name
= TGSI_SEMANTIC_PRIMID
;
279 io
.interpolate
= TGSI_INTERPOLATE_CONSTANT
;
281 io
.spi_sid
= m_key
.vs
.prim_id_out
;
282 m_proc
.sh_info().vs_as_gs_a
= 1;
285 if (m_so_info
&& m_so_info
->num_outputs
)
288 m_pipe_shader
->enabled_stream_buffers_mask
= m_enabled_stream_buffers_mask
;
// Make sure at least one parameter export exists before flagging the last one.
290 if (!m_last_param_export
) {
291 GPRVector
value(0,{7,7,7,7});
292 m_last_param_export
= new ExportInstruction(0, value
, ExportInstruction::et_param
);
293 m_proc
.emit_export_instruction(m_last_param_export
);
295 m_last_param_export
->set_last();
// Same for position exports.
297 if (!m_last_pos_export
) {
298 GPRVector
value(0,{7,7,7,7});
299 m_last_pos_export
= new ExportInstruction(0, value
, ExportInstruction::et_pos
);
300 m_proc
.emit_export_instruction(m_last_pos_export
);
302 m_last_pos_export
->set_last();
/* Emits the stream-output (transform feedback) writes described by m_so_info.
 *
 * `stream`: stream to emit. The filter in the initialization loop compares
 * an output's stream with `stream` only when `stream` is not -1, so -1
 * presumably selects all streams -- confirm.
 *
 * Visible steps:
 *  1. Validation: reports an error via R600_ERR if there are more than
 *     PIPE_MAX_SO_OUTPUTS outputs or if an output uses a buffer index >= 4.
 *  2. Initialization: for each output the source register is looked up with
 *     output_register(register_index) (an error is logged when the index
 *     does not correspond to an output register) and its start component is
 *     recorded. If dst_offset < start_component, the components are copied
 *     with MOVs into a freshly allocated temporary starting at channel 0,
 *     and the remaining channels of the temporary vector are filled with a
 *     value using channel 7.
 *  3. Writing: one StreamOutIntruction per output is emitted, and the bit
 *     (1 << output_buffer) << (stream * 4) is added to
 *     m_enabled_stream_buffers_mask. */
305 bool VertexStageExportForFS::emit_stream(int stream
)
308 if (m_so_info
->num_outputs
> PIPE_MAX_SO_OUTPUTS
) {
309 R600_ERR("Too many stream outputs: %d\n", m_so_info
->num_outputs
);
312 for (unsigned i
= 0; i
< m_so_info
->num_outputs
; i
++) {
313 if (m_so_info
->output
[i
].output_buffer
>= 4) {
314 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
315 m_so_info
->output
[i
].output_buffer
);
319 const GPRVector
*so_gpr
[PIPE_MAX_SHADER_OUTPUTS
];
320 unsigned start_comp
[PIPE_MAX_SHADER_OUTPUTS
];
321 std::vector
<GPRVector
> tmp(m_so_info
->num_outputs
);
323 /* Initialize locations where the outputs are stored. */
324 for (unsigned i
= 0; i
< m_so_info
->num_outputs
; i
++) {
325 if (stream
!= -1 && stream
!= m_so_info
->output
[i
].stream
)
328 sfn_log
<< SfnLog::instr
<< "Emit stream " << i
329 << " with register index " << m_so_info
->output
[i
].register_index
<< " so_gpr:";
332 so_gpr
[i
] = m_proc
.output_register(m_so_info
->output
[i
].register_index
);
335 sfn_log
<< SfnLog::err
<< "\nERR: register index "
336 << m_so_info
->output
[i
].register_index
337 << " doesn't correspond to an output register\n";
340 start_comp
[i
] = m_so_info
->output
[i
].start_component
;
341 /* Lower outputs with dst_offset < start_component.
343 * We can only output 4D vectors with a write mask, e.g. we can
344 * only output the W component at offset 3, etc. If we want
345 * to store Y, Z, or W at buffer offset 0, we need to use MOV
346 * to move it to X and output X. */
347 if (m_so_info
->output
[i
].dst_offset
< m_so_info
->output
[i
].start_component
) {
348 int tmp_index
= m_proc
.allocate_temp_register();
349 int sc
= m_so_info
->output
[i
].start_component
;
350 AluInstruction
*alu
= nullptr;
351 for (int j
= 0; j
< m_so_info
->output
[i
].num_components
; j
++) {
352 PValue
dst(new GPRValue(tmp_index
, j
));
353 alu
= new AluInstruction(op1_mov
, dst
, so_gpr
[i
]->reg_i(j
+ sc
), {alu_write
});
354 tmp
[i
].set_reg_i(j
, dst
);
355 m_proc
.emit_instruction(alu
);
358 alu
->set_flag(alu_last_instr
);
360 /* Fill the vector with masked values */
361 PValue
dst_blank(new GPRValue(tmp_index
, 7));
362 for (int j
= m_so_info
->output
[i
].num_components
; j
< 4; j
++)
363 tmp
[i
].set_reg_i(j
, dst_blank
);
368 sfn_log
<< SfnLog::instr
<< *so_gpr
[i
] << "\n";
371 /* Write outputs to buffers. */
372 for (unsigned i
= 0; i
< m_so_info
->num_outputs
; i
++) {
373 sfn_log
<< SfnLog::instr
<< "Write output buffer " << i
374 << " with register index " << m_so_info
->output
[i
].register_index
<< "\n";
376 StreamOutIntruction
*out_stream
=
377 new StreamOutIntruction(*so_gpr
[i
],
378 m_so_info
->output
[i
].num_components
,
379 m_so_info
->output
[i
].dst_offset
- start_comp
[i
],
380 ((1 << m_so_info
->output
[i
].num_components
) - 1) << start_comp
[i
],
381 m_so_info
->output
[i
].output_buffer
,
382 m_so_info
->output
[i
].stream
);
383 m_proc
.emit_export_instruction(out_stream
);
// Each stream owns a group of four bits in the mask, one bit per buffer.
384 m_enabled_stream_buffers_mask
|= (1 << m_so_info
->output
[i
].output_buffer
) << m_so_info
->output
[i
].stream
* 4;
/* Constructor of VertexStageExportForGS: forwards `proc` to
 * VertexStageExportBase and keeps the pointer to the r600_shader
 * `gs_shader` in m_gs_shader (its inputs are matched against the vertex
 * stage outputs in store_deref()). */
390 VertexStageExportForGS::VertexStageExportForGS(VertexStage
&proc
,
391 const r600_shader
*gs_shader
):
392 VertexStageExportBase(proc
),
393 m_gs_shader(gs_shader
)
/* Handles a store to the output variable `out_var` by writing the value to
 * the memory ring at the offset of the matching m_gs_shader input.
 *
 *  - Searches the inputs of m_gs_shader for one whose name and sid match
 *    the output record of this variable and takes its ring_offset
 *    (ring_offset stays -1 if nothing matches).
 *  - For VARYING_SLOT_VIEWPORT sets vs_out_viewport and vs_out_misc_write.
 *  - If no matching input was found, an error is logged.
 *    NOTE(review): the message prints "name=" without a leading space, so
 *    it runs into the driver location number in the log output.
 *  - Otherwise the value (instr->src[1]) is loaded with a write mask that
 *    covers instr->num_components components and written with a
 *    MemRingOutIntruction (cf_mem_ring, mem_write) at ring_offset >> 2.
 *  - The write mask is OR-ed into the output record, and for CLIP_DIST0/1
 *    m_num_clip_dist is increased by 4. */
398 bool VertexStageExportForGS::store_deref(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
401 int ring_offset
= -1;
402 const r600_shader_io
& out_io
= m_proc
.sh_info().output
[out_var
->data
.driver_location
];
404 sfn_log
<< SfnLog::io
<< "check output " << out_var
->data
.driver_location
405 << " name=" << out_io
.name
<< " sid=" << out_io
.sid
<< "\n";
406 for (unsigned k
= 0; k
< m_gs_shader
->ninput
; ++k
) {
407 auto& in_io
= m_gs_shader
->input
[k
];
408 sfn_log
<< SfnLog::io
<< " against " << k
<< " name=" << in_io
.name
<< " sid=" << in_io
.sid
<< "\n";
410 if (in_io
.name
== out_io
.name
&&
411 in_io
.sid
== out_io
.sid
) {
412 ring_offset
= in_io
.ring_offset
;
417 if (out_var
->data
.location
== VARYING_SLOT_VIEWPORT
) {
418 m_proc
.sh_info().vs_out_viewport
= 1;
419 m_proc
.sh_info().vs_out_misc_write
= 1;
423 if (ring_offset
== -1) {
424 sfn_log
<< SfnLog::err
<< "VS defines output at "
425 << out_var
->data
.driver_location
<< "name=" << out_io
.name
426 << " sid=" << out_io
.sid
<< " that is not consumed as GS input\n";
430 uint32_t write_mask
= (1 << instr
->num_components
) - 1;
432 GPRVector value
= m_proc
.vec_from_nir_with_fetch_constant(instr
->src
[1], write_mask
,
433 swizzle_from_comps(instr
->num_components
));
435 auto ir
= new MemRingOutIntruction(cf_mem_ring
, mem_write
, value
,
436 ring_offset
>> 2, 4, PValue());
437 m_proc
.emit_export_instruction(ir
);
439 m_proc
.sh_info().output
[out_var
->data
.driver_location
].write_mask
|= write_mask
;
440 if (out_var
->data
.location
== VARYING_SLOT_CLIP_DIST0
||
441 out_var
->data
.location
== VARYING_SLOT_CLIP_DIST1
)
442 m_num_clip_dist
+= 4;
/* finalize_exports() of VertexStageExportForGS; takes no arguments and
 * returns nothing. */
447 void VertexStageExportForGS::finalize_exports()
/* Constructor of VertexStageExportForES: only forwards `proc` to
 * VertexStageExportBase. */
452 VertexStageExportForES::VertexStageExportForES(VertexStage
& proc
):
453 VertexStageExportBase(proc
)
/* store_deref() of VertexStageExportForES; same signature as the
 * store_deref() of VertexStageExportForFS and VertexStageExportForGS. */
457 bool VertexStageExportForES::store_deref(const nir_variable
*out_var
, nir_intrinsic_instr
* instr
)
/* finalize_exports() of VertexStageExportForES; takes no arguments and
 * returns nothing. */
462 void VertexStageExportForES::finalize_exports()