/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"
static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   struct ir2_reg_component *comps;
   unsigned swiz = 0;

   /* only SSA/register sources have allocated components to remap */
   switch (src->type) {
   case IR2_SRC_SSA:
   case IR2_SRC_REG:
      break;
   default:
      return src->swizzle;
   }

   /* we need to take into account where the components were allocated */
   comps = get_reg_src(ctx, src)->comp;
   for (int i = 0; i < ncomp; i++) {
      swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
   }
   return swiz;
}
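
/*
 * Worked example (hypothetical allocation): if a two-component value was
 * packed into .y and .w of its register (comps[0].c == 1, comps[1].c == 3),
 * a source swizzle of .xy is rewritten to .yw: logical component 0 is
 * fetched from channel Y and logical component 1 from channel W.
 */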
/* alu instrs need to take into account how the output components are
 * allocated */

/* scalar doesn't need to take into account dest swizzle */
static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
   /* hardware seems to take from W, but swizzle everywhere just in case */
   return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}
static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr,
            struct ir2_src *src)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
   unsigned swiz = 0;

   /* non per-component special cases */
   switch (instr->alu.vector_opc) {
   case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
      return alu_swizzle_scalar(ctx, src);
   default:
      break;
   }

   /* remap the swizzle to match the dest component allocation */
   for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
      if (instr->alu.write_mask & 1 << j) {
         swiz |= swiz_set(i, comp[j].c);
         i++;
      }
   }
   return swiz_merge(swiz0, swiz);
}
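
/*
 * Example (hypothetical): with write_mask 0b0101 and the two written
 * components allocated at comp[0].c == 2 and comp[2].c == 0, the loop
 * above builds a swizzle whose Z field selects result component 0 and
 * whose X field selects result component 1, which is then composed with
 * the source-allocation swizzle swiz0.
 */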
static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
   /* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
   unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
   return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}
/* write_mask needs to be transformed by allocation information */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned write_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (instr->alu.write_mask & 1 << i)
         write_mask |= 1 << comp[i].c;
   }

   return write_mask;
}
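
/*
 * Example (hypothetical allocation): a logical write_mask of .xy (0b0011)
 * with components allocated at comp[0].c == 1 and comp[1].c == 3 becomes
 * the hardware mask .yw (0b1010).
 */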
/* fetch instructions can swizzle dest, but src swizzle needs conversion */

static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
   unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
   unsigned swiz = 0;

   for (int i = 0; i < ncomp; i++)
      swiz |= swiz_get(alu_swiz, i) << i * 2;

   return swiz;
}
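
/*
 * The loop above repacks the per-component channel selects into the fetch
 * encoding, which uses 2 bits per source component. Assuming swiz_get()
 * yields plain 0-3 channel indices, a select sequence of (1, 2, 0) packs
 * to 1 << 0 | 2 << 2 | 0 << 4 = 0x9.
 */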
static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
   struct ir2_reg_component *comp = get_reg(instr)->comp;
   unsigned dst_swiz = 0xfff;

   for (int i = 0; i < dst_ncomp(instr); i++) {
      dst_swiz &= ~(7 << comp[i].c * 3);
      dst_swiz |= i << comp[i].c * 3;
   }

   return dst_swiz;
}
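
/*
 * Note: the fetch dest swizzle is four 3-bit fields, one per register
 * channel; 0xfff initializes every field to 7, which appears to leave
 * that channel unwritten, and the loop then points each allocated
 * channel at the corresponding fetch result component.
 */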
/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
   if (is_export(instr))
      return instr->alu.export;

   return get_reg(instr)->idx;
}
/* register # for src */
static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
   return get_reg_src(ctx, src)->idx;
}
static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
   if (src->type == IR2_SRC_CONST) {
      assert(!src->abs); /* no abs bit for const */
      return src->num;
   }
   return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}
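
/*
 * The top bit of the register-select byte carries the abs modifier (the
 * 0x80 above); const sources have no abs bit in the encoding, which is
 * why the assert rejects abs on them.
 */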
/* produce the 12 byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched,
           instr_t *bc, bool *is_fetch)
{
   struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

   *bc = (instr_t) {};
   *is_fetch = false;
   if (instr && instr->type == IR2_FETCH) {
      *is_fetch = true;

      bc->fetch.opc = instr->fetch.opc;
      bc->fetch.pred_select = !!instr->pred;
      bc->fetch.pred_condition = instr->pred & 1;

      struct ir2_src *src = instr->src;
      if (instr->fetch.opc == VTX_FETCH) {
         instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

         assert(instr->fetch.vtx.const_idx <= 0x1f);
         assert(instr->fetch.vtx.const_idx_sel <= 0x3);

         vtx->src_reg = src_to_reg(ctx, src);
         vtx->src_swiz = fetch_swizzle(ctx, src, 1);
         vtx->dst_reg = dst_to_reg(ctx, instr);
         vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

         vtx->must_be_one = 1;
         vtx->const_index = instr->fetch.vtx.const_idx;
         vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

         /* other fields will be patched */

         /* XXX seems like every FETCH but the first has these bits set */
         vtx->reserved3 = instr->idx ? 0x1 : 0x0;
         vtx->reserved0 = instr->idx ? 0x2 : 0x3;
      } else if (instr->fetch.opc == TEX_FETCH) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 3);
         tex->dst_reg = dst_to_reg(ctx, instr);
         tex->dst_swiz = fetch_dst_swiz(ctx, instr);
         /* tex->const_idx is filled in later, when fetches are patched */
         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
         tex->use_reg_lod = instr->src_count == 2;
         tex->sample_location = SAMPLE_CENTER;
         tex->tx_coord_denorm = instr->fetch.tex.is_rect;
      } else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
         instr_fetch_tex_t *tex = &bc->fetch.tex;

         tex->src_reg = src_to_reg(ctx, src);
         tex->src_swiz = fetch_swizzle(ctx, src, 1);
         tex->dst_swiz = 0xfff;

         tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
         tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
         tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
         tex->use_comp_lod = 1;
         tex->use_reg_lod = 0;
         tex->sample_location = SAMPLE_CENTER;
      }
      return;
   }
   instr_v = sched->instr;
   instr_s = sched->instr_s;

   if (instr_v) {
      struct ir2_src src1, src2, *src3;

      src1 = instr_v->src[0];
      src2 = instr_v->src[instr_v->src_count > 1];
      src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;
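
      /* note: when src_count == 1, the boolean index above makes src2
       * alias src[0]; src3 only exists for 3-source ops (e.g. MULADDv) */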
      bc->alu.vector_opc = instr_v->alu.vector_opc;
      bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
      bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
      bc->alu.vector_clamp = instr_v->alu.saturate;
      bc->alu.export_data = instr_v->alu.export >= 0;
      /* single operand SETEv, use 0.0f as src2 */
      if (instr_v->src_count == 1 &&
          (bc->alu.vector_opc == SETEv ||
           bc->alu.vector_opc == SETNEv ||
           bc->alu.vector_opc == SETGTv ||
           bc->alu.vector_opc == SETGTEv))
         src2 = ir2_zero(ctx);
      /* export32 instr for a20x hw binning has this bit set..
       * it seems to do more than change the base address of constants
       */
      bc->alu.relative_addr =
         (bc->alu.export_data && bc->alu.vector_dest == 32);
      bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
      bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
      bc->alu.src1_reg_negate = src1.negate;
      bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

      bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
      bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
      bc->alu.src2_reg_negate = src2.negate;
      bc->alu.src2_sel = src2.type != IR2_SRC_CONST;
      if (src3) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
         bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
         bc->alu.src3_reg_negate = src3->negate;
         bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
      }

      bc->alu.pred_select = instr_v->pred;
   }
   if (instr_s) {
      struct ir2_src *src = instr_s->src;

      bc->alu.scalar_opc = instr_s->alu.scalar_opc;
      bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
      bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
      bc->alu.scalar_clamp = instr_s->alu.saturate;
      bc->alu.export_data = instr_s->alu.export >= 0;
      if (instr_s->src_count == 1) {
         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      } else {
         assert(instr_s->src_count == 2);

         bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
         bc->alu.src3_swiz =
            alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
         bc->alu.src3_reg_negate = src->negate;
         bc->alu.src3_sel = src->type != IR2_SRC_CONST;
      }
      if (instr_v)
         assert(instr_s->pred == instr_v->pred);
      bc->alu.pred_select = instr_s->pred;
   }
}
static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
          instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
   if (alloc)
      cfs[cf_idx++].alloc = *alloc;

   /* for memory alloc offset for patching */
   if (alloc && alloc->buffer_select == SQ_MEMORY &&
       ctx->info->mem_export_ptr == -1)
      ctx->info->mem_export_ptr = cf_idx / 2 * 3;

   cfs[cf_idx++].exec = *exec;
   exec->address += exec->count;
   exec->serialize = 0;
   exec->count = 0;

   return cf_idx;
}
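
/*
 * Note: CF instructions are 48 bits wide, so consecutive pairs pack into
 * 3 dwords; that is why dword offsets are computed as idx / 2 * 3 here
 * and again in assemble() below.
 */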
/* assemble the final shader */
void assemble(struct ir2_context *ctx, bool binning)
{
   /* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
    * address is 9 bits so could it be 512 ?
    */
   instr_t bytecode[384], bc;
   instr_cf_t cfs[384];
   unsigned block_addr[128];
   unsigned num_cf = 0;

   /* CF state for the exec/alloc pairs being accumulated */
   instr_cf_exec_t exec = {.opc = EXEC};
   instr_cf_alloc_t alloc = {.opc = ALLOC};

   int sync_id, sync_id_prev = -1;
   bool is_fetch = false;
   bool need_sync = true;
   bool need_alloc = false;
   unsigned block_idx = 0;

   ctx->info->mem_export_ptr = -1;
   ctx->info->num_fetch_instrs = 0;

   /* a vertex shader always needs to allocate at least one parameter;
    * if that would otherwise never happen, emit the alloc up front
    */
   if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
      alloc.buffer_select = SQ_PARAMETER_PIXEL;
      cfs[num_cf++].alloc = alloc;
   }

   block_addr[0] = 0;
   for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
      struct ir2_instr *instr = ctx->instr_sched[j].instr;
      /* catch IR2_CF since it isn't a regular instruction */
      if (instr && instr->type == IR2_CF) {
         assert(!need_alloc); /* XXX */

         /* flush any exec cf before inserting jmp */
         if (exec.count)
            num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

         cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) {
            .opc = COND_JMP,
            .address = instr->cf.block_idx, /* will be fixed later */
            .force_call = !instr->pred,
            .direction = instr->cf.block_idx > instr->block_idx,
            .condition = instr->pred & 1,
         };
         continue;
      }
      /* fill the 3 dwords for the instruction */
      fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

      /* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
      sync_id = 0;
      if (is_fetch)
         sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

      need_sync = sync_id != sync_id_prev;
      sync_id_prev = sync_id;
      unsigned block;
      if (ctx->instr_sched[j].instr)
         block = ctx->instr_sched[j].instr->block_idx;
      else
         block = ctx->instr_sched[j].instr_s->block_idx;

      assert(block_idx <= block);
      /* info for patching */
      if (is_fetch) {
         struct ir2_fetch_info *info =
            &ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
         info->offset = i * 3; /* add cf offset later */

         if (bc.fetch.opc == VTX_FETCH) {
            info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
         } else if (bc.fetch.opc == TEX_FETCH) {
            info->tex.samp_id = instr->fetch.tex.samp_id;
            info->tex.src_swiz = bc.fetch.tex.src_swiz;
         } else {
            ctx->info->num_fetch_instrs--;
         }
      }
      /* exec cf after 6 instr or when switching between fetch / alu */
      if (exec.count == 6 ||
          (exec.count && (need_sync || block != block_idx))) {
         num_cf =
            write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
         need_alloc = false;
      }
      /* update block_addrs for jmp patching */
      while (block_idx < block)
         block_addr[++block_idx] = num_cf;
      /* export - fill alloc cf */
      if (!is_fetch && bc.alu.export_data) {
         /* get the export buffer from either vector/scalar dest */
         instr_alloc_type_t buffer = export_buf(bc.alu.vector_dest);
         if (bc.alu.scalar_write_mask) {
            if (bc.alu.vector_write_mask)
               assert(buffer == export_buf(bc.alu.scalar_dest));
            buffer = export_buf(bc.alu.scalar_dest);
         }

         /* flush previous alloc if the buffer changes */
         bool need_new_alloc = buffer != alloc.buffer_select;

         /* memory export always in 32/33 pair, new alloc on 32 */
         if (bc.alu.vector_dest == 32)
            need_new_alloc = true;

         if (need_new_alloc && exec.count) {
            num_cf =
               write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
            need_alloc = false;
         }

         need_alloc |= need_new_alloc;

         alloc.size = 0;
         alloc.buffer_select = buffer;

         if (buffer == SQ_PARAMETER_PIXEL &&
             ctx->so->type == MESA_SHADER_VERTEX)
            alloc.size = ctx->f->inputs_count - 1;

         if (buffer == SQ_POSITION)
            alloc.size = ctx->so->writes_psize;
      }
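
      /* note: alloc.size appears to encode "count - 1": inputs_count - 1
       * parameter vectors, and one extra export (point size) on top of
       * the position when writes_psize is set
       *
       * each instruction slot in an EXEC gets 2 bits in exec.serialize
       * below: bit 0 marks a serialized fetch, bit 1 requests a sync
       * after switching instruction type */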
      if (is_fetch)
         exec.serialize |= 0x1 << exec.count * 2;
      if (need_sync)
         exec.serialize |= 0x2 << exec.count * 2;

      bytecode[i++] = bc;
      exec.count++;
   }
   /* final exec cf: use EXEC_END so the hw knows the shader is done */
   exec.opc = EXEC_END;
   num_cf =
      write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
   /* insert nop to get an even # of CFs */
   if (num_cf % 2)
      cfs[num_cf++] = (instr_cf_t) {
         .opc = NOP
      };
   /* patch addresses now that the final CF count is known */
   for (int idx = 0; idx < num_cf; idx++) {
      switch (cfs[idx].opc) {
      case EXEC:
      case EXEC_END:
         cfs[idx].exec.address += num_cf / 2;
         break;
      case COND_JMP:
         cfs[idx].jmp_call.address =
            block_addr[cfs[idx].jmp_call.address];
         break;
      default:
         break;
      }
   }
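
/*
 * exec.address values were accumulated in units of one instruction
 * (3 dwords each); the CF block that precedes the instructions occupies
 * num_cf / 2 such 3-dword groups, so adding num_cf / 2 rebases the
 * addresses onto the final layout (assuming the hw counts exec addresses
 * in 3-dword units).
 */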
   /* concatenate cfs and alu/fetch */
   uint32_t cfdwords = num_cf / 2 * 3;
   uint32_t alufetchdwords = exec.address * 3;
   uint32_t sizedwords = cfdwords + alufetchdwords;
   uint32_t *dwords = malloc(sizedwords * 4);

   memcpy(dwords, cfs, cfdwords * 4);
   memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);
   /* finalize ir2_shader_info */
   ctx->info->dwords = dwords;
   ctx->info->sizedwords = sizedwords;
   for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
      ctx->info->fetch_info[i].offset += cfdwords;

   if (fd_mesa_debug & FD_DBG_DISASM) {
      DBG("disassemble: type=%d", ctx->so->type);
      disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
   }
}