/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

static bool scalar_possible(struct ir2_instr *instr)
{
   if (instr->alu.scalar_opc == SCALAR_NONE)
      return false;

   return src_ncomp(instr) == 1;
}

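/* Note (added): a2xx ALU slots can co-issue one vector and one scalar
 * operation, and the scheduler below exploits this by pairing instructions.
 * An instruction is a candidate for the scalar pipe only if it has a scalar
 * opcode at all (scalar_opc != SCALAR_NONE) and reads a single component,
 * which is what scalar_possible() checks.
 */
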
static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
{
   if (!a)
      return true;

   /* don't use the same instruction twice */
   if (a == b)
      return false;

   /* PRED_SET must be alone */
   if (b->alu.scalar_opc >= PRED_SETEs &&
       b->alu.scalar_opc <= PRED_SET_RESTOREs)
      return false;

   /* must write to same export (issues otherwise?) */
   return a->alu.export == b->alu.export;
}

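/* Note (added): the export check above is conservative. A co-issued
 * vector/scalar pair is encoded as a single ALU instruction, which is
 * presumably why both halves must agree on the export target; the original
 * "(issues otherwise?)" comment suggests the exact hardware constraint was
 * not fully known.
 */
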
/* priority of vector instruction for scheduling (lower = higher priority) */
static unsigned alu_vector_prio(struct ir2_instr *instr)
{
   if (instr->alu.vector_opc == VECTOR_NONE)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* TODO check src type and ncomps */
   if (instr->src_count == 3)
      return 0;

   if (!scalar_possible(instr))
      return 1;

   return instr->src_count == 2 ? 2 : 3;
}

/* priority of scalar instruction for scheduling (lower = higher priority) */
static unsigned alu_scalar_prio(struct ir2_instr *instr)
{
   if (!scalar_possible(instr))
      return ~0u;

   /* this case is dealt with later */
   if (instr->src_count > 1)
      return ~0u;

   if (is_export(instr))
      return 4;

   /* PRED goes to the end of the block */
   if (instr->alu.scalar_opc >= PRED_SETEs &&
       instr->alu.scalar_opc <= PRED_SET_RESTOREs)
      return 5;

   /* scalar-only instructions have the highest priority */
   return instr->alu.vector_opc == VECTOR_NONE ? 0 : 3;
}

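/* Summary of the priority values above (explanatory note, derived from the
 * two functions; the numbers only matter relative to each other):
 *   0: best candidate (3-src vector op / scalar-only op)
 *   1: vector op with no scalar equivalent
 *   2: 2-src vector op that could also go to the scalar pipe
 *   3: 1-src op that fits either pipe
 *   4: exports, scheduled late
 *   5: PRED_SET*, pushed to the end of the block
 *  ~0: not schedulable on this pipe
 */
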
/* this is a bit messy:
 * we want to find a slot where we can insert a scalar MOV with
 * a vector instruction that was already scheduled
 */
static struct ir2_sched_instr *
insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
       struct ir2_src src1, unsigned *comp)
{
   struct ir2_sched_instr *sched = NULL, *s;
   unsigned i, mask = 0xf;

   /* find the earliest point where the mov can be inserted */
   for (i = ctx->instr_sched_count - 1; i > 0; i--) {
      s = &ctx->instr_sched[i - 1];

      if (s->instr && s->instr->block_idx != block_idx)
         break;
      if (s->instr_s && s->instr_s->block_idx != block_idx)
         break;

      if (src1.type == IR2_SRC_SSA) {
         if ((s->instr && s->instr->idx == src1.num) ||
             (s->instr_s && s->instr_s->idx == src1.num))
            break;
      }

      unsigned mr = ~(s->reg_state[reg_idx / 8] >> reg_idx % 8 * 4 & 0xf);
      if ((mask & mr) == 0)
         break;

      mask &= mr;
      if (s->instr_s || s->instr->src_count == 3)
         continue;

      if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
         continue;

      sched = s;
   }
   *comp = ffs(mask) - 1;

   if (sched) {
      /* mark the chosen component as used from the insertion point onward */
      for (s = sched; s != &ctx->instr_sched[ctx->instr_sched_count]; s++)
         s->reg_state[reg_idx / 8] |= 1 << (*comp + reg_idx % 8 * 4);
   }

   return sched;
}

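/* Layout note (added): reg_state packs a 4-bit component mask per register
 * (one bit per x/y/z/w component), eight registers per 32-bit word, which
 * is why insert() indexes with reg_idx/8 and shifts by reg_idx%8*4 to test
 * or reserve a component.
 */
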
/* case1:
 * in this case, insert a mov to place the 2nd src into the same reg
 * (scalar sources come from the same register)
 *
 * this is a common case which works when one of the srcs is an input/const,
 * but for instrs which have 2 ssa/reg srcs it's not ideal
 */
static bool
scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
{
   struct ir2_src src0 = instr->src[ order];
   struct ir2_src src1 = instr->src[!order];
   struct ir2_sched_instr *sched;
   struct ir2_instr *ins;
   struct ir2_reg *reg;
   unsigned idx, comp;

   /* src0 must have a register we can mov src1 into */
   switch (src0.type) {
   case IR2_SRC_CONST:
   case IR2_SRC_INPUT:
      return false;
   default:
      break;
   }

   /* TODO, insert needs logic for this */
   if (src1.type == IR2_SRC_REG)
      return false;

   /* we could do something if they match src1.. */
   if (src0.negate || src0.abs)
      return false;

   reg = get_reg_src(ctx, &src0);

   /* result not used anymore since we will overwrite */
   for (int i = 0; i < 4; i++)
      if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
         return false;

   /* find a place to insert the mov */
   sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
   if (!sched)
      return false;

   /* create the scalar mov that copies src1 into the free component */
   ins = &ctx->instr[idx = ctx->instr_count++];
   ins->idx = idx;
   ins->type = IR2_ALU;
   ins->src[0] = src1;
   ins->src_count = 1;
   ins->is_ssa = true;
   ins->ssa.idx = reg->idx;
   ins->ssa.ncomp = 1;
   ins->ssa.comp[0].c = comp;
   ins->alu.scalar_opc = MAXs;
   ins->alu.export = -1;
   ins->alu.write_mask = 1;
   ins->pred = instr->pred;
   ins->block_idx = instr->block_idx;

   instr->src[0] = src0;
   instr->alu.src1_swizzle = comp;

   sched->instr_s = ins;
   return true;
}

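/* Illustration (hypothetical IR, not from the original source): for a
 * two-src instruction such as
 *    mins ssa_7.x, ssa_4.x, const[2].x
 * scalarize_case1() inserts a co-issued MAXs mov so that both operands end
 * up in ssa_4's register:
 *    (MAXs) mov ssa_4.y, const[2].x
 *    mins ssa_7.x, ssa_4.x, ssa_4.y
 * satisfying the scalar pipe's requirement that both sources come from the
 * same register.
 */
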
/* fill sched with the next fetch or (vector and/or scalar) alu instruction */
static int sched_next(struct ir2_context *ctx, struct ir2_sched_instr *sched)
{
   struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
   unsigned avail_count = 0;

   instr_alloc_type_t export = ~0u;
   int block_idx = -1;

   /* XXX merge this loop with the other one somehow? */
   ir2_foreach_instr(instr, ctx) {
      if (!instr->need_emit)
         continue;
      if (is_export(instr))
         export = MIN2(export, export_buf(instr->alu.export));
   }

   ir2_foreach_instr(instr, ctx) {
      if (!instr->need_emit)
         continue;

      /* don't mix exports */
      if (is_export(instr) && export_buf(instr->alu.export) != export)
         continue;

      if (block_idx < 0)
         block_idx = instr->block_idx;
      else if (block_idx != instr->block_idx || /* must be same block */
               instr->type == IR2_CF ||         /* CF/MEM must be alone */
               (is_export(instr) && export == SQ_MEMORY))
         break;
      /* this works because IR2_CF is always at the end of a block,
       * and it's somewhat the same idea with MEM exports, which might not
       * be alone but will at least end up in order
       */

      /* check if dependencies are satisfied */
      bool is_ok = true;
      ir2_foreach_src(src, instr) {
         if (src->type == IR2_SRC_REG) {
            /* need to check if all previous instructions in the block
             * which write the reg have been emitted
             * slow..
             * XXX: check components instead of whole register
             */
            struct ir2_reg *reg = get_reg_src(ctx, src);
            ir2_foreach_instr(p, ctx) {
               if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
                  is_ok &= !p->need_emit;
            }
         } else if (src->type == IR2_SRC_SSA) {
            /* in this case it's easy, just check need_emit */
            is_ok &= !ctx->instr[src->num].need_emit;
         }
      }
      if (!is_ok)
         continue;

      avail[avail_count++] = instr;
   }

   if (!avail_count) {
      assert(block_idx == -1);
      return -1;
   }

   /* priority to FETCH instructions */
   ir2_foreach_avail(instr) {
      if (instr->type == IR2_ALU)
         continue;

      ra_src_free(ctx, instr);
      ra_reg(ctx, get_reg(instr), -1, false, 0);

      instr->need_emit = false;
      sched->instr = instr;
      sched->instr_s = NULL;
      return block_idx;
   }

   /* TODO precompute priorities */

   unsigned prio_v = ~0u, prio_s = ~0u, prio;
   ir2_foreach_avail(instr) {
      prio = alu_vector_prio(instr);
      if (prio < prio_v) {
         instr_v = instr;
         prio_v = prio;
      }
   }

   /* TODO can still insert scalar if src_count=3, if smart about it */
   if (!instr_v || instr_v->src_count < 3) {
      ir2_foreach_avail(instr) {
         bool compat = is_alu_compatible(instr_v, instr);

         prio = alu_scalar_prio(instr);
         if (prio >= prio_v && !compat)
            continue;

         if (prio < prio_s) {
            instr_s = instr;
            prio_s = prio;
            if (!compat)
               instr_v = NULL;
         }
      }
   }

   assert(instr_v || instr_s);

   /* now, try the more complex insertion of a vector instruction as scalar
    * TODO: if we are smart we can still insert if instr_v->src_count==3
    */
   if (!instr_s && instr_v->src_count < 3) {
      ir2_foreach_avail(instr) {
         if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
            continue;

         /* at this point, src_count should always be 2 */
         assert(instr->src_count == 2);

         if (scalarize_case1(ctx, instr, 0)) {
            instr_s = instr;
            break;
         }
         if (scalarize_case1(ctx, instr, 1)) {
            instr_s = instr;
            break;
         }
      }
   }

   /* free src registers */
   if (instr_v) {
      instr_v->need_emit = false;
      ra_src_free(ctx, instr_v);
   }
   if (instr_s) {
      instr_s->need_emit = false;
      ra_src_free(ctx, instr_s);
   }

   /* allocate dst registers */
   if (instr_v)
      ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v),
             instr_v->alu.write_mask);
   if (instr_s)
      ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s),
             instr_s->alu.write_mask);

   sched->instr = instr_v;
   sched->instr_s = instr_s;
   return block_idx;
}

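/* Note (added): sched_next() returns the block index of what it scheduled,
 * or -1 when nothing is left to emit. A filled slot is either a lone fetch
 * instruction, or a vector ALU instruction in sched->instr optionally
 * paired with a co-issued scalar ALU instruction in sched->instr_s.
 */
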
/* scheduling: determine order of instructions */
static void schedule_instrs(struct ir2_context *ctx)
{
   struct ir2_sched_instr *sched;
   int block_idx;

   /* allocate input registers */
   for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
      if (ctx->input[idx].initialized)
         ra_reg(ctx, &ctx->input[idx], idx, false, 0);

   for (;;) {
      sched = &ctx->instr_sched[ctx->instr_sched_count++];
      block_idx = sched_next(ctx, sched);
      if (block_idx < 0)
         break;
      memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));

      /* catch texture fetch after scheduling and insert the
       * SET_TEX_LOD right before it if necessary
       */
      struct ir2_instr *instr = sched->instr, *tex_lod;
      if (instr && instr->type == IR2_FETCH &&
          instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
         /* generate the SET_LOD instruction */
         tex_lod = &ctx->instr[ctx->instr_count++];
         tex_lod->type = IR2_FETCH;
         tex_lod->block_idx = instr->block_idx;
         tex_lod->pred = instr->pred;
         tex_lod->fetch.opc = TEX_SET_TEX_LOD;
         tex_lod->src[0] = instr->src[1];
         tex_lod->src_count = 1;

         /* shift the fetch down one slot and put SET_TEX_LOD in its place */
         sched[1] = sched[0];
         sched->instr = tex_lod;
         ctx->instr_sched_count++;
      }

      bool free_block = true;
      ir2_foreach_instr(instr, ctx)
         free_block &= instr->block_idx != block_idx;
      if (free_block)
         ra_block_free(ctx, block_idx);
   }
   /* drop the slot allocated for the final, failed sched_next() call */
   ctx->instr_sched_count--;
}

void
ir2_compile(struct fd2_shader_stateobj *so, unsigned variant,
            struct fd2_shader_stateobj *fp)
{
   struct ir2_context ctx = { };
   bool binning = !fp && so->type == MESA_SHADER_VERTEX;

   if (fp)
      so->variant[variant].f = fp->variant[0].f;

   ctx.so = so;
   ctx.info = &so->variant[variant].info;
   ctx.f = &so->variant[variant].f;
   ctx.info->max_reg = -1;

   /* convert nir to internal representation */
   ir2_nir_compile(&ctx, binning);

   /* copy propagate srcs */
   cp_src(&ctx);

   /* get ref_counts and kill non-needed instructions */
   ref_cnts(&ctx);

   /* remove movs used to write outputs */
   cp_export(&ctx);

   /* determine instruction order, doing vector->scalar conversions */
   schedule_instrs(&ctx);

   /* finally, assemble to bitcode */
   assemble(&ctx, binning);
}