/*
 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Jonathan Marek <jonathan@marek.ca>
 */

#include "ir2_private.h"

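/* remap a source swizzle to account for where the source's components were
 * actually allocated (SSA/REG sources may live in different channels)
 */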
static unsigned
src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
	struct ir2_reg_component *comps;
	unsigned swiz = 0;

	switch (src->type) {
	case IR2_SRC_SSA:
	case IR2_SRC_REG:
		break;
	default:
		return src->swizzle;
	}
	/* we need to take into account where the components were allocated */
	comps = get_reg_src(ctx, src)->comp;
	for (int i = 0; i < ncomp; i++) {
		swiz |= swiz_set(comps[swiz_get(src->swizzle, i)].c, i);
	}
	return swiz;
}

/* ALU instrs need to take into account how the output components are allocated */

/* scalar doesn't need to take into account dest swizzle */

static unsigned
alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
{
	/* hardware seems to take from W, but swizzle everywhere just in case */
	return swiz_merge(src_swizzle(ctx, reg, 1), IR2_SWIZZLE_XXXX);
}

static unsigned
alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src)
{
	struct ir2_reg_component *comp = get_reg(instr)->comp;
	unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
	unsigned swiz = 0;

	/* non per component special cases */
	switch (instr->alu.vector_opc) {
	case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
		return alu_swizzle_scalar(ctx, src);
	case DOT2ADDv:
	case DOT3v:
	case DOT4v:
	case CUBEv:
		return swiz0;
	default:
		break;
	}

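	/* compose the source swizzle with the dest allocation: logical output
	 * component i was allocated to channel comp[j].c, so that channel must
	 * read the source component selected for i
	 */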
	for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
		if (instr->alu.write_mask & 1 << j) {
			if (comp[j].c != 7)
				swiz |= swiz_set(i, comp[j].c);
			i++;
		}
	}
	return swiz_merge(swiz0, swiz);
}

static unsigned
alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
{
	/* hardware seems to take from ZW, but swizzle everywhere (ABAB) */
	unsigned s0 = swiz_get(src_swizzle(ctx, src, 1), 0);
	return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
}

/* write_mask needs to be transformed by allocation information */

static unsigned
alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
{
	struct ir2_reg_component *comp = get_reg(instr)->comp;
	unsigned write_mask = 0;

	for (int i = 0; i < 4; i++) {
		if (instr->alu.write_mask & 1 << i)
			write_mask |= 1 << comp[i].c;
	}

	return write_mask;
}

/* fetch instructions can swizzle dest, but src swizzle needs conversion */

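/* fetch src swizzles are encoded as 2 bits per component, packed from bit 0 */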
static unsigned
fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
{
	unsigned alu_swiz = src_swizzle(ctx, src, ncomp);
	unsigned swiz = 0;
	for (int i = 0; i < ncomp; i++)
		swiz |= swiz_get(alu_swiz, i) << i * 2;
	return swiz;
}

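/* fetch dst swizzles are 3 bits per channel; the 0xfff default leaves every
 * channel unwritten, and each allocated channel is pointed at fetch result
 * component i
 */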
static unsigned
fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
{
	struct ir2_reg_component *comp = get_reg(instr)->comp;
	unsigned dst_swiz = 0xfff;
	for (int i = 0; i < dst_ncomp(instr); i++) {
		dst_swiz &= ~(7 << comp[i].c * 3);
		dst_swiz |= i << comp[i].c * 3;
	}
	return dst_swiz;
}

/* register / export # for instr */
static unsigned
dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
{
	if (is_export(instr))
		return instr->alu.export;

	return get_reg(instr)->idx;
}

/* register # for src */
static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
{
	return get_reg_src(ctx, src)->idx;
}

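/* full src register byte: the register index, with the abs modifier in bit 7
 * (constants use the whole byte as an index and have no abs bit)
 */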
static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
{
	if (src->type == IR2_SRC_CONST) {
		assert(!src->abs); /* no abs bit for const */
		return src->num;
	}
	return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
}

/* produce the 12 byte binary instruction for a given sched_instr */
static void
fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched,
		   instr_t *bc, bool *is_fetch)
{
	struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;

	*bc = (instr_t) {};

	if (instr && instr->type == IR2_FETCH) {
		*is_fetch = true;

		bc->fetch.opc = instr->fetch.opc;
		bc->fetch.pred_select = !!instr->pred;
		bc->fetch.pred_condition = instr->pred & 1;

		struct ir2_src *src = instr->src;

		if (instr->fetch.opc == VTX_FETCH) {
			instr_fetch_vtx_t *vtx = &bc->fetch.vtx;

			assert(instr->fetch.vtx.const_idx <= 0x1f);
			assert(instr->fetch.vtx.const_idx_sel <= 0x3);

			vtx->src_reg = src_to_reg(ctx, src);
			vtx->src_swiz = fetch_swizzle(ctx, src, 1);
			vtx->dst_reg = dst_to_reg(ctx, instr);
			vtx->dst_swiz = fetch_dst_swiz(ctx, instr);

			vtx->must_be_one = 1;
			vtx->const_index = instr->fetch.vtx.const_idx;
			vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;

			/* other fields will be patched */

			/* XXX seems like every FETCH but the first has
			 * this bit set:
			 */
			vtx->reserved3 = instr->idx ? 0x1 : 0x0;
			vtx->reserved0 = instr->idx ? 0x2 : 0x3;
		} else if (instr->fetch.opc == TEX_FETCH) {
			instr_fetch_tex_t *tex = &bc->fetch.tex;

			tex->src_reg = src_to_reg(ctx, src);
			tex->src_swiz = fetch_swizzle(ctx, src, 3);
			tex->dst_reg = dst_to_reg(ctx, instr);
			tex->dst_swiz = fetch_dst_swiz(ctx, instr);
			/* tex->const_idx = patch_fetches */
			tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
			tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
			tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->use_comp_lod = ctx->so->type == MESA_SHADER_FRAGMENT;
			tex->use_reg_lod = instr->src_count == 2;
			tex->sample_location = SAMPLE_CENTER;
			tex->tx_coord_denorm = instr->fetch.tex.is_rect;
		} else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
			instr_fetch_tex_t *tex = &bc->fetch.tex;

			tex->src_reg = src_to_reg(ctx, src);
			tex->src_swiz = fetch_swizzle(ctx, src, 1);
			tex->dst_reg = 0;
			tex->dst_swiz = 0xfff;

			tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
			tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
			tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
			tex->use_comp_lod = 1;
			tex->use_reg_lod = 0;
			tex->sample_location = SAMPLE_CENTER;
		} else {
			assert(0);
		}
		return;
	}

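	/* an ALU slot can co-issue a vector instruction and a scalar instruction */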
	instr_v = sched->instr;
	instr_s = sched->instr_s;

	if (instr_v) {
		struct ir2_src src1, src2, *src3;

		src1 = instr_v->src[0];
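		/* with a single source operand, src2 is just a copy of src1 */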
		src2 = instr_v->src[instr_v->src_count > 1];
		src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;

		bc->alu.vector_opc = instr_v->alu.vector_opc;
		bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
		bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
		bc->alu.vector_clamp = instr_v->alu.saturate;
		bc->alu.export_data = instr_v->alu.export >= 0;

		/* single operand SETEv, use 0.0f as src2 */
		if (instr_v->src_count == 1 &&
			(bc->alu.vector_opc == SETEv ||
			 bc->alu.vector_opc == SETNEv ||
			 bc->alu.vector_opc == SETGTv ||
			 bc->alu.vector_opc == SETGTEv))
			src2 = ir2_zero(ctx);

		/* export32 instr for a20x hw binning has this bit set..
		 * it seems to do more than change the base address of constants
		 * XXX this is a hack
		 */
		bc->alu.relative_addr =
			(bc->alu.export_data && bc->alu.vector_dest == 32);

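		/* srcN_sel selects a register source (1) vs a constant source (0) */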
		bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
		bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
		bc->alu.src1_reg_negate = src1.negate;
		bc->alu.src1_sel = src1.type != IR2_SRC_CONST;

		bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
		bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
		bc->alu.src2_reg_negate = src2.negate;
		bc->alu.src2_sel = src2.type != IR2_SRC_CONST;

		if (src3) {
			bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
			bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
			bc->alu.src3_reg_negate = src3->negate;
			bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
		}

		bc->alu.pred_select = instr_v->pred;
	}

	if (instr_s) {
		struct ir2_src *src = instr_s->src;

		bc->alu.scalar_opc = instr_s->alu.scalar_opc;
		bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
		bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
		bc->alu.scalar_clamp = instr_s->alu.saturate;
		bc->alu.export_data = instr_s->alu.export >= 0;

		if (instr_s->src_count == 1) {
			bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
			bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
			bc->alu.src3_reg_negate = src->negate;
			bc->alu.src3_sel = src->type != IR2_SRC_CONST;
		} else {
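			/* two-source scalar op: the second operand's component is
			 * passed in alu.src1_swizzle and packed into the src3 swizzle
			 * by alu_swizzle_scalar2
			 */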
			assert(instr_s->src_count == 2);

			bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
			bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
			bc->alu.src3_reg_negate = src->negate;
			bc->alu.src3_sel = src->type != IR2_SRC_CONST;
		}

		if (instr_v)
			assert(instr_s->pred == instr_v->pred);
		bc->alu.pred_select = instr_s->pred;
	}

	*is_fetch = false;
	return;
}

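/* append an optional ALLOC CF and an EXEC CF, then reset the exec state for
 * the next group; two CFs pack into three dwords, which is why dword offsets
 * are computed as cf_idx / 2 * 3
 */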
static unsigned
write_cfs(struct ir2_context *ctx, instr_cf_t *cfs, unsigned cf_idx,
		  instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
{
	assert(exec->count);

	if (alloc)
		cfs[cf_idx++].alloc = *alloc;

	/* for memory alloc offset for patching */
	if (alloc && alloc->buffer_select == SQ_MEMORY &&
		ctx->info->mem_export_ptr == -1)
		ctx->info->mem_export_ptr = cf_idx / 2 * 3;

	cfs[cf_idx++].exec = *exec;
	exec->address += exec->count;
	exec->serialize = 0;
	exec->count = 0;

	return cf_idx;
}

/* assemble the final shader */
void assemble(struct ir2_context *ctx, bool binning)
{
	/* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
	 * address is 9 bits so could it be 512 ?
	 */
	instr_cf_t cfs[384];
	instr_t bytecode[384], bc;
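	/* block_addr[] records, for each ir2 block, the CF index at which the
	 * block starts; used below to patch jump targets
	 */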
	unsigned block_addr[128];
	unsigned num_cf = 0;

	/* CF instr state */
	instr_cf_exec_t exec = {.opc = EXEC};
	instr_cf_alloc_t alloc = {.opc = ALLOC};

	int sync_id, sync_id_prev = -1;
	bool is_fetch = false;
	bool need_sync = true;
	bool need_alloc = false;
	unsigned block_idx = 0;

	ctx->info->mem_export_ptr = -1;
	ctx->info->num_fetch_instrs = 0;

	/* the vertex shader always needs to allocate at least one parameter,
	 * even if no parameter will ever be exported
	 */
	if (ctx->so->type == MESA_SHADER_VERTEX && ctx->f->inputs_count == 0) {
		alloc.buffer_select = SQ_PARAMETER_PIXEL;
		cfs[num_cf++].alloc = alloc;
	}

	block_addr[0] = 0;

	for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
		struct ir2_instr *instr = ctx->instr_sched[j].instr;

		/* catch IR2_CF since it isn't a regular instruction */
		if (instr && instr->type == IR2_CF) {
			assert(!need_alloc); /* XXX */

			/* flush any exec cf before inserting jmp */
			if (exec.count)
				num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);

			cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) {
				.opc = COND_JMP,
				.address = instr->cf.block_idx, /* will be fixed later */
				.force_call = !instr->pred,
				.predicated_jmp = 1,
				.direction = instr->cf.block_idx > instr->block_idx,
				.condition = instr->pred & 1,
			};
			continue;
		}

		/* fill the 3 dwords for the instruction */
		fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);

		/* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
		sync_id = 0;
		if (is_fetch)
			sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;

		need_sync = sync_id != sync_id_prev;
		sync_id_prev = sync_id;

		unsigned block;
		if (ctx->instr_sched[j].instr)
			block = ctx->instr_sched[j].instr->block_idx;
		else
			block = ctx->instr_sched[j].instr_s->block_idx;

		assert(block_idx <= block);

		/* info for patching */
		if (is_fetch) {
			struct ir2_fetch_info *info =
				&ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
			info->offset = i * 3; /* add cf offset later */

			if (bc.fetch.opc == VTX_FETCH) {
				info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
			} else if (bc.fetch.opc == TEX_FETCH) {
				info->tex.samp_id = instr->fetch.tex.samp_id;
				info->tex.src_swiz = bc.fetch.tex.src_swiz;
			} else {
				ctx->info->num_fetch_instrs--;
			}
		}

		/* exec cf after 6 instr or when switching between fetch / alu */
		if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) {
			num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
			need_alloc = false;
		}

		/* update block_addrs for jmp patching */
		while (block_idx < block)
			block_addr[++block_idx] = num_cf;

		/* export - fill alloc cf */
		if (!is_fetch && bc.alu.export_data) {
			/* get the export buffer from either vector/scalar dest */
			instr_alloc_type_t buffer =
				export_buf(bc.alu.vector_dest);
			if (bc.alu.scalar_write_mask) {
				if (bc.alu.vector_write_mask)
					assert(buffer == export_buf(bc.alu.scalar_dest));
				buffer = export_buf(bc.alu.scalar_dest);
			}

			/* flush previous alloc if the buffer changes */
			bool need_new_alloc = buffer != alloc.buffer_select;

			/* memory export always in 32/33 pair, new alloc on 32 */
			if (bc.alu.vector_dest == 32)
				need_new_alloc = true;

			if (need_new_alloc && exec.count) {
				num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
				need_alloc = false;
			}

			need_alloc |= need_new_alloc;

			alloc.size = 0;
			alloc.buffer_select = buffer;

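			/* alloc size is encoded as count - 1: one parameter per
			 * fragment shader input, and position plus optional point size
			 */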
			if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == MESA_SHADER_VERTEX)
				alloc.size = ctx->f->inputs_count - 1;

			if (buffer == SQ_POSITION)
				alloc.size = ctx->so->writes_psize;
		}

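		/* two serialize bits per exec slot: 0x1 marks a fetch instruction,
		 * 0x2 marks a point where a sync between ALU/fetch types is needed
		 */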
		if (is_fetch)
			exec.serialize |= 0x1 << exec.count * 2;
		if (need_sync)
			exec.serialize |= 0x2 << exec.count * 2;

		need_sync = false;
		exec.count += 1;
		bytecode[i++] = bc;
	}

	/* final exec cf */
	exec.opc = EXEC_END;
	num_cf =
		write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);

	/* insert nop to get an even # of CFs */
	if (num_cf % 2)
		cfs[num_cf++] = (instr_cf_t) {.opc = NOP};

	/* patch cf addrs */
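	/* EXEC addresses count 3-dword instruction slots; the ALU/fetch
	 * instructions are placed after the CFs, which occupy num_cf / 2 slots
	 */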
	for (int idx = 0; idx < num_cf; idx++) {
		switch (cfs[idx].opc) {
		case NOP:
		case ALLOC:
			break;
		case EXEC:
		case EXEC_END:
			cfs[idx].exec.address += num_cf / 2;
			break;
		case COND_JMP:
			cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
			break;
		default:
			assert(0);
		}
	}

	/* concatenate cfs and alu/fetch */
	uint32_t cfdwords = num_cf / 2 * 3;
	uint32_t alufetchdwords = exec.address * 3;
	uint32_t sizedwords = cfdwords + alufetchdwords;
	uint32_t *dwords = malloc(sizedwords * 4);
	assert(dwords);
	memcpy(dwords, cfs, cfdwords * 4);
	memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);

	/* finalize ir2_shader_info */
	ctx->info->dwords = dwords;
	ctx->info->sizedwords = sizedwords;
	for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
		ctx->info->fetch_info[i].offset += cfdwords;

	if (fd_mesa_debug & FD_DBG_DISASM) {
		DBG("disassemble: type=%d", ctx->so->type);
		disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
	}
}