/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
25 #include "util/u_format.h"
26 #include "util/u_math.h"
27 #include "util/u_memory.h"
28 #include "util/ralloc.h"
29 #include "util/hash_table.h"
30 #include "compiler/nir/nir.h"
31 #include "compiler/nir/nir_builder.h"
32 #include "common/v3d_device_info.h"
33 #include "v3d_compiler.h"
35 /* We don't do any address packing. */
36 #define __gen_user_data void
37 #define __gen_address_type uint32_t
38 #define __gen_address_offset(reloc) (*reloc)
39 #define __gen_emit_reloc(cl, reloc)
40 #include "cle/v3d_packet_v33_pack.h"
43 ntq_get_src(struct v3d_compile
*c
, nir_src src
, int i
);
45 ntq_emit_cf_list(struct v3d_compile
*c
, struct exec_list
*list
);
48 resize_qreg_array(struct v3d_compile
*c
,
53 if (*size
>= decl_size
)
56 uint32_t old_size
= *size
;
57 *size
= MAX2(*size
* 2, decl_size
);
58 *regs
= reralloc(c
, *regs
, struct qreg
, *size
);
60 fprintf(stderr
, "Malloc failure\n");
64 for (uint32_t i
= old_size
; i
< *size
; i
++)
65 (*regs
)[i
] = c
->undef
;
69 vir_SFU(struct v3d_compile
*c
, int waddr
, struct qreg src
)
71 vir_FMOV_dest(c
, vir_reg(QFILE_MAGIC
, waddr
), src
);
72 return vir_FMOV(c
, vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_R4
));
76 vir_LDTMU(struct v3d_compile
*c
)
78 vir_NOP(c
)->qpu
.sig
.ldtmu
= true;
79 return vir_MOV(c
, vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_R4
));
83 indirect_uniform_load(struct v3d_compile
*c
, nir_intrinsic_instr
*intr
)
85 struct qreg indirect_offset
= ntq_get_src(c
, intr
->src
[0], 0);
86 uint32_t offset
= nir_intrinsic_base(intr
);
87 struct v3d_ubo_range
*range
= NULL
;
90 for (i
= 0; i
< c
->num_ubo_ranges
; i
++) {
91 range
= &c
->ubo_ranges
[i
];
92 if (offset
>= range
->src_offset
&&
93 offset
< range
->src_offset
+ range
->size
) {
97 /* The driver-location-based offset always has to be within a declared
100 assert(i
!= c
->num_ubo_ranges
);
101 if (!c
->ubo_range_used
[i
]) {
102 c
->ubo_range_used
[i
] = true;
103 range
->dst_offset
= c
->next_ubo_dst_offset
;
104 c
->next_ubo_dst_offset
+= range
->size
;
107 offset
-= range
->src_offset
;
109 if (range
->dst_offset
+ offset
!= 0) {
110 indirect_offset
= vir_ADD(c
, indirect_offset
,
111 vir_uniform_ui(c
, range
->dst_offset
+
115 /* Adjust for where we stored the TGSI register base. */
117 vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_TMUA
),
118 vir_uniform(c
, QUNIFORM_UBO_ADDR
, 0),
125 ntq_init_ssa_def(struct v3d_compile
*c
, nir_ssa_def
*def
)
127 struct qreg
*qregs
= ralloc_array(c
->def_ht
, struct qreg
,
128 def
->num_components
);
129 _mesa_hash_table_insert(c
->def_ht
, def
, qregs
);
134 * This function is responsible for getting VIR results into the associated
135 * storage for a NIR instruction.
137 * If it's a NIR SSA def, then we just set the associated hash table entry to
140 * If it's a NIR reg, then we need to update the existing qreg assigned to the
141 * NIR destination with the incoming value. To do that without introducing
142 * new MOVs, we require that the incoming qreg either be a uniform, or be
143 * SSA-defined by the previous VIR instruction in the block and rewritable by
144 * this function. That lets us sneak ahead and insert the SF flag beforehand
145 * (knowing that the previous instruction doesn't depend on flags) and rewrite
146 * its destination to be the NIR reg's destination
149 ntq_store_dest(struct v3d_compile
*c
, nir_dest
*dest
, int chan
,
152 struct qinst
*last_inst
= NULL
;
153 if (!list_empty(&c
->cur_block
->instructions
))
154 last_inst
= (struct qinst
*)c
->cur_block
->instructions
.prev
;
156 assert(result
.file
== QFILE_UNIF
||
157 (result
.file
== QFILE_TEMP
&&
158 last_inst
&& last_inst
== c
->defs
[result
.index
]));
161 assert(chan
< dest
->ssa
.num_components
);
164 struct hash_entry
*entry
=
165 _mesa_hash_table_search(c
->def_ht
, &dest
->ssa
);
170 qregs
= ntq_init_ssa_def(c
, &dest
->ssa
);
172 qregs
[chan
] = result
;
174 nir_register
*reg
= dest
->reg
.reg
;
175 assert(dest
->reg
.base_offset
== 0);
176 assert(reg
->num_array_elems
== 0);
177 struct hash_entry
*entry
=
178 _mesa_hash_table_search(c
->def_ht
, reg
);
179 struct qreg
*qregs
= entry
->data
;
181 /* Insert a MOV if the source wasn't an SSA def in the
182 * previous instruction.
184 if (result
.file
== QFILE_UNIF
) {
185 result
= vir_MOV(c
, result
);
186 last_inst
= c
->defs
[result
.index
];
189 /* We know they're both temps, so just rewrite index. */
190 c
->defs
[last_inst
->dst
.index
] = NULL
;
191 last_inst
->dst
.index
= qregs
[chan
].index
;
193 /* If we're in control flow, then make this update of the reg
194 * conditional on the execution mask.
196 if (c
->execute
.file
!= QFILE_NULL
) {
197 last_inst
->dst
.index
= qregs
[chan
].index
;
199 /* Set the flags to the current exec mask. To insert
200 * the flags push, we temporarily remove our SSA
203 list_del(&last_inst
->link
);
204 vir_PF(c
, c
->execute
, V3D_QPU_PF_PUSHZ
);
205 list_addtail(&last_inst
->link
,
206 &c
->cur_block
->instructions
);
208 vir_set_cond(last_inst
, V3D_QPU_COND_IFA
);
209 last_inst
->cond_is_exec_mask
= true;
215 ntq_get_src(struct v3d_compile
*c
, nir_src src
, int i
)
217 struct hash_entry
*entry
;
219 entry
= _mesa_hash_table_search(c
->def_ht
, src
.ssa
);
220 assert(i
< src
.ssa
->num_components
);
222 nir_register
*reg
= src
.reg
.reg
;
223 entry
= _mesa_hash_table_search(c
->def_ht
, reg
);
224 assert(reg
->num_array_elems
== 0);
225 assert(src
.reg
.base_offset
== 0);
226 assert(i
< reg
->num_components
);
229 struct qreg
*qregs
= entry
->data
;
234 ntq_get_alu_src(struct v3d_compile
*c
, nir_alu_instr
*instr
,
237 assert(util_is_power_of_two(instr
->dest
.write_mask
));
238 unsigned chan
= ffs(instr
->dest
.write_mask
) - 1;
239 struct qreg r
= ntq_get_src(c
, instr
->src
[src
].src
,
240 instr
->src
[src
].swizzle
[chan
]);
242 assert(!instr
->src
[src
].abs
);
243 assert(!instr
->src
[src
].negate
);
248 static inline struct qreg
249 vir_SAT(struct v3d_compile
*c
, struct qreg val
)
252 vir_FMIN(c
, val
, vir_uniform_f(c
, 1.0)),
253 vir_uniform_f(c
, 0.0));
257 ntq_umul(struct v3d_compile
*c
, struct qreg src0
, struct qreg src1
)
259 vir_MULTOP(c
, src0
, src1
);
260 return vir_UMUL24(c
, src0
, src1
);
264 ntq_minify(struct v3d_compile
*c
, struct qreg size
, struct qreg level
)
266 return vir_MAX(c
, vir_SHR(c
, size
, level
), vir_uniform_ui(c
, 1));
270 ntq_emit_txs(struct v3d_compile
*c
, nir_tex_instr
*instr
)
272 unsigned unit
= instr
->texture_index
;
273 int lod_index
= nir_tex_instr_src_index(instr
, nir_tex_src_lod
);
274 int dest_size
= nir_tex_instr_dest_size(instr
);
276 struct qreg lod
= c
->undef
;
278 lod
= ntq_get_src(c
, instr
->src
[lod_index
].src
, 0);
280 for (int i
= 0; i
< dest_size
; i
++) {
282 enum quniform_contents contents
;
284 if (instr
->is_array
&& i
== dest_size
- 1)
285 contents
= QUNIFORM_TEXTURE_ARRAY_SIZE
;
287 contents
= QUNIFORM_TEXTURE_WIDTH
+ i
;
289 struct qreg size
= vir_uniform(c
, contents
, unit
);
291 switch (instr
->sampler_dim
) {
292 case GLSL_SAMPLER_DIM_1D
:
293 case GLSL_SAMPLER_DIM_2D
:
294 case GLSL_SAMPLER_DIM_3D
:
295 case GLSL_SAMPLER_DIM_CUBE
:
296 /* Don't minify the array size. */
297 if (!(instr
->is_array
&& i
== dest_size
- 1)) {
298 size
= ntq_minify(c
, size
, lod
);
302 case GLSL_SAMPLER_DIM_RECT
:
303 /* There's no LOD field for rects */
307 unreachable("Bad sampler type");
310 ntq_store_dest(c
, &instr
->dest
, i
, size
);
315 ntq_emit_tex(struct v3d_compile
*c
, nir_tex_instr
*instr
)
317 unsigned unit
= instr
->texture_index
;
319 /* Since each texture sampling op requires uploading uniforms to
320 * reference the texture, there's no HW support for texture size and
321 * you just upload uniforms containing the size.
324 case nir_texop_query_levels
:
325 ntq_store_dest(c
, &instr
->dest
, 0,
326 vir_uniform(c
, QUNIFORM_TEXTURE_LEVELS
, unit
));
329 ntq_emit_txs(c
, instr
);
335 struct V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1 p0_unpacked
= {
336 V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_header
,
338 .fetch_sample_mode
= instr
->op
== nir_texop_txf
,
341 struct V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1 p1_unpacked
= {
344 switch (instr
->sampler_dim
) {
345 case GLSL_SAMPLER_DIM_1D
:
347 p0_unpacked
.lookup_type
= TEXTURE_1D_ARRAY
;
349 p0_unpacked
.lookup_type
= TEXTURE_1D
;
351 case GLSL_SAMPLER_DIM_2D
:
352 case GLSL_SAMPLER_DIM_RECT
:
354 p0_unpacked
.lookup_type
= TEXTURE_2D_ARRAY
;
356 p0_unpacked
.lookup_type
= TEXTURE_2D
;
358 case GLSL_SAMPLER_DIM_3D
:
359 p0_unpacked
.lookup_type
= TEXTURE_3D
;
361 case GLSL_SAMPLER_DIM_CUBE
:
362 p0_unpacked
.lookup_type
= TEXTURE_CUBE_MAP
;
365 unreachable("Bad sampler type");
368 struct qreg coords
[5];
370 for (unsigned i
= 0; i
< instr
->num_srcs
; i
++) {
371 switch (instr
->src
[i
].src_type
) {
372 case nir_tex_src_coord
:
373 for (int j
= 0; j
< instr
->coord_components
; j
++) {
374 coords
[next_coord
++] =
375 ntq_get_src(c
, instr
->src
[i
].src
, j
);
377 if (instr
->coord_components
< 2)
378 coords
[next_coord
++] = vir_uniform_f(c
, 0.5);
380 case nir_tex_src_bias
:
381 coords
[next_coord
++] =
382 ntq_get_src(c
, instr
->src
[i
].src
, 0);
384 p0_unpacked
.bias_supplied
= true;
386 case nir_tex_src_lod
:
387 coords
[next_coord
++] =
389 ntq_get_src(c
, instr
->src
[i
].src
, 0),
390 vir_uniform(c
, QUNIFORM_TEXTURE_FIRST_LEVEL
,
393 if (instr
->op
!= nir_texop_txf
&&
394 instr
->op
!= nir_texop_tg4
) {
395 p0_unpacked
.disable_autolod_use_bias_only
= true;
398 case nir_tex_src_comparator
:
399 coords
[next_coord
++] =
400 ntq_get_src(c
, instr
->src
[i
].src
, 0);
402 p0_unpacked
.shadow
= true;
405 case nir_tex_src_offset
: {
406 nir_const_value
*offset
=
407 nir_src_as_const_value(instr
->src
[i
].src
);
408 p0_unpacked
.texel_offset_for_s_coordinate
=
411 if (instr
->coord_components
>= 2)
412 p0_unpacked
.texel_offset_for_t_coordinate
=
415 if (instr
->coord_components
>= 3)
416 p0_unpacked
.texel_offset_for_r_coordinate
=
422 unreachable("unknown texture source");
426 bool return_16
= (c
->key
->tex
[unit
].return_size
== 16 ||
429 /* Limit the number of channels returned to both how many the NIR
430 * instruction writes and how many the instruction could produce.
432 uint32_t instr_return_channels
= nir_tex_instr_dest_size(instr
);
434 instr_return_channels
= (instr_return_channels
+ 1) / 2;
436 p1_unpacked
.return_words_of_texture_data
=
437 (1 << MIN2(instr_return_channels
,
438 c
->key
->tex
[unit
].return_channels
)) - 1;
441 V3D33_TEXTURE_UNIFORM_PARAMETER_0_CFG_MODE1_pack(NULL
,
442 (uint8_t *)&p0_packed
,
446 V3D33_TEXTURE_UNIFORM_PARAMETER_1_CFG_MODE1_pack(NULL
,
447 (uint8_t *)&p1_packed
,
449 /* Load unit number into the address field, which will be be used by
450 * the driver to decide which texture to put in the actual address
453 p1_packed
|= unit
<< 5;
455 /* There is no native support for GL texture rectangle coordinates, so
456 * we have to rescale from ([0, width], [0, height]) to ([0, 1], [0,
459 if (instr
->sampler_dim
== GLSL_SAMPLER_DIM_RECT
) {
460 coords
[0] = vir_FMUL(c
, coords
[0],
461 vir_uniform(c
, QUNIFORM_TEXRECT_SCALE_X
,
463 coords
[1] = vir_FMUL(c
, coords
[1],
464 vir_uniform(c
, QUNIFORM_TEXRECT_SCALE_Y
,
468 struct qreg texture_u
[] = {
469 vir_uniform(c
, QUNIFORM_TEXTURE_CONFIG_P0_0
+ unit
, p0_packed
),
470 vir_uniform(c
, QUNIFORM_TEXTURE_CONFIG_P1
, p1_packed
),
472 uint32_t next_texture_u
= 0;
474 for (int i
= 0; i
< next_coord
; i
++) {
477 if (i
== next_coord
- 1)
478 dst
= vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_TMUL
);
480 dst
= vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_TMU
);
482 struct qinst
*tmu
= vir_MOV_dest(c
, dst
, coords
[i
]);
485 tmu
->has_implicit_uniform
= true;
486 tmu
->src
[vir_get_implicit_uniform_src(tmu
)] =
487 texture_u
[next_texture_u
++];
491 struct qreg return_values
[4];
492 for (int i
= 0; i
< 4; i
++) {
493 /* Swizzling .zw of an RG texture should give undefined
494 * results, not crash the compiler.
496 if (p1_unpacked
.return_words_of_texture_data
& (1 << i
))
497 return_values
[i
] = vir_LDTMU(c
);
499 return_values
[i
] = c
->undef
;
502 for (int i
= 0; i
< nir_tex_instr_dest_size(instr
); i
++) {
506 STATIC_ASSERT(PIPE_SWIZZLE_X
== 0);
507 chan
= return_values
[i
/ 2];
509 if (nir_alu_type_get_base_type(instr
->dest_type
) ==
511 enum v3d_qpu_input_unpack unpack
;
513 unpack
= V3D_QPU_UNPACK_H
;
515 unpack
= V3D_QPU_UNPACK_L
;
517 chan
= vir_FMOV(c
, chan
);
518 vir_set_unpack(c
->defs
[chan
.index
], 0, unpack
);
520 /* If we're unpacking the low field, shift it
521 * up to the top first.
524 chan
= vir_SHL(c
, chan
,
525 vir_uniform_ui(c
, 16));
528 /* Do proper sign extension to a 32-bit int. */
529 if (nir_alu_type_get_base_type(instr
->dest_type
) ==
531 chan
= vir_ASR(c
, chan
,
532 vir_uniform_ui(c
, 16));
534 chan
= vir_SHR(c
, chan
,
535 vir_uniform_ui(c
, 16));
539 chan
= vir_MOV(c
, return_values
[i
]);
541 ntq_store_dest(c
, &instr
->dest
, i
, chan
);
546 ntq_fsincos(struct v3d_compile
*c
, struct qreg src
, bool is_cos
)
548 struct qreg input
= vir_FMUL(c
, src
, vir_uniform_f(c
, 1.0f
/ M_PI
));
550 input
= vir_FADD(c
, input
, vir_uniform_f(c
, 0.5));
552 struct qreg periods
= vir_FROUND(c
, input
);
553 struct qreg sin_output
= vir_SFU(c
, V3D_QPU_WADDR_SIN
,
554 vir_FSUB(c
, input
, periods
));
555 return vir_XOR(c
, sin_output
, vir_SHL(c
,
556 vir_FTOIN(c
, periods
),
557 vir_uniform_ui(c
, -1)));
561 ntq_fsign(struct v3d_compile
*c
, struct qreg src
)
563 struct qreg t
= vir_get_temp(c
);
565 vir_MOV_dest(c
, t
, vir_uniform_f(c
, 0.0));
566 vir_PF(c
, vir_FMOV(c
, src
), V3D_QPU_PF_PUSHZ
);
567 vir_MOV_cond(c
, V3D_QPU_COND_IFNA
, t
, vir_uniform_f(c
, 1.0));
568 vir_PF(c
, vir_FMOV(c
, src
), V3D_QPU_PF_PUSHN
);
569 vir_MOV_cond(c
, V3D_QPU_COND_IFA
, t
, vir_uniform_f(c
, -1.0));
570 return vir_MOV(c
, t
);
574 ntq_isign(struct v3d_compile
*c
, struct qreg src
)
576 struct qreg t
= vir_get_temp(c
);
578 vir_MOV_dest(c
, t
, vir_uniform_ui(c
, 0));
579 vir_PF(c
, vir_MOV(c
, src
), V3D_QPU_PF_PUSHZ
);
580 vir_MOV_cond(c
, V3D_QPU_COND_IFNA
, t
, vir_uniform_ui(c
, 1));
581 vir_PF(c
, vir_MOV(c
, src
), V3D_QPU_PF_PUSHN
);
582 vir_MOV_cond(c
, V3D_QPU_COND_IFA
, t
, vir_uniform_ui(c
, -1));
583 return vir_MOV(c
, t
);
587 emit_fragcoord_input(struct v3d_compile
*c
, int attr
)
589 c
->inputs
[attr
* 4 + 0] = vir_FXCD(c
);
590 c
->inputs
[attr
* 4 + 1] = vir_FYCD(c
);
591 c
->inputs
[attr
* 4 + 2] = c
->payload_z
;
592 c
->inputs
[attr
* 4 + 3] = vir_SFU(c
, V3D_QPU_WADDR_RECIP
,
597 emit_fragment_varying(struct v3d_compile
*c
, nir_variable
*var
,
600 struct qreg vary
= vir_reg(QFILE_VARY
, ~0);
601 struct qreg r5
= vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_R5
);
603 /* For gl_PointCoord input or distance along a line, we'll be called
604 * with no nir_variable, and we don't count toward VPM size so we
605 * don't track an input slot.
608 return vir_FADD(c
, vir_FMUL(c
, vary
, c
->payload_w
), r5
);
611 int i
= c
->num_inputs
++;
612 c
->input_slots
[i
] = v3d_slot_from_slot_and_component(var
->data
.location
,
615 switch (var
->data
.interpolation
) {
616 case INTERP_MODE_NONE
:
617 /* If a gl_FrontColor or gl_BackColor input has no interp
618 * qualifier, then if we're using glShadeModel(GL_FLAT) it
619 * needs to be flat shaded.
621 switch (var
->data
.location
) {
622 case VARYING_SLOT_COL0
:
623 case VARYING_SLOT_COL1
:
624 case VARYING_SLOT_BFC0
:
625 case VARYING_SLOT_BFC1
:
626 if (c
->fs_key
->shade_model_flat
) {
627 BITSET_SET(c
->flat_shade_flags
, i
);
628 vir_MOV_dest(c
, c
->undef
, vary
);
629 return vir_MOV(c
, r5
);
631 return vir_FADD(c
, vir_FMUL(c
, vary
,
638 case INTERP_MODE_SMOOTH
:
639 if (var
->data
.centroid
) {
640 return vir_FADD(c
, vir_FMUL(c
, vary
,
641 c
->payload_w_centroid
), r5
);
643 return vir_FADD(c
, vir_FMUL(c
, vary
, c
->payload_w
), r5
);
645 case INTERP_MODE_NOPERSPECTIVE
:
646 /* C appears after the mov from the varying.
647 XXX: improve ldvary setup.
649 return vir_FADD(c
, vir_MOV(c
, vary
), r5
);
650 case INTERP_MODE_FLAT
:
651 BITSET_SET(c
->flat_shade_flags
, i
);
652 vir_MOV_dest(c
, c
->undef
, vary
);
653 return vir_MOV(c
, r5
);
655 unreachable("Bad interp mode");
660 emit_fragment_input(struct v3d_compile
*c
, int attr
, nir_variable
*var
)
662 for (int i
= 0; i
< glsl_get_vector_elements(var
->type
); i
++) {
663 int chan
= var
->data
.location_frac
+ i
;
664 c
->inputs
[attr
* 4 + chan
] =
665 emit_fragment_varying(c
, var
, chan
);
670 add_output(struct v3d_compile
*c
,
671 uint32_t decl_offset
,
675 uint32_t old_array_size
= c
->outputs_array_size
;
676 resize_qreg_array(c
, &c
->outputs
, &c
->outputs_array_size
,
679 if (old_array_size
!= c
->outputs_array_size
) {
680 c
->output_slots
= reralloc(c
,
682 struct v3d_varying_slot
,
683 c
->outputs_array_size
);
686 c
->output_slots
[decl_offset
] =
687 v3d_slot_from_slot_and_component(slot
, swizzle
);
691 declare_uniform_range(struct v3d_compile
*c
, uint32_t start
, uint32_t size
)
693 unsigned array_id
= c
->num_ubo_ranges
++;
694 if (array_id
>= c
->ubo_ranges_array_size
) {
695 c
->ubo_ranges_array_size
= MAX2(c
->ubo_ranges_array_size
* 2,
697 c
->ubo_ranges
= reralloc(c
, c
->ubo_ranges
,
698 struct v3d_ubo_range
,
699 c
->ubo_ranges_array_size
);
700 c
->ubo_range_used
= reralloc(c
, c
->ubo_range_used
,
702 c
->ubo_ranges_array_size
);
705 c
->ubo_ranges
[array_id
].dst_offset
= 0;
706 c
->ubo_ranges
[array_id
].src_offset
= start
;
707 c
->ubo_ranges
[array_id
].size
= size
;
708 c
->ubo_range_used
[array_id
] = false;
712 * If compare_instr is a valid comparison instruction, emits the
713 * compare_instr's comparison and returns the sel_instr's return value based
714 * on the compare_instr's result.
717 ntq_emit_comparison(struct v3d_compile
*c
, struct qreg
*dest
,
718 nir_alu_instr
*compare_instr
,
719 nir_alu_instr
*sel_instr
)
721 struct qreg src0
= ntq_get_alu_src(c
, compare_instr
, 0);
722 struct qreg src1
= ntq_get_alu_src(c
, compare_instr
, 1);
723 bool cond_invert
= false;
725 switch (compare_instr
->op
) {
728 vir_PF(c
, vir_FCMP(c
, src0
, src1
), V3D_QPU_PF_PUSHZ
);
731 vir_PF(c
, vir_XOR(c
, src0
, src1
), V3D_QPU_PF_PUSHZ
);
736 vir_PF(c
, vir_FCMP(c
, src0
, src1
), V3D_QPU_PF_PUSHZ
);
740 vir_PF(c
, vir_XOR(c
, src0
, src1
), V3D_QPU_PF_PUSHZ
);
746 vir_PF(c
, vir_FCMP(c
, src1
, src0
), V3D_QPU_PF_PUSHC
);
749 vir_PF(c
, vir_MIN(c
, src1
, src0
), V3D_QPU_PF_PUSHC
);
753 vir_PF(c
, vir_SUB(c
, src0
, src1
), V3D_QPU_PF_PUSHC
);
759 vir_PF(c
, vir_FCMP(c
, src0
, src1
), V3D_QPU_PF_PUSHN
);
762 vir_PF(c
, vir_MIN(c
, src1
, src0
), V3D_QPU_PF_PUSHC
);
765 vir_PF(c
, vir_SUB(c
, src0
, src1
), V3D_QPU_PF_PUSHC
);
772 enum v3d_qpu_cond cond
= (cond_invert
?
776 switch (sel_instr
->op
) {
781 *dest
= vir_SEL(c
, cond
,
782 vir_uniform_f(c
, 1.0), vir_uniform_f(c
, 0.0));
786 *dest
= vir_SEL(c
, cond
,
787 ntq_get_alu_src(c
, sel_instr
, 1),
788 ntq_get_alu_src(c
, sel_instr
, 2));
792 *dest
= vir_SEL(c
, cond
,
793 vir_uniform_ui(c
, ~0), vir_uniform_ui(c
, 0));
797 /* Make the temporary for nir_store_dest(). */
798 *dest
= vir_MOV(c
, *dest
);
804 * Attempts to fold a comparison generating a boolean result into the
805 * condition code for selecting between two values, instead of comparing the
806 * boolean result against 0 to generate the condition code.
808 static struct qreg
ntq_emit_bcsel(struct v3d_compile
*c
, nir_alu_instr
*instr
,
811 if (!instr
->src
[0].src
.is_ssa
)
813 if (instr
->src
[0].src
.ssa
->parent_instr
->type
!= nir_instr_type_alu
)
815 nir_alu_instr
*compare
=
816 nir_instr_as_alu(instr
->src
[0].src
.ssa
->parent_instr
);
821 if (ntq_emit_comparison(c
, &dest
, compare
, instr
))
825 vir_PF(c
, src
[0], V3D_QPU_PF_PUSHZ
);
826 return vir_MOV(c
, vir_SEL(c
, V3D_QPU_COND_IFNA
, src
[1], src
[2]));
831 ntq_emit_alu(struct v3d_compile
*c
, nir_alu_instr
*instr
)
833 /* This should always be lowered to ALU operations for V3D. */
834 assert(!instr
->dest
.saturate
);
836 /* Vectors are special in that they have non-scalarized writemasks,
837 * and just take the first swizzle channel for each argument in order
838 * into each writemask channel.
840 if (instr
->op
== nir_op_vec2
||
841 instr
->op
== nir_op_vec3
||
842 instr
->op
== nir_op_vec4
) {
844 for (int i
= 0; i
< nir_op_infos
[instr
->op
].num_inputs
; i
++)
845 srcs
[i
] = ntq_get_src(c
, instr
->src
[i
].src
,
846 instr
->src
[i
].swizzle
[0]);
847 for (int i
= 0; i
< nir_op_infos
[instr
->op
].num_inputs
; i
++)
848 ntq_store_dest(c
, &instr
->dest
.dest
, i
,
849 vir_MOV(c
, srcs
[i
]));
853 /* General case: We can just grab the one used channel per src. */
854 struct qreg src
[nir_op_infos
[instr
->op
].num_inputs
];
855 for (int i
= 0; i
< nir_op_infos
[instr
->op
].num_inputs
; i
++) {
856 src
[i
] = ntq_get_alu_src(c
, instr
, i
);
864 result
= vir_MOV(c
, src
[0]);
868 result
= vir_XOR(c
, src
[0], vir_uniform_ui(c
, 1 << 31));
871 result
= vir_NEG(c
, src
[0]);
875 result
= vir_FMUL(c
, src
[0], src
[1]);
878 result
= vir_FADD(c
, src
[0], src
[1]);
881 result
= vir_FSUB(c
, src
[0], src
[1]);
884 result
= vir_FMIN(c
, src
[0], src
[1]);
887 result
= vir_FMAX(c
, src
[0], src
[1]);
891 result
= vir_FTOIZ(c
, src
[0]);
894 result
= vir_FTOUZ(c
, src
[0]);
897 result
= vir_ITOF(c
, src
[0]);
900 result
= vir_UTOF(c
, src
[0]);
903 result
= vir_AND(c
, src
[0], vir_uniform_f(c
, 1.0));
906 result
= vir_AND(c
, src
[0], vir_uniform_ui(c
, 1));
910 vir_PF(c
, src
[0], V3D_QPU_PF_PUSHZ
);
911 result
= vir_MOV(c
, vir_SEL(c
, V3D_QPU_COND_IFNA
,
912 vir_uniform_ui(c
, ~0),
913 vir_uniform_ui(c
, 0)));
917 result
= vir_ADD(c
, src
[0], src
[1]);
920 result
= vir_SHR(c
, src
[0], src
[1]);
923 result
= vir_SUB(c
, src
[0], src
[1]);
926 result
= vir_ASR(c
, src
[0], src
[1]);
929 result
= vir_SHL(c
, src
[0], src
[1]);
932 result
= vir_MIN(c
, src
[0], src
[1]);
935 result
= vir_UMIN(c
, src
[0], src
[1]);
938 result
= vir_MAX(c
, src
[0], src
[1]);
941 result
= vir_UMAX(c
, src
[0], src
[1]);
944 result
= vir_AND(c
, src
[0], src
[1]);
947 result
= vir_OR(c
, src
[0], src
[1]);
950 result
= vir_XOR(c
, src
[0], src
[1]);
953 result
= vir_NOT(c
, src
[0]);
957 result
= ntq_umul(c
, src
[0], src
[1]);
974 if (!ntq_emit_comparison(c
, &result
, instr
, instr
)) {
975 fprintf(stderr
, "Bad comparison instruction\n");
980 result
= ntq_emit_bcsel(c
, instr
, src
);
983 vir_PF(c
, src
[0], V3D_QPU_PF_PUSHZ
);
984 result
= vir_MOV(c
, vir_SEL(c
, V3D_QPU_COND_IFNA
,
989 result
= vir_SFU(c
, V3D_QPU_WADDR_RECIP
, src
[0]);
992 result
= vir_SFU(c
, V3D_QPU_WADDR_RSQRT
, src
[0]);
995 result
= vir_SFU(c
, V3D_QPU_WADDR_EXP
, src
[0]);
998 result
= vir_SFU(c
, V3D_QPU_WADDR_LOG
, src
[0]);
1002 result
= vir_FCEIL(c
, src
[0]);
1005 result
= vir_FFLOOR(c
, src
[0]);
1007 case nir_op_fround_even
:
1008 result
= vir_FROUND(c
, src
[0]);
1011 result
= vir_FTRUNC(c
, src
[0]);
1014 result
= vir_FSUB(c
, src
[0], vir_FFLOOR(c
, src
[0]));
1018 result
= ntq_fsincos(c
, src
[0], false);
1021 result
= ntq_fsincos(c
, src
[0], true);
1025 result
= ntq_fsign(c
, src
[0]);
1028 result
= ntq_isign(c
, src
[0]);
1032 result
= vir_FMOV(c
, src
[0]);
1033 vir_set_unpack(c
->defs
[result
.index
], 0, V3D_QPU_UNPACK_ABS
);
1038 result
= vir_MAX(c
, src
[0],
1039 vir_SUB(c
, vir_uniform_ui(c
, 0), src
[0]));
1043 case nir_op_fddx_coarse
:
1044 case nir_op_fddx_fine
:
1045 result
= vir_FDX(c
, src
[0]);
1049 case nir_op_fddy_coarse
:
1050 case nir_op_fddy_fine
:
1051 result
= vir_FDY(c
, src
[0]);
1055 fprintf(stderr
, "unknown NIR ALU inst: ");
1056 nir_print_instr(&instr
->instr
, stderr
);
1057 fprintf(stderr
, "\n");
1061 /* We have a scalar result, so the instruction should only have a
1062 * single channel written to.
1064 assert(util_is_power_of_two(instr
->dest
.write_mask
));
1065 ntq_store_dest(c
, &instr
->dest
.dest
,
1066 ffs(instr
->dest
.write_mask
) - 1, result
);
1069 /* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
1070 * specifier. They come from a register that's preloaded with 0xffffffff
1071 * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low
1072 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
1074 #define TLB_TYPE_F16_COLOR (3 << 6)
1075 #define TLB_TYPE_I32_COLOR (1 << 6)
1076 #define TLB_TYPE_F32_COLOR (0 << 6)
1077 #define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */
1078 #define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
1079 #define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
1080 #define TLB_F16_SWAP_HI_LO (1 << 1)
1081 #define TLB_VEC_SIZE_4_F16 (1 << 0)
1082 #define TLB_VEC_SIZE_2_F16 (0 << 0)
1083 #define TLB_VEC_SIZE_MINUS_1_SHIFT 0
1085 /* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
1088 #define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
1089 #define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
1090 #define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
1092 /* Stencil is a single 32-bit write. */
1093 #define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))
1096 emit_frag_end(struct v3d_compile
*c
)
1099 if (c->output_sample_mask_index != -1) {
1100 vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
1104 bool has_any_tlb_color_write
= false;
1105 for (int rt
= 0; rt
< c
->fs_key
->nr_cbufs
; rt
++) {
1106 if (c
->output_color_var
[rt
])
1107 has_any_tlb_color_write
= true;
1110 if (c
->output_position_index
!= -1) {
1111 struct qinst
*inst
= vir_MOV_dest(c
,
1112 vir_reg(QFILE_TLBU
, 0),
1113 c
->outputs
[c
->output_position_index
]);
1115 inst
->src
[vir_get_implicit_uniform_src(inst
)] =
1118 TLB_DEPTH_TYPE_PER_PIXEL
|
1120 } else if (c
->s
->info
.fs
.uses_discard
|| !has_any_tlb_color_write
) {
1121 /* Emit passthrough Z if it needed to be delayed until shader
1122 * end due to potential discards.
1124 * Since (single-threaded) fragment shaders always need a TLB
1125 * write, emit passthrouh Z if we didn't have any color
1126 * buffers and flag us as potentially discarding, so that we
1127 * can use Z as the TLB write.
1129 c
->s
->info
.fs
.uses_discard
= true;
1131 struct qinst
*inst
= vir_MOV_dest(c
,
1132 vir_reg(QFILE_TLBU
, 0),
1133 vir_reg(QFILE_NULL
, 0));
1135 inst
->src
[vir_get_implicit_uniform_src(inst
)] =
1138 TLB_DEPTH_TYPE_INVARIANT
|
1142 /* XXX: Performance improvement: Merge Z write and color writes TLB
1146 for (int rt
= 0; rt
< c
->fs_key
->nr_cbufs
; rt
++) {
1147 if (!c
->output_color_var
[rt
])
1150 nir_variable
*var
= c
->output_color_var
[rt
];
1151 struct qreg
*color
= &c
->outputs
[var
->data
.driver_location
* 4];
1152 int num_components
= glsl_get_vector_elements(var
->type
);
1153 uint32_t conf
= 0xffffff00;
1156 conf
|= TLB_SAMPLE_MODE_PER_PIXEL
;
1157 conf
|= (7 - rt
) << TLB_RENDER_TARGET_SHIFT
;
1159 assert(num_components
!= 0);
1160 switch (glsl_get_base_type(var
->type
)) {
1161 case GLSL_TYPE_UINT
:
1163 conf
|= TLB_TYPE_I32_COLOR
;
1164 conf
|= ((num_components
- 1) <<
1165 TLB_VEC_SIZE_MINUS_1_SHIFT
);
1167 inst
= vir_MOV_dest(c
, vir_reg(QFILE_TLBU
, 0), color
[0]);
1168 inst
->src
[vir_get_implicit_uniform_src(inst
)] =
1169 vir_uniform_ui(c
, conf
);
1171 for (int i
= 1; i
< num_components
; i
++) {
1172 inst
= vir_MOV_dest(c
, vir_reg(QFILE_TLB
, 0),
1178 struct qreg r
= color
[0];
1179 struct qreg g
= color
[1];
1180 struct qreg b
= color
[2];
1181 struct qreg a
= color
[3];
1183 if (c
->fs_key
->f32_color_rb
) {
1184 conf
|= TLB_TYPE_F32_COLOR
;
1185 conf
|= ((num_components
- 1) <<
1186 TLB_VEC_SIZE_MINUS_1_SHIFT
);
1188 conf
|= TLB_TYPE_F16_COLOR
;
1189 conf
|= TLB_F16_SWAP_HI_LO
;
1190 if (num_components
>= 3)
1191 conf
|= TLB_VEC_SIZE_4_F16
;
1193 conf
|= TLB_VEC_SIZE_2_F16
;
1196 if (c
->fs_key
->swap_color_rb
& (1 << rt
)) {
1201 if (c
->fs_key
->f32_color_rb
& (1 << rt
)) {
1202 inst
= vir_MOV_dest(c
, vir_reg(QFILE_TLBU
, 0), color
[0]);
1203 inst
->src
[vir_get_implicit_uniform_src(inst
)] =
1204 vir_uniform_ui(c
, conf
);
1206 for (int i
= 1; i
< num_components
; i
++) {
1207 inst
= vir_MOV_dest(c
, vir_reg(QFILE_TLB
, 0),
1211 inst
= vir_VFPACK_dest(c
, vir_reg(QFILE_TLB
, 0), r
, g
);
1213 inst
->dst
.file
= QFILE_TLBU
;
1214 inst
->src
[vir_get_implicit_uniform_src(inst
)] =
1215 vir_uniform_ui(c
, conf
);
1218 if (num_components
>= 3)
1219 inst
= vir_VFPACK_dest(c
, vir_reg(QFILE_TLB
, 0), b
, a
);
1228 vir_VPM_WRITE(struct v3d_compile
*c
, struct qreg val
, uint32_t *vpm_index
)
1230 if (c
->devinfo
->ver
>= 40) {
1231 vir_STVPMV(c
, vir_uniform_ui(c
, *vpm_index
), val
);
1232 *vpm_index
= *vpm_index
+ 1;
1234 vir_MOV_dest(c
, vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_VPM
), val
);
1237 c
->num_vpm_writes
++;
1241 emit_scaled_viewport_write(struct v3d_compile
*c
, struct qreg rcp_w
,
1242 uint32_t *vpm_index
)
1244 for (int i
= 0; i
< 2; i
++) {
1245 struct qreg coord
= c
->outputs
[c
->output_position_index
+ i
];
1246 coord
= vir_FMUL(c
, coord
,
1247 vir_uniform(c
, QUNIFORM_VIEWPORT_X_SCALE
+ i
,
1249 coord
= vir_FMUL(c
, coord
, rcp_w
);
1250 vir_VPM_WRITE(c
, vir_FTOIN(c
, coord
), vpm_index
);
1256 emit_zs_write(struct v3d_compile
*c
, struct qreg rcp_w
, uint32_t *vpm_index
)
1258 struct qreg zscale
= vir_uniform(c
, QUNIFORM_VIEWPORT_Z_SCALE
, 0);
1259 struct qreg zoffset
= vir_uniform(c
, QUNIFORM_VIEWPORT_Z_OFFSET
, 0);
1261 struct qreg z
= c
->outputs
[c
->output_position_index
+ 2];
1262 z
= vir_FMUL(c
, z
, zscale
);
1263 z
= vir_FMUL(c
, z
, rcp_w
);
1264 z
= vir_FADD(c
, z
, zoffset
);
1265 vir_VPM_WRITE(c
, z
, vpm_index
);
1269 emit_rcp_wc_write(struct v3d_compile
*c
, struct qreg rcp_w
, uint32_t *vpm_index
)
1271 vir_VPM_WRITE(c
, rcp_w
, vpm_index
);
1275 emit_point_size_write(struct v3d_compile
*c
, uint32_t *vpm_index
)
1277 struct qreg point_size
;
1279 if (c
->output_point_size_index
!= -1)
1280 point_size
= c
->outputs
[c
->output_point_size_index
];
1282 point_size
= vir_uniform_f(c
, 1.0);
1284 /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835,
1287 point_size
= vir_FMAX(c
, point_size
, vir_uniform_f(c
, .125));
1289 vir_VPM_WRITE(c
, point_size
, vpm_index
);
1293 emit_vpm_write_setup(struct v3d_compile
*c
)
1295 if (c
->devinfo
->ver
>= 40)
1299 struct V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP unpacked
= {
1300 V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_header
,
1306 .size
= VPM_SETUP_SIZE_32_BIT
,
1310 V3D33_VPM_GENERIC_BLOCK_WRITE_SETUP_pack(NULL
,
1313 vir_VPMSETUP(c
, vir_uniform_ui(c
, packed
));
1317 emit_vert_end(struct v3d_compile
*c
)
1319 uint32_t vpm_index
= 0;
1320 struct qreg rcp_w
= vir_SFU(c
, V3D_QPU_WADDR_RECIP
,
1321 c
->outputs
[c
->output_position_index
+ 3]);
1323 emit_vpm_write_setup(c
);
1325 if (c
->vs_key
->is_coord
) {
1326 for (int i
= 0; i
< 4; i
++)
1327 vir_VPM_WRITE(c
, c
->outputs
[c
->output_position_index
+ i
],
1329 emit_scaled_viewport_write(c
, rcp_w
, &vpm_index
);
1330 if (c
->vs_key
->per_vertex_point_size
) {
1331 emit_point_size_write(c
, &vpm_index
);
1332 /* emit_rcp_wc_write(c, rcp_w); */
1334 /* XXX: Z-only rendering */
1336 emit_zs_write(c
, rcp_w
, &vpm_index
);
1338 emit_scaled_viewport_write(c
, rcp_w
, &vpm_index
);
1339 emit_zs_write(c
, rcp_w
, &vpm_index
);
1340 emit_rcp_wc_write(c
, rcp_w
, &vpm_index
);
1341 if (c
->vs_key
->per_vertex_point_size
)
1342 emit_point_size_write(c
, &vpm_index
);
1345 for (int i
= 0; i
< c
->vs_key
->num_fs_inputs
; i
++) {
1346 struct v3d_varying_slot input
= c
->vs_key
->fs_inputs
[i
];
1349 for (j
= 0; j
< c
->num_outputs
; j
++) {
1350 struct v3d_varying_slot output
= c
->output_slots
[j
];
1352 if (!memcmp(&input
, &output
, sizeof(input
))) {
1353 vir_VPM_WRITE(c
, c
->outputs
[j
],
1358 /* Emit padding if we didn't find a declared VS output for
1361 if (j
== c
->num_outputs
)
1362 vir_VPM_WRITE(c
, vir_uniform_f(c
, 0.0),
1368 v3d_optimize_nir(struct nir_shader
*s
)
1375 NIR_PASS_V(s
, nir_lower_vars_to_ssa
);
1376 NIR_PASS(progress
, s
, nir_lower_alu_to_scalar
);
1377 NIR_PASS(progress
, s
, nir_lower_phis_to_scalar
);
1378 NIR_PASS(progress
, s
, nir_copy_prop
);
1379 NIR_PASS(progress
, s
, nir_opt_remove_phis
);
1380 NIR_PASS(progress
, s
, nir_opt_dce
);
1381 NIR_PASS(progress
, s
, nir_opt_dead_cf
);
1382 NIR_PASS(progress
, s
, nir_opt_cse
);
1383 NIR_PASS(progress
, s
, nir_opt_peephole_select
, 8);
1384 NIR_PASS(progress
, s
, nir_opt_algebraic
);
1385 NIR_PASS(progress
, s
, nir_opt_constant_folding
);
1386 NIR_PASS(progress
, s
, nir_opt_undef
);
1391 driver_location_compare(const void *in_a
, const void *in_b
)
1393 const nir_variable
*const *a
= in_a
;
1394 const nir_variable
*const *b
= in_b
;
1396 return (*a
)->data
.driver_location
- (*b
)->data
.driver_location
;
1400 ntq_emit_vpm_read(struct v3d_compile
*c
,
1401 uint32_t *num_components_queued
,
1402 uint32_t *remaining
,
1405 struct qreg vpm
= vir_reg(QFILE_VPM
, vpm_index
);
1407 if (c
->devinfo
->ver
>= 40 ) {
1408 return vir_LDVPMV_IN(c
,
1410 (*num_components_queued
)++));
1413 if (*num_components_queued
!= 0) {
1414 (*num_components_queued
)--;
1416 return vir_MOV(c
, vpm
);
1419 uint32_t num_components
= MIN2(*remaining
, 32);
1421 struct V3D33_VPM_GENERIC_BLOCK_READ_SETUP unpacked
= {
1422 V3D33_VPM_GENERIC_BLOCK_READ_SETUP_header
,
1426 /* If the field is 0, that means a read count of 32. */
1427 .num
= num_components
& 31,
1430 .size
= VPM_SETUP_SIZE_32_BIT
,
1431 .addr
= c
->num_inputs
,
1435 V3D33_VPM_GENERIC_BLOCK_READ_SETUP_pack(NULL
,
1438 vir_VPMSETUP(c
, vir_uniform_ui(c
, packed
));
1440 *num_components_queued
= num_components
- 1;
1441 *remaining
-= num_components
;
1444 return vir_MOV(c
, vpm
);
1448 ntq_setup_inputs(struct v3d_compile
*c
)
1450 unsigned num_entries
= 0;
1451 unsigned num_components
= 0;
1452 nir_foreach_variable(var
, &c
->s
->inputs
) {
1454 num_components
+= glsl_get_components(var
->type
);
1457 nir_variable
*vars
[num_entries
];
1460 nir_foreach_variable(var
, &c
->s
->inputs
)
1463 /* Sort the variables so that we emit the input setup in
1464 * driver_location order. This is required for VPM reads, whose data
1465 * is fetched into the VPM in driver_location (TGSI register index)
1468 qsort(&vars
, num_entries
, sizeof(*vars
), driver_location_compare
);
1470 uint32_t vpm_components_queued
= 0;
1471 if (c
->s
->info
.stage
== MESA_SHADER_VERTEX
) {
1472 bool uses_iid
= c
->s
->info
.system_values_read
&
1473 (1ull << SYSTEM_VALUE_INSTANCE_ID
);
1474 bool uses_vid
= c
->s
->info
.system_values_read
&
1475 (1ull << SYSTEM_VALUE_VERTEX_ID
);
1477 num_components
+= uses_iid
;
1478 num_components
+= uses_vid
;
1481 c
->iid
= ntq_emit_vpm_read(c
, &vpm_components_queued
,
1482 &num_components
, ~0);
1486 c
->vid
= ntq_emit_vpm_read(c
, &vpm_components_queued
,
1487 &num_components
, ~0);
1491 for (unsigned i
= 0; i
< num_entries
; i
++) {
1492 nir_variable
*var
= vars
[i
];
1493 unsigned array_len
= MAX2(glsl_get_length(var
->type
), 1);
1494 unsigned loc
= var
->data
.driver_location
;
1496 assert(array_len
== 1);
1498 resize_qreg_array(c
, &c
->inputs
, &c
->inputs_array_size
,
1501 if (c
->s
->info
.stage
== MESA_SHADER_FRAGMENT
) {
1502 if (var
->data
.location
== VARYING_SLOT_POS
) {
1503 emit_fragcoord_input(c
, loc
);
1504 } else if (var
->data
.location
== VARYING_SLOT_PNTC
||
1505 (var
->data
.location
>= VARYING_SLOT_VAR0
&&
1506 (c
->fs_key
->point_sprite_mask
&
1507 (1 << (var
->data
.location
-
1508 VARYING_SLOT_VAR0
))))) {
1509 c
->inputs
[loc
* 4 + 0] = c
->point_x
;
1510 c
->inputs
[loc
* 4 + 1] = c
->point_y
;
1512 emit_fragment_input(c
, loc
, var
);
1515 int var_components
= glsl_get_components(var
->type
);
1517 for (int i
= 0; i
< var_components
; i
++) {
1518 c
->inputs
[loc
* 4 + i
] =
1519 ntq_emit_vpm_read(c
,
1520 &vpm_components_queued
,
1525 c
->vattr_sizes
[loc
] = var_components
;
1529 if (c
->s
->info
.stage
== MESA_SHADER_VERTEX
) {
1530 if (c
->devinfo
->ver
>= 40) {
1531 assert(vpm_components_queued
== num_components
);
1533 assert(vpm_components_queued
== 0);
1534 assert(num_components
== 0);
1540 ntq_setup_outputs(struct v3d_compile
*c
)
1542 nir_foreach_variable(var
, &c
->s
->outputs
) {
1543 unsigned array_len
= MAX2(glsl_get_length(var
->type
), 1);
1544 unsigned loc
= var
->data
.driver_location
* 4;
1546 assert(array_len
== 1);
1549 for (int i
= 0; i
< glsl_get_vector_elements(var
->type
); i
++) {
1550 add_output(c
, loc
+ var
->data
.location_frac
+ i
,
1552 var
->data
.location_frac
+ i
);
1555 if (c
->s
->info
.stage
== MESA_SHADER_FRAGMENT
) {
1556 switch (var
->data
.location
) {
1557 case FRAG_RESULT_COLOR
:
1558 c
->output_color_var
[0] = var
;
1559 c
->output_color_var
[1] = var
;
1560 c
->output_color_var
[2] = var
;
1561 c
->output_color_var
[3] = var
;
1563 case FRAG_RESULT_DATA0
:
1564 case FRAG_RESULT_DATA1
:
1565 case FRAG_RESULT_DATA2
:
1566 case FRAG_RESULT_DATA3
:
1567 c
->output_color_var
[var
->data
.location
-
1568 FRAG_RESULT_DATA0
] = var
;
1570 case FRAG_RESULT_DEPTH
:
1571 c
->output_position_index
= loc
;
1573 case FRAG_RESULT_SAMPLE_MASK
:
1574 c
->output_sample_mask_index
= loc
;
1578 switch (var
->data
.location
) {
1579 case VARYING_SLOT_POS
:
1580 c
->output_position_index
= loc
;
1582 case VARYING_SLOT_PSIZ
:
1583 c
->output_point_size_index
= loc
;
1591 ntq_setup_uniforms(struct v3d_compile
*c
)
1593 nir_foreach_variable(var
, &c
->s
->uniforms
) {
1594 uint32_t vec4_count
= glsl_count_attribute_slots(var
->type
,
1596 unsigned vec4_size
= 4 * sizeof(float);
1598 declare_uniform_range(c
, var
->data
.driver_location
* vec4_size
,
1599 vec4_count
* vec4_size
);
1605 * Sets up the mapping from nir_register to struct qreg *.
1607 * Each nir_register gets a struct qreg per 32-bit component being stored.
1610 ntq_setup_registers(struct v3d_compile
*c
, struct exec_list
*list
)
1612 foreach_list_typed(nir_register
, nir_reg
, node
, list
) {
1613 unsigned array_len
= MAX2(nir_reg
->num_array_elems
, 1);
1614 struct qreg
*qregs
= ralloc_array(c
->def_ht
, struct qreg
,
1616 nir_reg
->num_components
);
1618 _mesa_hash_table_insert(c
->def_ht
, nir_reg
, qregs
);
1620 for (int i
= 0; i
< array_len
* nir_reg
->num_components
; i
++)
1621 qregs
[i
] = vir_get_temp(c
);
1626 ntq_emit_load_const(struct v3d_compile
*c
, nir_load_const_instr
*instr
)
1628 struct qreg
*qregs
= ntq_init_ssa_def(c
, &instr
->def
);
1629 for (int i
= 0; i
< instr
->def
.num_components
; i
++)
1630 qregs
[i
] = vir_uniform_ui(c
, instr
->value
.u32
[i
]);
1632 _mesa_hash_table_insert(c
->def_ht
, &instr
->def
, qregs
);
1636 ntq_emit_ssa_undef(struct v3d_compile
*c
, nir_ssa_undef_instr
*instr
)
1638 struct qreg
*qregs
= ntq_init_ssa_def(c
, &instr
->def
);
1640 /* VIR needs there to be *some* value, so pick 0 (same as for
1641 * ntq_setup_registers().
1643 for (int i
= 0; i
< instr
->def
.num_components
; i
++)
1644 qregs
[i
] = vir_uniform_ui(c
, 0);
/* Translates one NIR intrinsic instruction to VIR: uniform/UBO/input loads,
 * output stores, system values, and discard.
 *
 * NOTE(review): the source text of this function is mangled and incomplete —
 * interior lines are missing, including at least one case label between the
 * load_ubo case and the second uniform-load chunk below (original line
 * ~1686) and most break statements.  The code is therefore left byte-
 * identical; only comments are added.  Recover the lost lines from upstream
 * before editing logic.
 */
1648 ntq_emit_intrinsic(struct v3d_compile
*c
, nir_intrinsic_instr
*instr
)
1650 nir_const_value
*const_offset
;
1653 switch (instr
->intrinsic
) {
/* Direct uniform loads become QUNIFORM_UNIFORM references (dword index);
 * non-constant offsets go through indirect_uniform_load().
 */
1654 case nir_intrinsic_load_uniform
:
1655 assert(instr
->num_components
== 1);
1656 const_offset
= nir_src_as_const_value(instr
->src
[0]);
1658 offset
= nir_intrinsic_base(instr
) + const_offset
->u32
[0];
1659 assert(offset
% 4 == 0);
1660 /* We need dwords */
1661 offset
= offset
/ 4;
1662 ntq_store_dest(c
, &instr
->dest
, 0,
1663 vir_uniform(c
, QUNIFORM_UNIFORM
,
1666 ntq_store_dest(c
, &instr
->dest
, 0,
1667 indirect_uniform_load(c
, instr
));
/* UBO loads: per component, write the UBO address plus byte offset to the
 * TMU address register, then read the returned value with LDTMU.
 */
1671 case nir_intrinsic_load_ubo
:
1672 for (int i
= 0; i
< instr
->num_components
; i
++) {
1673 int ubo
= nir_src_as_const_value(instr
->src
[0])->u32
[0];
1675 /* Adjust for where we stored the TGSI register base. */
1677 vir_reg(QFILE_MAGIC
, V3D_QPU_WADDR_TMUA
),
1678 vir_uniform(c
, QUNIFORM_UBO_ADDR
, 1 + ubo
),
1680 ntq_get_src(c
, instr
->src
[1], 0),
1681 vir_uniform_ui(c
, i
* 4)));
1683 ntq_store_dest(c
, &instr
->dest
, i
, vir_LDTMU(c
));
/* NOTE(review): the case label this uniform-style load belongs to was lost
 * from the source text (original line ~1686) — confirm against upstream.
 */
1687 const_offset
= nir_src_as_const_value(instr
->src
[0]);
1689 offset
= nir_intrinsic_base(instr
) + const_offset
->u32
[0];
1690 assert(offset
% 4 == 0);
1691 /* We need dwords */
1692 offset
= offset
/ 4;
1693 ntq_store_dest(c
, &instr
->dest
, 0,
1694 vir_uniform(c
, QUNIFORM_UNIFORM
,
1697 ntq_store_dest(c
, &instr
->dest
, 0,
1698 indirect_uniform_load(c
, instr
));
/* System values sourced from driver-supplied uniforms. */
1702 case nir_intrinsic_load_user_clip_plane
:
1703 for (int i
= 0; i
< instr
->num_components
; i
++) {
1704 ntq_store_dest(c
, &instr
->dest
, i
,
1705 vir_uniform(c
, QUNIFORM_USER_CLIP_PLANE
,
1706 nir_intrinsic_ucp_id(instr
) *
1711 case nir_intrinsic_load_alpha_ref_float
:
1712 ntq_store_dest(c
, &instr
->dest
, 0,
1713 vir_uniform(c
, QUNIFORM_ALPHA_REF
, 0));
1716 case nir_intrinsic_load_sample_mask_in
:
1717 ntq_store_dest(c
, &instr
->dest
, 0,
1718 vir_uniform(c
, QUNIFORM_SAMPLE_MASK
, 0));
1721 case nir_intrinsic_load_front_face
:
1722 /* The register contains 0 (front) or 1 (back), and we need to
1723 * turn it into a NIR bool where true means front.
1725 ntq_store_dest(c
, &instr
->dest
, 0,
1727 vir_uniform_ui(c
, -1),
1731 case nir_intrinsic_load_instance_id
:
1732 ntq_store_dest(c
, &instr
->dest
, 0, vir_MOV(c
, c
->iid
));
1735 case nir_intrinsic_load_vertex_id
:
1736 ntq_store_dest(c
, &instr
->dest
, 0, vir_MOV(c
, c
->vid
));
/* Shader inputs/outputs, already laid out in c->inputs/c->outputs by
 * ntq_setup_inputs()/ntq_setup_outputs().
 */
1739 case nir_intrinsic_load_input
:
1740 const_offset
= nir_src_as_const_value(instr
->src
[0]);
1741 assert(const_offset
&& "v3d doesn't support indirect inputs");
1742 for (int i
= 0; i
< instr
->num_components
; i
++) {
1743 offset
= nir_intrinsic_base(instr
) + const_offset
->u32
[0];
1744 int comp
= nir_intrinsic_component(instr
) + i
;
1745 ntq_store_dest(c
, &instr
->dest
, i
,
1746 vir_MOV(c
, c
->inputs
[offset
* 4 + comp
]));
1750 case nir_intrinsic_store_output
:
1751 const_offset
= nir_src_as_const_value(instr
->src
[1]);
1752 assert(const_offset
&& "v3d doesn't support indirect outputs");
1753 offset
= ((nir_intrinsic_base(instr
) +
1754 const_offset
->u32
[0]) * 4 +
1755 nir_intrinsic_component(instr
));
1757 for (int i
= 0; i
< instr
->num_components
; i
++) {
1758 c
->outputs
[offset
+ i
] =
1759 vir_MOV(c
, ntq_get_src(c
, instr
->src
[0], i
));
1761 c
->num_outputs
= MAX2(c
->num_outputs
,
1762 offset
+ instr
->num_components
);
/* Discards clear the multisample flags (SETMSF) for the affected channels;
 * inside non-uniform control flow that is predicated on c->execute.
 */
1765 case nir_intrinsic_discard
:
1766 if (c
->execute
.file
!= QFILE_NULL
) {
1767 vir_PF(c
, c
->execute
, V3D_QPU_PF_PUSHZ
);
1768 vir_set_cond(vir_SETMSF_dest(c
, vir_reg(QFILE_NULL
, 0),
1769 vir_uniform_ui(c
, 0)),
1772 vir_SETMSF_dest(c
, vir_reg(QFILE_NULL
, 0),
1773 vir_uniform_ui(c
, 0));
1777 case nir_intrinsic_discard_if
: {
1778 /* true (~0) if we're discarding */
1779 struct qreg cond
= ntq_get_src(c
, instr
->src
[0], 0);
1781 if (c
->execute
.file
!= QFILE_NULL
) {
1782 /* execute == 0 means the channel is active. Invert
1783 * the condition so that we can use zero as "executing
1786 vir_PF(c
, vir_OR(c
, c
->execute
, vir_NOT(c
, cond
)),
1788 vir_set_cond(vir_SETMSF_dest(c
, vir_reg(QFILE_NULL
, 0),
1789 vir_uniform_ui(c
, 0)),
1792 vir_PF(c
, cond
, V3D_QPU_PF_PUSHZ
);
1793 vir_set_cond(vir_SETMSF_dest(c
, vir_reg(QFILE_NULL
, 0),
1794 vir_uniform_ui(c
, 0)),
1802 fprintf(stderr
, "Unknown intrinsic: ");
1803 nir_print_instr(&instr
->instr
, stderr
);
1804 fprintf(stderr
, "\n");
1809 /* Clears (activates) the execute flags for any channels whose jump target
1810 * matches this block.
1813 ntq_activate_execute_for_block(struct v3d_compile
*c
)
1815 vir_PF(c
, vir_SUB(c
, c
->execute
, vir_uniform_ui(c
, c
->cur_block
->index
)),
1818 vir_MOV_cond(c
, V3D_QPU_COND_IFA
, c
->execute
, vir_uniform_ui(c
, 0));
1822 ntq_emit_if(struct v3d_compile
*c
, nir_if
*if_stmt
)
1824 nir_block
*nir_else_block
= nir_if_first_else_block(if_stmt
);
1825 bool empty_else_block
=
1826 (nir_else_block
== nir_if_last_else_block(if_stmt
) &&
1827 exec_list_is_empty(&nir_else_block
->instr_list
));
1829 struct qblock
*then_block
= vir_new_block(c
);
1830 struct qblock
*after_block
= vir_new_block(c
);
1831 struct qblock
*else_block
;
1832 if (empty_else_block
)
1833 else_block
= after_block
;
1835 else_block
= vir_new_block(c
);
1837 bool was_top_level
= false;
1838 if (c
->execute
.file
== QFILE_NULL
) {
1839 c
->execute
= vir_MOV(c
, vir_uniform_ui(c
, 0));
1840 was_top_level
= true;
1843 /* Set A for executing (execute == 0) and jumping (if->condition ==
1844 * 0) channels, and then update execute flags for those to point to
1849 ntq_get_src(c
, if_stmt
->condition
, 0)),
1851 vir_MOV_cond(c
, V3D_QPU_COND_IFA
,
1853 vir_uniform_ui(c
, else_block
->index
));
1855 /* Jump to ELSE if nothing is active for THEN, otherwise fall
1858 vir_PF(c
, c
->execute
, V3D_QPU_PF_PUSHZ
);
1859 vir_BRANCH(c
, V3D_QPU_BRANCH_COND_ALLNA
);
1860 vir_link_blocks(c
->cur_block
, else_block
);
1861 vir_link_blocks(c
->cur_block
, then_block
);
1863 /* Process the THEN block. */
1864 vir_set_emit_block(c
, then_block
);
1865 ntq_emit_cf_list(c
, &if_stmt
->then_list
);
1867 if (!empty_else_block
) {
1868 /* Handle the end of the THEN block. First, all currently
1869 * active channels update their execute flags to point to
1872 vir_PF(c
, c
->execute
, V3D_QPU_PF_PUSHZ
);
1873 vir_MOV_cond(c
, V3D_QPU_COND_IFA
, c
->execute
,
1874 vir_uniform_ui(c
, after_block
->index
));
1876 /* If everything points at ENDIF, then jump there immediately. */
1877 vir_PF(c
, vir_SUB(c
, c
->execute
,
1878 vir_uniform_ui(c
, after_block
->index
)),
1880 vir_BRANCH(c
, V3D_QPU_BRANCH_COND_ALLA
);
1881 vir_link_blocks(c
->cur_block
, after_block
);
1882 vir_link_blocks(c
->cur_block
, else_block
);
1884 vir_set_emit_block(c
, else_block
);
1885 ntq_activate_execute_for_block(c
);
1886 ntq_emit_cf_list(c
, &if_stmt
->else_list
);
1889 vir_link_blocks(c
->cur_block
, after_block
);
1891 vir_set_emit_block(c
, after_block
);
1893 c
->execute
= c
->undef
;
1895 ntq_activate_execute_for_block(c
);
1899 ntq_emit_jump(struct v3d_compile
*c
, nir_jump_instr
*jump
)
1901 switch (jump
->type
) {
1902 case nir_jump_break
:
1903 vir_PF(c
, c
->execute
, V3D_QPU_PF_PUSHZ
);
1904 vir_MOV_cond(c
, V3D_QPU_COND_IFA
, c
->execute
,
1905 vir_uniform_ui(c
, c
->loop_break_block
->index
));
1908 case nir_jump_continue
:
1909 vir_PF(c
, c
->execute
, V3D_QPU_PF_PUSHZ
);
1910 vir_MOV_cond(c
, V3D_QPU_COND_IFA
, c
->execute
,
1911 vir_uniform_ui(c
, c
->loop_cont_block
->index
));
1914 case nir_jump_return
:
1915 unreachable("All returns shouold be lowered\n");
1920 ntq_emit_instr(struct v3d_compile
*c
, nir_instr
*instr
)
1922 switch (instr
->type
) {
1923 case nir_instr_type_alu
:
1924 ntq_emit_alu(c
, nir_instr_as_alu(instr
));
1927 case nir_instr_type_intrinsic
:
1928 ntq_emit_intrinsic(c
, nir_instr_as_intrinsic(instr
));
1931 case nir_instr_type_load_const
:
1932 ntq_emit_load_const(c
, nir_instr_as_load_const(instr
));
1935 case nir_instr_type_ssa_undef
:
1936 ntq_emit_ssa_undef(c
, nir_instr_as_ssa_undef(instr
));
1939 case nir_instr_type_tex
:
1940 ntq_emit_tex(c
, nir_instr_as_tex(instr
));
1943 case nir_instr_type_jump
:
1944 ntq_emit_jump(c
, nir_instr_as_jump(instr
));
1948 fprintf(stderr
, "Unknown NIR instr type: ");
1949 nir_print_instr(instr
, stderr
);
1950 fprintf(stderr
, "\n");
1956 ntq_emit_block(struct v3d_compile
*c
, nir_block
*block
)
1958 nir_foreach_instr(instr
, block
) {
1959 ntq_emit_instr(c
, instr
);
1963 static void ntq_emit_cf_list(struct v3d_compile
*c
, struct exec_list
*list
);
1966 ntq_emit_loop(struct v3d_compile
*c
, nir_loop
*loop
)
1968 bool was_top_level
= false;
1969 if (c
->execute
.file
== QFILE_NULL
) {
1970 c
->execute
= vir_MOV(c
, vir_uniform_ui(c
, 0));
1971 was_top_level
= true;
1974 struct qblock
*save_loop_cont_block
= c
->loop_cont_block
;
1975 struct qblock
*save_loop_break_block
= c
->loop_break_block
;
1977 c
->loop_cont_block
= vir_new_block(c
);
1978 c
->loop_break_block
= vir_new_block(c
);
1980 vir_link_blocks(c
->cur_block
, c
->loop_cont_block
);
1981 vir_set_emit_block(c
, c
->loop_cont_block
);
1982 ntq_activate_execute_for_block(c
);
1984 ntq_emit_cf_list(c
, &loop
->body
);
1986 /* Re-enable any previous continues now, so our ANYA check below
1989 * XXX: Use the .ORZ flags update, instead.
1991 vir_PF(c
, vir_SUB(c
,
1993 vir_uniform_ui(c
, c
->loop_cont_block
->index
)),
1995 vir_MOV_cond(c
, V3D_QPU_COND_IFA
, c
->execute
, vir_uniform_ui(c
, 0));
1997 vir_PF(c
, c
->execute
, V3D_QPU_PF_PUSHZ
);
1999 vir_BRANCH(c
, V3D_QPU_BRANCH_COND_ANYA
);
2000 vir_link_blocks(c
->cur_block
, c
->loop_cont_block
);
2001 vir_link_blocks(c
->cur_block
, c
->loop_break_block
);
2003 vir_set_emit_block(c
, c
->loop_break_block
);
2005 c
->execute
= c
->undef
;
2007 ntq_activate_execute_for_block(c
);
2009 c
->loop_break_block
= save_loop_break_block
;
2010 c
->loop_cont_block
= save_loop_cont_block
;
2014 ntq_emit_function(struct v3d_compile
*c
, nir_function_impl
*func
)
2016 fprintf(stderr
, "FUNCTIONS not handled.\n");
2021 ntq_emit_cf_list(struct v3d_compile
*c
, struct exec_list
*list
)
2023 foreach_list_typed(nir_cf_node
, node
, node
, list
) {
2024 switch (node
->type
) {
2025 case nir_cf_node_block
:
2026 ntq_emit_block(c
, nir_cf_node_as_block(node
));
2029 case nir_cf_node_if
:
2030 ntq_emit_if(c
, nir_cf_node_as_if(node
));
2033 case nir_cf_node_loop
:
2034 ntq_emit_loop(c
, nir_cf_node_as_loop(node
));
2037 case nir_cf_node_function
:
2038 ntq_emit_function(c
, nir_cf_node_as_function(node
));
2042 fprintf(stderr
, "Unknown NIR node type\n");
2049 ntq_emit_impl(struct v3d_compile
*c
, nir_function_impl
*impl
)
2051 ntq_setup_registers(c
, &impl
->registers
);
2052 ntq_emit_cf_list(c
, &impl
->body
);
2056 nir_to_vir(struct v3d_compile
*c
)
2058 if (c
->s
->info
.stage
== MESA_SHADER_FRAGMENT
) {
2059 c
->payload_w
= vir_MOV(c
, vir_reg(QFILE_REG
, 0));
2060 c
->payload_w_centroid
= vir_MOV(c
, vir_reg(QFILE_REG
, 1));
2061 c
->payload_z
= vir_MOV(c
, vir_reg(QFILE_REG
, 2));
2063 if (c
->fs_key
->is_points
) {
2064 c
->point_x
= emit_fragment_varying(c
, NULL
, 0);
2065 c
->point_y
= emit_fragment_varying(c
, NULL
, 0);
2066 } else if (c
->fs_key
->is_lines
) {
2067 c
->line_x
= emit_fragment_varying(c
, NULL
, 0);
2071 ntq_setup_inputs(c
);
2072 ntq_setup_outputs(c
);
2073 ntq_setup_uniforms(c
);
2074 ntq_setup_registers(c
, &c
->s
->registers
);
2076 /* Find the main function and emit the body. */
2077 nir_foreach_function(function
, c
->s
) {
2078 assert(strcmp(function
->name
, "main") == 0);
2079 assert(function
->impl
);
2080 ntq_emit_impl(c
, function
->impl
);
/* NIR compiler options for the v3d backend: lower the pack/unpack and
 * bitfield ops the hardware lacks, and treat integers as native.
 *
 * NOTE(review): the source text lost several initializer lines here
 * (original lines 2095-2096 and 2098-2099, plus the closing brace), so the
 * initializer is left byte-identical; recover the missing entries from
 * upstream before editing.
 */
2084 const nir_shader_compiler_options v3d_nir_options
= {
2085 .lower_extract_byte
= true,
2086 .lower_extract_word
= true,
2087 .lower_bitfield_insert
= true,
2088 .lower_bitfield_extract
= true,
2089 .lower_pack_unorm_2x16
= true,
2090 .lower_pack_snorm_2x16
= true,
2091 .lower_pack_unorm_4x8
= true,
2092 .lower_pack_snorm_4x8
= true,
2093 .lower_unpack_unorm_4x8
= true,
2094 .lower_unpack_snorm_4x8
= true,
2097 .lower_flrp32
= true,
2100 .lower_fsqrt
= true,
2101 .native_integers
= true,
2107 count_nir_instrs(nir_shader
*nir
)
2110 nir_foreach_function(function
, nir
) {
2111 if (!function
->impl
)
2113 nir_foreach_block(block
, function
->impl
) {
2114 nir_foreach_instr(instr
, block
)
2123 v3d_nir_to_vir(struct v3d_compile
*c
)
2125 if (V3D_DEBUG
& (V3D_DEBUG_NIR
|
2126 v3d_debug_flag_for_shader_stage(c
->s
->info
.stage
))) {
2127 fprintf(stderr
, "%s prog %d/%d NIR:\n",
2128 vir_get_stage_name(c
),
2129 c
->program_id
, c
->variant_id
);
2130 nir_print_shader(c
->s
, stderr
);
2135 switch (c
->s
->info
.stage
) {
2136 case MESA_SHADER_FRAGMENT
:
2139 case MESA_SHADER_VERTEX
:
2143 unreachable("bad stage");
2146 if (V3D_DEBUG
& (V3D_DEBUG_VIR
|
2147 v3d_debug_flag_for_shader_stage(c
->s
->info
.stage
))) {
2148 fprintf(stderr
, "%s prog %d/%d pre-opt VIR:\n",
2149 vir_get_stage_name(c
),
2150 c
->program_id
, c
->variant_id
);
2152 fprintf(stderr
, "\n");
2156 vir_lower_uniforms(c
);
2158 /* XXX: vir_schedule_instructions(c); */
2160 if (V3D_DEBUG
& (V3D_DEBUG_VIR
|
2161 v3d_debug_flag_for_shader_stage(c
->s
->info
.stage
))) {
2162 fprintf(stderr
, "%s prog %d/%d VIR:\n",
2163 vir_get_stage_name(c
),
2164 c
->program_id
, c
->variant_id
);
2166 fprintf(stderr
, "\n");