/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vc4_qpu.h"
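
/* Prints the failure message and the disassembled instruction to stderr,
 * then aborts validation.
 */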
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}

static bool
writes_reg(uint64_t inst, uint32_t w)
{
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
}
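
/* Returns whether the instruction reads raddr "r" through any of the four
 * ALU input muxes.  ignore_a/ignore_b restrict the check to reads through
 * regfile B or regfile A, respectively.
 */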
static bool
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
{
        struct {
                uint32_t mux;
        } src_regs[] = {
                { QPU_GET_FIELD(inst, QPU_ADD_A) },
                { QPU_GET_FIELD(inst, QPU_ADD_B) },
                { QPU_GET_FIELD(inst, QPU_MUL_A) },
                { QPU_GET_FIELD(inst, QPU_MUL_B) },
        };

        /* Branches only reference raddr_a (no mux), and we don't use that
         * feature of branching.
         */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
                return false;

        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
                if (!ignore_a &&
                    src_regs[i].mux == QPU_MUX_A &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
                        return true;

                if (!ignore_b &&
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                    src_regs[i].mux == QPU_MUX_B &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                        return true;
        }

        return false;
}
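
/* Wrappers checking for a read through either regfile, just regfile A, or
 * just regfile B.
 */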
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}

static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}

static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}
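
/* SFU lookups of any of the four functions write their result to r4 two
 * instructions later.
 */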
static bool
writes_sfu(uint64_t inst)
{
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
                writes_reg(inst, QPU_W_SFU_EXP) ||
                writes_reg(inst, QPU_W_SFU_LOG));
}

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions").
 */
void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
        bool scoreboard_locked = false;
        bool threaded = false;

        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif
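
        /* Find the program-end instruction and validate it and its two delay
         * slots, tracking scoreboard and thread-switch state along the way.
         */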
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (sig != QPU_SIG_PROG_END) {
                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                threaded = true;
                        }

                        continue;
                }
138 /* "The Thread End instruction must not write to either physical
141 if (QPU_GET_FIELD(inst
, QPU_WADDR_ADD
) < 32 ||
142 QPU_GET_FIELD(inst
, QPU_WADDR_MUL
) < 32) {
143 fail_instr(inst
, "write to phys reg in thread end");
146 /* Can't trigger an implicit wait on scoreboard in the program
149 if (qpu_inst_is_tlb(inst
) && !scoreboard_locked
)
150 fail_instr(inst
, "implicit sb wait in program end");
152 /* Two delay slots will be executed. */
153 assert(i
+ 2 <= num_inst
);
155 for (int j
= i
; j
< i
+ 2; j
++) {
156 /* "The last three instructions of any program
157 * (Thread End plus the following two delay-slot
158 * instructions) must not do varyings read, uniforms
159 * read or any kind of VPM, VDR, or VDW read or
162 if (writes_reg(insts
[j
], QPU_W_VPM
) ||
163 reads_reg(insts
[j
], QPU_R_VARY
) ||
164 reads_reg(insts
[j
], QPU_R_UNIF
) ||
165 reads_reg(insts
[j
], QPU_R_VPM
)) {
166 fail_instr(insts
[j
], "last 3 instructions "
167 "using fixed functions");
170 /* "The Thread End instruction and the following two
171 * delay slot instructions must not write or read
172 * address 14 in either regfile A or B."
174 if (writes_reg(insts
[j
], 14) ||
175 reads_reg(insts
[j
], 14)) {
176 fail_instr(insts
[j
], "last 3 instructions "
181 /* "The final program instruction (the second delay slot
182 * instruction) must not do a TLB Z write."
184 if (writes_reg(insts
[i
+ 2], QPU_W_TLB_Z
)) {
185 fail_instr(insts
[i
+ 2], "final instruction doing "
190 /* "A scoreboard wait must not occur in the first two instructions of
191 * a fragment shader. This is either the explicit Wait for Scoreboard
192 * signal or an implicit wait with the first tile-buffer read or
193 * write instruction."
195 for (int i
= 0; i
< 2; i
++) {
196 uint64_t inst
= insts
[i
];
198 if (qpu_inst_is_tlb(inst
))
199 fail_instr(inst
, "sb wait in first two insts");
202 /* "If TMU_NOSWAP is written, the write must be three instructions
203 * before the first TMU write instruction. For example, if
204 * TMU_NOSWAP is written in the first shader instruction, the first
205 * TMU write cannot occur before the 4th shader instruction."
207 int last_tmu_noswap
= -10;
208 for (int i
= 0; i
< num_inst
; i
++) {
209 uint64_t inst
= insts
[i
];
211 if ((i
- last_tmu_noswap
) <= 3 &&
212 (writes_reg(inst
, QPU_W_TMU0_S
) ||
213 writes_reg(inst
, QPU_W_TMU1_S
))) {
214 fail_instr(inst
, "TMU write too soon after TMU_NOSWAP");
217 if (writes_reg(inst
, QPU_W_TMU_NOSWAP
))
221 /* "An instruction must not read from a location in physical regfile A
222 * or B that was written to by the previous instruction."
224 for (int i
= 0; i
< num_inst
- 1; i
++) {
225 uint64_t inst
= insts
[i
];
226 uint32_t add_waddr
= QPU_GET_FIELD(inst
, QPU_WADDR_ADD
);
227 uint32_t mul_waddr
= QPU_GET_FIELD(inst
, QPU_WADDR_MUL
);
228 uint32_t waddr_a
, waddr_b
;
238 if ((waddr_a
< 32 && reads_a_reg(insts
[i
+ 1], waddr_a
)) ||
239 (waddr_b
< 32 && reads_b_reg(insts
[i
+ 1], waddr_b
))) {
240 fail_instr(insts
[i
+ 1],
241 "Reads physical reg too soon after write");
245 /* "After an SFU lookup instruction, accumulator r4 must not be read
246 * in the following two instructions. Any other instruction that
247 * results in r4 being written (that is, TMU read, TLB read, SFU
248 * lookup) cannot occur in the two instructions following an SFU
251 int last_sfu_inst
= -10;
252 for (int i
= 0; i
< num_inst
- 1; i
++) {
253 uint64_t inst
= insts
[i
];
254 uint32_t sig
= QPU_GET_FIELD(inst
, QPU_SIG
);
256 if (i
- last_sfu_inst
<= 2 &&
258 sig
== QPU_SIG_LOAD_TMU0
||
259 sig
== QPU_SIG_LOAD_TMU1
||
260 sig
== QPU_SIG_COLOR_LOAD
)) {
261 fail_instr(inst
, "R4 write too soon after SFU write");
264 if (writes_sfu(inst
))
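
        /* Small immediates at or above QPU_SMALL_IMM_MUL_ROT encode a
         * horizontal vector rotate on the mul ALU, which has its own set of
         * restrictions.
         */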
        for (int i = 0; i < num_inst - 1; i++) {
                uint64_t inst = insts[i];

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
                    QPU_SMALL_IMM_MUL_ROT) {
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                        /* "The full horizontal vector rotate is only
                         * available when both of the mul ALU input arguments
                         * are taken from accumulators r0-r3."
                         */
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
                                fail_instr(inst,
                                           "MUL rotate using non-accumulator "
                                           "input");
                        }

                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
                            QPU_SMALL_IMM_MUL_ROT) {
                                /* "An instruction that does a vector rotate
                                 * by r5 must not immediately follow an
                                 * instruction that writes to r5."
                                 */
                                if (writes_reg(insts[i - 1], QPU_W_ACC5)) {
                                        fail_instr(inst,
                                                   "vector rotate by r5 "
                                                   "immediately after r5 write");
                                }
                        }

                        /* "An instruction that does a vector rotate must not
                         * immediately follow an instruction that writes to the
                         * accumulator that is being rotated."
                         */
                        if (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
                            writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b)) {
                                fail_instr(inst,
                                           "vector rotate of value "
                                           "written in previous instruction");
                        }
                }
        }
313 /* "An instruction that does a vector rotate must not immediately
314 * follow an instruction that writes to the accumulator that is being
320 /* "After an instruction that does a TLB Z write, the multisample mask
321 * must not be read as an instruction input argument in the following
322 * two instruction. The TLB Z write instruction can, however, be
323 * followed immediately by a TLB color write."
325 for (int i
= 0; i
< num_inst
- 1; i
++) {
326 uint64_t inst
= insts
[i
];
327 if (writes_reg(inst
, QPU_W_TLB_Z
) &&
328 (reads_a_reg(insts
[i
+ 1], QPU_R_MS_REV_FLAGS
) ||
329 reads_a_reg(insts
[i
+ 2], QPU_R_MS_REV_FLAGS
))) {
330 fail_instr(inst
, "TLB Z write followed by MS mask read");
335 * "A single instruction can only perform a maximum of one of the
336 * following closely coupled peripheral accesses in a single
337 * instruction: TMU write, TMU read, TLB write, TLB read, TLB
338 * combined color read and write, SFU write, Mutex read or Semaphore
341 for (int i
= 0; i
< num_inst
- 1; i
++) {
342 uint64_t inst
= insts
[i
];
344 if (qpu_num_sf_accesses(inst
) > 1)
345 fail_instr(inst
, "Single instruction writes SFU twice");
348 /* "The uniform base pointer can be written (from SIMD element 0) by
349 * the processor to reset the stream, there must be at least two
350 * nonuniform-accessing instructions following a pointer change
351 * before uniforms can be accessed once more."
353 int last_unif_pointer_update
= -3;
354 for (int i
= 0; i
< num_inst
; i
++) {
355 uint64_t inst
= insts
[i
];
356 uint32_t waddr_add
= QPU_GET_FIELD(inst
, QPU_WADDR_ADD
);
357 uint32_t waddr_mul
= QPU_GET_FIELD(inst
, QPU_WADDR_MUL
);
359 if (reads_reg(inst
, QPU_R_UNIF
) &&
360 i
- last_unif_pointer_update
<= 2) {
362 "uniform read too soon after pointer update");
365 if (waddr_add
== QPU_W_UNIFORMS_ADDRESS
||
366 waddr_mul
== QPU_W_UNIFORMS_ADDRESS
)
367 last_unif_pointer_update
= i
;
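
        /* For threaded fragment shaders, validate the thread-switch rules:
         * no switching with the scoreboard locked or with another switch
         * already queued, and the TMU result FIFO must be drained across
         * each switch.
         */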
        if (threaded) {
                bool last_thrsw_found = false;
                bool scoreboard_locked = false;
                int tex_samples_outstanding = 0;
                int last_tex_samples_outstanding = 0;
                int thrsw_ip = -1;

                for (int i = 0; i < num_inst; i++) {
                        uint64_t inst = insts[i];
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                        if (i == thrsw_ip) {
                                /* In order to get texture results back in the
                                 * correct order, before a new thrsw we have
                                 * to read all the texture results from before
                                 * the previous thrsw.
                                 *
                                 * FIXME: Is collecting the remaining results
                                 * during the delay slots OK, or should we do
                                 * this at THRSW signal time?
                                 */
                                if (last_tex_samples_outstanding != 0) {
                                        fail_instr(inst, "THRSW with texture "
                                                   "results from the previous "
                                                   "THRSW still in the FIFO.");
                                }

                                last_tex_samples_outstanding =
                                        tex_samples_outstanding;
                                tex_samples_outstanding = 0;
                        }

                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        switch (sig) {
                        case QPU_SIG_THREAD_SWITCH:
                        case QPU_SIG_LAST_THREAD_SWITCH:
                                /* No thread switching with the scoreboard
                                 * locked. Doing so means we may deadlock
                                 * when the other thread tries to lock
                                 * the scoreboard.
                                 */
                                if (scoreboard_locked) {
                                        fail_instr(inst, "THRSW with the "
                                                   "scoreboard locked.");
                                }

                                /* No thread switching after lthrsw, since
                                 * lthrsw means that we get delayed until the
                                 * other shader is ready for us to terminate.
                                 */
                                if (last_thrsw_found) {
                                        fail_instr(inst, "THRSW after a "
                                                   "previous LTHRSW");
                                }

                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
                                        last_thrsw_found = true;

                                /* No THRSW while we already have a THRSW
                                 * queued.
                                 */
                                if (i < thrsw_ip) {
                                        fail_instr(inst,
                                                   "THRSW with a THRSW queued.");
                                }

                                thrsw_ip = i + 3;
                                break;

                        case QPU_SIG_LOAD_TMU0:
                        case QPU_SIG_LOAD_TMU1:
                                if (last_tex_samples_outstanding == 0) {
                                        fail_instr(inst, "TMU load with nothing "
                                                   "in the results fifo from "
                                                   "the previous THRSW.");
                                }

                                last_tex_samples_outstanding--;
                                break;
                        }

                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                        if (waddr_add == QPU_W_TMU0_S ||
                            waddr_add == QPU_W_TMU1_S ||
                            waddr_mul == QPU_W_TMU0_S ||
                            waddr_mul == QPU_W_TMU1_S) {
                                tex_samples_outstanding++;
                        }
                }
        }
}