/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/* Reports a validation failure: prints the message and a disassembly of the
 * offending instruction to stderr, then aborts.
 *
 * NOTE(review): the abort() terminator is not visible in the mangled
 * extraction but is the conventional fatal behavior for this helper —
 * confirm against upstream.
 */
static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}
37 writes_reg(uint64_t inst
, uint32_t w
)
39 return (QPU_GET_FIELD(inst
, QPU_WADDR_ADD
) == w
||
40 QPU_GET_FIELD(inst
, QPU_WADDR_MUL
) == w
);
44 _reads_reg(uint64_t inst
, uint32_t r
, bool ignore_a
, bool ignore_b
)
49 { QPU_GET_FIELD(inst
, QPU_ADD_A
) },
50 { QPU_GET_FIELD(inst
, QPU_ADD_B
) },
51 { QPU_GET_FIELD(inst
, QPU_MUL_A
) },
52 { QPU_GET_FIELD(inst
, QPU_MUL_B
) },
55 /* Branches only reference raddr_a (no mux), and we don't use that
56 * feature of branching.
58 if (QPU_GET_FIELD(inst
, QPU_SIG
) == QPU_SIG_BRANCH
)
61 /* Load immediates don't read any registers. */
62 if (QPU_GET_FIELD(inst
, QPU_SIG
) == QPU_SIG_LOAD_IMM
)
65 for (int i
= 0; i
< ARRAY_SIZE(src_regs
); i
++) {
67 src_regs
[i
].mux
== QPU_MUX_A
&&
68 (QPU_GET_FIELD(inst
, QPU_RADDR_A
) == r
))
72 QPU_GET_FIELD(inst
, QPU_SIG
) != QPU_SIG_SMALL_IMM
&&
73 src_regs
[i
].mux
== QPU_MUX_B
&&
74 (QPU_GET_FIELD(inst
, QPU_RADDR_B
) == r
))
/* Returns true if the instruction reads register address r from either
 * regfile A or regfile B.
 */
static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}
/* Returns true if the instruction reads register address r from regfile A
 * (regfile B accesses are ignored).
 */
static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}
/* Returns true if the instruction reads register address r from regfile B
 * (regfile A accesses are ignored).
 */
static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}
100 writes_sfu(uint64_t inst
)
102 return (writes_reg(inst
, QPU_W_SFU_RECIP
) ||
103 writes_reg(inst
, QPU_W_SFU_RECIPSQRT
) ||
104 writes_reg(inst
, QPU_W_SFU_EXP
) ||
105 writes_reg(inst
, QPU_W_SFU_LOG
));
109 * Checks for the instruction restrictions from page 37 ("Summary of
110 * Instruction Restrictions").
113 vc4_qpu_validate(uint64_t *insts
, uint32_t num_inst
)
115 bool scoreboard_locked
= false;
116 bool threaded
= false;
118 /* We don't want to do validation in release builds, but we want to
119 * keep compiling the validation code to make sure it doesn't get
126 for (int i
= 0; i
< num_inst
; i
++) {
127 uint64_t inst
= insts
[i
];
128 uint32_t sig
= QPU_GET_FIELD(inst
, QPU_SIG
);
130 if (sig
!= QPU_SIG_PROG_END
) {
131 if (qpu_inst_is_tlb(inst
))
132 scoreboard_locked
= true;
134 if (sig
== QPU_SIG_THREAD_SWITCH
||
135 sig
== QPU_SIG_LAST_THREAD_SWITCH
) {
142 /* "The Thread End instruction must not write to either physical
145 if (QPU_GET_FIELD(inst
, QPU_WADDR_ADD
) < 32 ||
146 QPU_GET_FIELD(inst
, QPU_WADDR_MUL
) < 32) {
147 fail_instr(inst
, "write to phys reg in thread end");
150 /* Can't trigger an implicit wait on scoreboard in the program
153 if (qpu_inst_is_tlb(inst
) && !scoreboard_locked
)
154 fail_instr(inst
, "implicit sb wait in program end");
156 /* Two delay slots will be executed. */
157 assert(i
+ 2 <= num_inst
);
159 for (int j
= i
; j
< i
+ 2; j
++) {
160 /* "The last three instructions of any program
161 * (Thread End plus the following two delay-slot
162 * instructions) must not do varyings read, uniforms
163 * read or any kind of VPM, VDR, or VDW read or
166 if (writes_reg(insts
[j
], QPU_W_VPM
) ||
167 reads_reg(insts
[j
], QPU_R_VARY
) ||
168 reads_reg(insts
[j
], QPU_R_UNIF
) ||
169 reads_reg(insts
[j
], QPU_R_VPM
)) {
170 fail_instr(insts
[j
], "last 3 instructions "
171 "using fixed functions");
174 /* "The Thread End instruction and the following two
175 * delay slot instructions must not write or read
176 * address 14 in either regfile A or B."
178 if (writes_reg(insts
[j
], 14) ||
179 reads_reg(insts
[j
], 14)) {
180 fail_instr(insts
[j
], "last 3 instructions "
185 /* "The final program instruction (the second delay slot
186 * instruction) must not do a TLB Z write."
188 if (writes_reg(insts
[i
+ 2], QPU_W_TLB_Z
)) {
189 fail_instr(insts
[i
+ 2], "final instruction doing "
194 /* "A scoreboard wait must not occur in the first two instructions of
195 * a fragment shader. This is either the explicit Wait for Scoreboard
196 * signal or an implicit wait with the first tile-buffer read or
197 * write instruction."
199 for (int i
= 0; i
< 2; i
++) {
200 uint64_t inst
= insts
[i
];
202 if (qpu_inst_is_tlb(inst
))
203 fail_instr(inst
, "sb wait in first two insts");
206 /* "If TMU_NOSWAP is written, the write must be three instructions
207 * before the first TMU write instruction. For example, if
208 * TMU_NOSWAP is written in the first shader instruction, the first
209 * TMU write cannot occur before the 4th shader instruction."
211 int last_tmu_noswap
= -10;
212 for (int i
= 0; i
< num_inst
; i
++) {
213 uint64_t inst
= insts
[i
];
215 if ((i
- last_tmu_noswap
) <= 3 &&
216 (writes_reg(inst
, QPU_W_TMU0_S
) ||
217 writes_reg(inst
, QPU_W_TMU1_S
))) {
218 fail_instr(inst
, "TMU write too soon after TMU_NOSWAP");
221 if (writes_reg(inst
, QPU_W_TMU_NOSWAP
))
225 /* "An instruction must not read from a location in physical regfile A
226 * or B that was written to by the previous instruction."
228 for (int i
= 0; i
< num_inst
- 1; i
++) {
229 uint64_t inst
= insts
[i
];
230 uint32_t add_waddr
= QPU_GET_FIELD(inst
, QPU_WADDR_ADD
);
231 uint32_t mul_waddr
= QPU_GET_FIELD(inst
, QPU_WADDR_MUL
);
232 uint32_t waddr_a
, waddr_b
;
242 if ((waddr_a
< 32 && reads_a_reg(insts
[i
+ 1], waddr_a
)) ||
243 (waddr_b
< 32 && reads_b_reg(insts
[i
+ 1], waddr_b
))) {
244 fail_instr(insts
[i
+ 1],
245 "Reads physical reg too soon after write");
249 /* "After an SFU lookup instruction, accumulator r4 must not be read
250 * in the following two instructions. Any other instruction that
251 * results in r4 being written (that is, TMU read, TLB read, SFU
252 * lookup) cannot occur in the two instructions following an SFU
255 int last_sfu_inst
= -10;
256 for (int i
= 0; i
< num_inst
- 1; i
++) {
257 uint64_t inst
= insts
[i
];
258 uint32_t sig
= QPU_GET_FIELD(inst
, QPU_SIG
);
260 if (i
- last_sfu_inst
<= 2 &&
262 sig
== QPU_SIG_LOAD_TMU0
||
263 sig
== QPU_SIG_LOAD_TMU1
||
264 sig
== QPU_SIG_COLOR_LOAD
)) {
265 fail_instr(inst
, "R4 write too soon after SFU write");
268 if (writes_sfu(inst
))
272 for (int i
= 0; i
< num_inst
- 1; i
++) {
273 uint64_t inst
= insts
[i
];
275 if (QPU_GET_FIELD(inst
, QPU_SIG
) == QPU_SIG_SMALL_IMM
&&
276 QPU_GET_FIELD(inst
, QPU_SMALL_IMM
) >=
277 QPU_SMALL_IMM_MUL_ROT
) {
278 uint32_t mux_a
= QPU_GET_FIELD(inst
, QPU_MUL_A
);
279 uint32_t mux_b
= QPU_GET_FIELD(inst
, QPU_MUL_B
);
281 /* "The full horizontal vector rotate is only
282 * available when both of the mul ALU input arguments
283 * are taken from accumulators r0-r3."
285 if (mux_a
> QPU_MUX_R3
|| mux_b
> QPU_MUX_R3
) {
287 "MUL rotate using non-accumulator "
291 if (QPU_GET_FIELD(inst
, QPU_SMALL_IMM
) ==
292 QPU_SMALL_IMM_MUL_ROT
) {
293 /* "An instruction that does a vector rotate
294 * by r5 must not immediately follow an
295 * instruction that writes to r5."
297 if (writes_reg(insts
[i
- 1], QPU_W_ACC5
)) {
299 "vector rotate by r5 "
300 "immediately after r5 write");
304 /* "An instruction that does a vector rotate must not
305 * immediately follow an instruction that writes to the
306 * accumulator that is being rotated."
308 if (writes_reg(insts
[i
- 1], QPU_W_ACC0
+ mux_a
) ||
309 writes_reg(insts
[i
- 1], QPU_W_ACC0
+ mux_b
)) {
311 "vector rotate of value "
312 "written in previous instruction");
317 /* "An instruction that does a vector rotate must not immediately
318 * follow an instruction that writes to the accumulator that is being
324 /* "After an instruction that does a TLB Z write, the multisample mask
325 * must not be read as an instruction input argument in the following
326 * two instruction. The TLB Z write instruction can, however, be
327 * followed immediately by a TLB color write."
329 for (int i
= 0; i
< num_inst
- 1; i
++) {
330 uint64_t inst
= insts
[i
];
331 if (writes_reg(inst
, QPU_W_TLB_Z
) &&
332 (reads_a_reg(insts
[i
+ 1], QPU_R_MS_REV_FLAGS
) ||
333 reads_a_reg(insts
[i
+ 2], QPU_R_MS_REV_FLAGS
))) {
334 fail_instr(inst
, "TLB Z write followed by MS mask read");
339 * "A single instruction can only perform a maximum of one of the
340 * following closely coupled peripheral accesses in a single
341 * instruction: TMU write, TMU read, TLB write, TLB read, TLB
342 * combined color read and write, SFU write, Mutex read or Semaphore
345 for (int i
= 0; i
< num_inst
- 1; i
++) {
346 uint64_t inst
= insts
[i
];
348 if (qpu_num_sf_accesses(inst
) > 1)
349 fail_instr(inst
, "Single instruction writes SFU twice");
352 /* "The uniform base pointer can be written (from SIMD element 0) by
353 * the processor to reset the stream, there must be at least two
354 * nonuniform-accessing instructions following a pointer change
355 * before uniforms can be accessed once more."
357 int last_unif_pointer_update
= -3;
358 for (int i
= 0; i
< num_inst
; i
++) {
359 uint64_t inst
= insts
[i
];
360 uint32_t waddr_add
= QPU_GET_FIELD(inst
, QPU_WADDR_ADD
);
361 uint32_t waddr_mul
= QPU_GET_FIELD(inst
, QPU_WADDR_MUL
);
363 if (reads_reg(inst
, QPU_R_UNIF
) &&
364 i
- last_unif_pointer_update
<= 2) {
366 "uniform read too soon after pointer update");
369 if (waddr_add
== QPU_W_UNIFORMS_ADDRESS
||
370 waddr_mul
== QPU_W_UNIFORMS_ADDRESS
)
371 last_unif_pointer_update
= i
;
375 bool last_thrsw_found
= false;
376 bool scoreboard_locked
= false;
377 int tex_samples_outstanding
= 0;
378 int last_tex_samples_outstanding
= 0;
381 for (int i
= 0; i
< num_inst
; i
++) {
382 uint64_t inst
= insts
[i
];
383 uint32_t sig
= QPU_GET_FIELD(inst
, QPU_SIG
);
386 /* In order to get texture results back in the
387 * correct order, before a new thrsw we have
388 * to read all the texture results from before
389 * the previous thrsw.
391 * FIXME: Is collecting the remaining results
392 * during the delay slots OK, or should we do
393 * this at THRSW signal time?
395 if (last_tex_samples_outstanding
!= 0) {
396 fail_instr(inst
, "THRSW with texture "
397 "results from the previous "
398 "THRSW still in the FIFO.");
401 last_tex_samples_outstanding
=
402 tex_samples_outstanding
;
403 tex_samples_outstanding
= 0;
406 if (qpu_inst_is_tlb(inst
))
407 scoreboard_locked
= true;
410 case QPU_SIG_THREAD_SWITCH
:
411 case QPU_SIG_LAST_THREAD_SWITCH
:
412 /* No thread switching with the scoreboard
413 * locked. Doing so means we may deadlock
414 * when the other thread tries to lock
417 if (scoreboard_locked
) {
418 fail_instr(inst
, "THRSW with the "
419 "scoreboard locked.");
422 /* No thread switching after lthrsw, since
423 * lthrsw means that we get delayed until the
424 * other shader is ready for us to terminate.
426 if (last_thrsw_found
) {
427 fail_instr(inst
, "THRSW after a "
431 if (sig
== QPU_SIG_LAST_THREAD_SWITCH
)
432 last_thrsw_found
= true;
434 /* No THRSW while we already have a THRSW
439 "THRSW with a THRSW queued.");
445 case QPU_SIG_LOAD_TMU0
:
446 case QPU_SIG_LOAD_TMU1
:
447 if (last_tex_samples_outstanding
== 0) {
448 fail_instr(inst
, "TMU load with nothing "
449 "in the results fifo from "
450 "the previous THRSW.");
453 last_tex_samples_outstanding
--;
457 uint32_t waddr_add
= QPU_GET_FIELD(inst
, QPU_WADDR_ADD
);
458 uint32_t waddr_mul
= QPU_GET_FIELD(inst
, QPU_WADDR_MUL
);
459 if (waddr_add
== QPU_W_TMU0_S
||
460 waddr_add
== QPU_W_TMU1_S
||
461 waddr_mul
== QPU_W_TMU0_S
||
462 waddr_mul
== QPU_W_TMU1_S
) {
463 tex_samples_outstanding
++;