/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vc4_qpu.h"

static void
fail_instr(uint64_t inst, const char *msg)
{
        fprintf(stderr, "vc4_qpu_validate: %s: ", msg);
        vc4_qpu_disasm(&inst, 1);
        fprintf(stderr, "\n");
        abort();
}

static bool
writes_reg(uint64_t inst, uint32_t w)
{
        return (QPU_GET_FIELD(inst, QPU_WADDR_ADD) == w ||
                QPU_GET_FIELD(inst, QPU_WADDR_MUL) == w);
}

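/* Each instruction encodes a single raddr_a and raddr_b shared by the add
 * and mul ALUs; an ALU input only reads a physical register when its mux
 * field selects that regfile.  The ignore_a/ignore_b flags let callers check
 * read-after-write hazards against just one of the two files.
 */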
static bool
_reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b)
{
        struct {
                uint32_t mux;
        } src_regs[] = {
                { QPU_GET_FIELD(inst, QPU_ADD_A) },
                { QPU_GET_FIELD(inst, QPU_ADD_B) },
                { QPU_GET_FIELD(inst, QPU_MUL_A) },
                { QPU_GET_FIELD(inst, QPU_MUL_B) },
        };

        /* Branches only reference raddr_a (no mux), and we don't use that
         * feature of branching.
         */
        if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_BRANCH)
                return false;

        for (int i = 0; i < ARRAY_SIZE(src_regs); i++) {
                if (!ignore_a &&
                    src_regs[i].mux == QPU_MUX_A &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_A) == r))
                        return true;

                if (!ignore_b &&
                    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM &&
                    src_regs[i].mux == QPU_MUX_B &&
                    (QPU_GET_FIELD(inst, QPU_RADDR_B) == r))
                        return true;
        }

        return false;
}

static bool
reads_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, false);
}

static bool
reads_a_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, false, true);
}

static bool
reads_b_reg(uint64_t inst, uint32_t r)
{
        return _reads_reg(inst, r, true, false);
}

static bool
writes_sfu(uint64_t inst)
{
        return (writes_reg(inst, QPU_W_SFU_RECIP) ||
                writes_reg(inst, QPU_W_SFU_RECIPSQRT) ||
                writes_reg(inst, QPU_W_SFU_EXP) ||
                writes_reg(inst, QPU_W_SFU_LOG));
}

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions") of the VideoCore IV 3D Architecture Reference
 * Guide.
 */
void
vc4_qpu_validate(uint64_t *insts, uint32_t num_inst)
{
        bool scoreboard_locked = false;
        bool threaded = false;

        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif

        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (sig != QPU_SIG_PROG_END) {
                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        if (sig == QPU_SIG_THREAD_SWITCH ||
                            sig == QPU_SIG_LAST_THREAD_SWITCH) {
                                threaded = true;
                        }

                        continue;
                }

138 /* "The Thread End instruction must not write to either physical
139 * regfile A or B."
140 */
141 if (QPU_GET_FIELD(inst, QPU_WADDR_ADD) < 32 ||
142 QPU_GET_FIELD(inst, QPU_WADDR_MUL) < 32) {
143 fail_instr(inst, "write to phys reg in thread end");
144 }
145
146 /* Can't trigger an implicit wait on scoreboard in the program
147 * end instruction.
148 */
149 if (qpu_inst_is_tlb(inst) && !scoreboard_locked)
150 fail_instr(inst, "implicit sb wait in program end");
151
                /* Two delay slots will be executed, and we index
                 * insts[i + 2] below, so both slots must be present.
                 */
                assert(i + 2 < num_inst);

                for (int j = i; j <= i + 2; j++) {
                        /* "The last three instructions of any program
                         * (Thread End plus the following two delay-slot
                         * instructions) must not do varyings read, uniforms
                         * read or any kind of VPM, VDR, or VDW read or
                         * write."
                         */
                        if (writes_reg(insts[j], QPU_W_VPM) ||
                            reads_reg(insts[j], QPU_R_VARY) ||
                            reads_reg(insts[j], QPU_R_UNIF) ||
                            reads_reg(insts[j], QPU_R_VPM)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "using fixed functions");
                        }

                        /* "The Thread End instruction and the following two
                         * delay slot instructions must not write or read
                         * address 14 in either regfile A or B."
                         */
                        if (writes_reg(insts[j], 14) ||
                            reads_reg(insts[j], 14)) {
                                fail_instr(insts[j], "last 3 instructions "
                                           "must not use r14");
                        }
                }

                /* "The final program instruction (the second delay slot
                 * instruction) must not do a TLB Z write."
                 */
                if (writes_reg(insts[i + 2], QPU_W_TLB_Z)) {
                        fail_instr(insts[i + 2], "final instruction doing "
                                   "Z write");
                }
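
                /* A well-formed program ending therefore looks like
                 * (illustrative):
                 *
                 *     ...   ; thread end signal
                 *     nop   ; delay slot 1
                 *     nop   ; delay slot 2 (in particular, no TLB Z write)
                 */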
        }

190 /* "A scoreboard wait must not occur in the first two instructions of
191 * a fragment shader. This is either the explicit Wait for Scoreboard
192 * signal or an implicit wait with the first tile-buffer read or
193 * write instruction."
194 */
195 for (int i = 0; i < 2; i++) {
196 uint64_t inst = insts[i];
197
198 if (qpu_inst_is_tlb(inst))
199 fail_instr(inst, "sb wait in first two insts");
200 }

        /* "If TMU_NOSWAP is written, the write must be three instructions
         * before the first TMU write instruction. For example, if
         * TMU_NOSWAP is written in the first shader instruction, the first
         * TMU write cannot occur before the 4th shader instruction."
         */
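        /* Illustrative timeline for the check below:
         *
         *     inst 0:   write tmu_noswap     (last_tmu_noswap = 0)
         *     inst 1-3: anything but a TMU write
         *     inst 4:   write tmu0_s         (4 - 0 > 3, so OK)
         */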
        int last_tmu_noswap = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if ((i - last_tmu_noswap) <= 3 &&
                    (writes_reg(inst, QPU_W_TMU0_S) ||
                     writes_reg(inst, QPU_W_TMU1_S))) {
                        fail_instr(inst, "TMU write too soon after TMU_NOSWAP");
                }

                if (writes_reg(inst, QPU_W_TMU_NOSWAP))
                        last_tmu_noswap = i;
        }

221 /* "An instruction must not read from a location in physical regfile A
222 * or B that was written to by the previous instruction."
223 */
224 for (int i = 0; i < num_inst - 1; i++) {
225 uint64_t inst = insts[i];
226 uint32_t add_waddr = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
227 uint32_t mul_waddr = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
228 uint32_t waddr_a, waddr_b;
229
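                /* The WS ("write swap") bit selects which physical regfile
                 * each ALU result lands in: with WS clear the add result
                 * goes to regfile A and the mul result to regfile B; with WS
                 * set the two are exchanged.
                 */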
230 if (inst & QPU_WS) {
231 waddr_b = add_waddr;
232 waddr_a = mul_waddr;
233 } else {
234 waddr_a = add_waddr;
235 waddr_b = mul_waddr;
236 }
237
238 if ((waddr_a < 32 && reads_a_reg(insts[i + 1], waddr_a)) ||
239 (waddr_b < 32 && reads_b_reg(insts[i + 1], waddr_b))) {
240 fail_instr(insts[i + 1],
241 "Reads physical reg too soon after write");
242 }
243 }
244
245 /* "After an SFU lookup instruction, accumulator r4 must not be read
246 * in the following two instructions. Any other instruction that
247 * results in r4 being written (that is, TMU read, TLB read, SFU
248 * lookup) cannot occur in the two instructions following an SFU
249 * lookup."
250 */
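        /* Illustrative: after an SFU write (e.g. to sfu_recip) at inst n,
         * the result appears in r4 two instructions later, so insts n + 1
         * and n + 2 must not cause another r4 write (SFU write, or a
         * TMU/color load signal).  Only that r4-write half of the
         * restriction is checked here.
         */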
        int last_sfu_inst = -10;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                if (i - last_sfu_inst <= 2 &&
                    (writes_sfu(inst) ||
                     sig == QPU_SIG_LOAD_TMU0 ||
                     sig == QPU_SIG_LOAD_TMU1 ||
                     sig == QPU_SIG_COLOR_LOAD)) {
                        fail_instr(inst, "R4 write too soon after SFU write");
                }

                if (writes_sfu(inst))
                        last_sfu_inst = i;
        }

        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM &&
                    QPU_GET_FIELD(inst, QPU_SMALL_IMM) >=
                    QPU_SMALL_IMM_MUL_ROT) {
                        uint32_t mux_a = QPU_GET_FIELD(inst, QPU_MUL_A);
                        uint32_t mux_b = QPU_GET_FIELD(inst, QPU_MUL_B);

                        /* "The full horizontal vector rotate is only
                         * available when both of the mul ALU input arguments
                         * are taken from accumulators r0-r3."
                         */
                        if (mux_a > QPU_MUX_R3 || mux_b > QPU_MUX_R3) {
                                fail_instr(inst,
                                           "MUL rotate using non-accumulator "
                                           "input");
                        }

                        if (QPU_GET_FIELD(inst, QPU_SMALL_IMM) ==
                            QPU_SMALL_IMM_MUL_ROT) {
                                /* "An instruction that does a vector rotate
                                 * by r5 must not immediately follow an
                                 * instruction that writes to r5."
                                 */
                                if (i != 0 &&
                                    writes_reg(insts[i - 1], QPU_W_ACC5)) {
                                        fail_instr(inst,
                                                   "vector rotate by r5 "
                                                   "immediately after r5 write");
                                }
                        }

                        /* "An instruction that does a vector rotate must not
                         * immediately follow an instruction that writes to
                         * the accumulator that is being rotated."
                         */
                        if (i != 0 &&
                            (writes_reg(insts[i - 1], QPU_W_ACC0 + mux_a) ||
                             writes_reg(insts[i - 1], QPU_W_ACC0 + mux_b))) {
                                fail_instr(inst,
                                           "vector rotate of value "
                                           "written in previous instruction");
                        }
                }
        }
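
        /* Illustrative: writing r5 and doing a rotate-by-r5 in the very next
         * instruction is rejected by the checks above, as is rotating r0
         * immediately after an instruction that writes r0.
         */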
320 /* "After an instruction that does a TLB Z write, the multisample mask
321 * must not be read as an instruction input argument in the following
322 * two instruction. The TLB Z write instruction can, however, be
323 * followed immediately by a TLB color write."
324 */
        for (int i = 0; i + 2 < num_inst; i++) {
                uint64_t inst = insts[i];
                if (writes_reg(inst, QPU_W_TLB_Z) &&
                    (reads_a_reg(insts[i + 1], QPU_R_MS_REV_FLAGS) ||
                     reads_a_reg(insts[i + 2], QPU_R_MS_REV_FLAGS))) {
                        fail_instr(inst, "TLB Z write followed by MS mask read");
                }
        }

        /* "A single instruction can only perform a maximum of one of the
         * following closely coupled peripheral accesses in a single
         * instruction: TMU write, TMU read, TLB write, TLB read, TLB
         * combined color read and write, SFU write, Mutex read or Semaphore
         * access."
         */
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];

                if (qpu_num_sf_accesses(inst) > 1)
                        fail_instr(inst, "Single instruction does multiple "
                                   "peripheral accesses");
        }

348 /* "The uniform base pointer can be written (from SIMD element 0) by
349 * the processor to reset the stream, there must be at least two
350 * nonuniform-accessing instructions following a pointer change
351 * before uniforms can be accessed once more."
352 */
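        /* Illustrative timeline for the check below:
         *
         *     inst n:      write uniforms_address  (stream reset)
         *     inst n + 1:  must not read unif
         *     inst n + 2:  must not read unif
         *     inst n + 3:  read unif               (n + 3 - n > 2, so OK)
         */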
        int last_unif_pointer_update = -3;
        for (int i = 0; i < num_inst; i++) {
                uint64_t inst = insts[i];
                uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

                if (reads_reg(inst, QPU_R_UNIF) &&
                    i - last_unif_pointer_update <= 2) {
                        fail_instr(inst,
                                   "uniform read too soon after pointer update");
                }

                if (waddr_add == QPU_W_UNIFORMS_ADDRESS ||
                    waddr_mul == QPU_W_UNIFORMS_ADDRESS)
                        last_unif_pointer_update = i;
        }

        if (threaded) {
                bool last_thrsw_found = false;
                bool scoreboard_locked = false;
                int tex_samples_outstanding = 0;
                int last_tex_samples_outstanding = 0;
                int thrsw_ip = -1;
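
                /* thrsw_ip is the instruction index at which a signaled
                 * thread switch actually takes effect: the THRSW signal has
                 * two delay slots, so the switch happens three instructions
                 * after the signaling instruction.
                 */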

                for (int i = 0; i < num_inst; i++) {
                        uint64_t inst = insts[i];
                        uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

                        if (i == thrsw_ip) {
                                /* In order to get texture results back in the
                                 * correct order, before a new thrsw we have
                                 * to read all the texture results from before
                                 * the previous thrsw.
                                 *
                                 * FIXME: Is collecting the remaining results
                                 * during the delay slots OK, or should we do
                                 * this at THRSW signal time?
                                 */
                                if (last_tex_samples_outstanding != 0) {
                                        fail_instr(inst, "THRSW with texture "
                                                   "results from the previous "
                                                   "THRSW still in the FIFO.");
                                }

                                last_tex_samples_outstanding =
                                        tex_samples_outstanding;
                                tex_samples_outstanding = 0;
                        }

                        if (qpu_inst_is_tlb(inst))
                                scoreboard_locked = true;

                        switch (sig) {
                        case QPU_SIG_THREAD_SWITCH:
                        case QPU_SIG_LAST_THREAD_SWITCH:
                                /* No thread switching with the scoreboard
                                 * locked.  Doing so means we may deadlock
                                 * when the other thread tries to lock the
                                 * scoreboard.
                                 */
                                if (scoreboard_locked) {
                                        fail_instr(inst, "THRSW with the "
                                                   "scoreboard locked.");
                                }

                                /* No thread switching after lthrsw, since
                                 * lthrsw means that we get delayed until the
                                 * other shader is ready for us to terminate.
                                 */
                                if (last_thrsw_found) {
                                        fail_instr(inst, "THRSW after a "
                                                   "previous LTHRSW");
                                }

                                if (sig == QPU_SIG_LAST_THREAD_SWITCH)
                                        last_thrsw_found = true;

                                /* No THRSW while we already have a THRSW
                                 * queued.
                                 */
                                if (i < thrsw_ip) {
                                        fail_instr(inst,
                                                   "THRSW with a THRSW queued.");
                                }

                                thrsw_ip = i + 3;
                                break;

                        case QPU_SIG_LOAD_TMU0:
                        case QPU_SIG_LOAD_TMU1:
                                if (last_tex_samples_outstanding == 0) {
                                        fail_instr(inst, "TMU load with nothing "
                                                   "in the results fifo from "
                                                   "the previous THRSW.");
                                }

                                last_tex_samples_outstanding--;
                                break;
                        }

                        uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
                        uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
                        if (waddr_add == QPU_W_TMU0_S ||
                            waddr_add == QPU_W_TMU1_S ||
                            waddr_mul == QPU_W_TMU0_S ||
                            waddr_mul == QPU_W_TMU1_S) {
                                tex_samples_outstanding++;
                        }
                }
        }
}