broadcom/vc5: Use THRSW to enable multi-threaded shaders.
[mesa.git] / src / broadcom / compiler / qpu_validate.c
/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * @file
 *
 * Validates the QPU instruction sequence after register allocation and
 * scheduling.
 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include "v3d_compiler.h"
#include "qpu/qpu_disasm.h"

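/* Per-program validation state: the previous instruction, the current
 * instruction index, and the IPs of the most recent SFU write, branch, and
 * thread switch, plus THRSW bookkeeping for the end-of-program checks.
 */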
struct v3d_qpu_validate_state {
        struct v3d_compile *c;
        const struct v3d_qpu_instr *last;
        int ip;
        int last_sfu_write;
        int last_branch_ip;
        int last_thrsw_ip;
        bool last_thrsw_found;
        int thrsw_count;
};

static void
fail_instr(struct v3d_qpu_validate_state *state, const char *msg)
{
        struct v3d_compile *c = state->c;

        fprintf(stderr, "v3d_qpu_validate at ip %d: %s:\n", state->ip, msg);

        int dump_ip = 0;
        vir_for_each_inst_inorder(inst, c) {
                v3d_qpu_dump(c->devinfo, &inst->qpu);

                if (dump_ip++ == state->ip)
                        fprintf(stderr, " *** ERROR ***");

                fprintf(stderr, "\n");
        }

        fprintf(stderr, "\n");
        abort();
}

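/* Branches and thread switches on V3D are followed by three delay slots;
 * these helpers say whether the current instruction falls inside them.
 */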
static bool
in_branch_delay_slots(struct v3d_qpu_validate_state *state)
{
        return (state->ip - state->last_branch_ip) < 3;
}

static bool
in_thrsw_delay_slots(struct v3d_qpu_validate_state *state)
{
        return (state->ip - state->last_thrsw_ip) < 3;
}

static bool
qpu_magic_waddr_matches(const struct v3d_qpu_instr *inst,
                        bool (*predicate)(enum v3d_qpu_waddr waddr))
{
        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return false;

        if (inst->alu.add.op != V3D_QPU_A_NOP &&
            inst->alu.add.magic_write &&
            predicate(inst->alu.add.waddr))
                return true;

        if (inst->alu.mul.op != V3D_QPU_M_NOP &&
            inst->alu.mul.magic_write &&
            predicate(inst->alu.mul.waddr))
                return true;

        return false;
}

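/* Checks a single instruction against the per-instruction restrictions:
 * signal conflicts, peripheral access limits, SFU result latency, and
 * THRSW/branch delay-slot rules.
 */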
static void
qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
{
        const struct v3d_device_info *devinfo = state->c->devinfo;
        const struct v3d_qpu_instr *inst = &qinst->qpu;

        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
                return;

        /* LDVARY writes r5 two instructions later and LDUNIF writes
         * r5 one instruction later, which is illegal to have
         * together.
         */
        if (state->last && state->last->sig.ldvary &&
            (inst->sig.ldunif || inst->sig.ldunifa)) {
                fail_instr(state, "LDUNIF after a LDVARY");
        }

        int tmu_writes = 0;
        int sfu_writes = 0;
        int vpm_writes = 0;
        int tlb_writes = 0;
        int tsy_writes = 0;

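        /* Count the peripheral (TMU, SFU, VPM, TLB, TSY) magic-register
         * writes from the add and mul ALU ops, so the combined access rules
         * below can be checked.
         */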
        if (inst->alu.add.op != V3D_QPU_A_NOP) {
                if (inst->alu.add.magic_write) {
                        if (v3d_qpu_magic_waddr_is_tmu(inst->alu.add.waddr))
                                tmu_writes++;
                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.add.waddr))
                                sfu_writes++;
                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.add.waddr))
                                vpm_writes++;
                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.add.waddr))
                                tlb_writes++;
                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.add.waddr))
                                tsy_writes++;
                }
        }

        if (inst->alu.mul.op != V3D_QPU_M_NOP) {
                if (inst->alu.mul.magic_write) {
                        if (v3d_qpu_magic_waddr_is_tmu(inst->alu.mul.waddr))
                                tmu_writes++;
                        if (v3d_qpu_magic_waddr_is_sfu(inst->alu.mul.waddr))
                                sfu_writes++;
                        if (v3d_qpu_magic_waddr_is_vpm(inst->alu.mul.waddr))
                                vpm_writes++;
                        if (v3d_qpu_magic_waddr_is_tlb(inst->alu.mul.waddr))
                                tlb_writes++;
                        if (v3d_qpu_magic_waddr_is_tsy(inst->alu.mul.waddr))
                                tsy_writes++;
                }
        }

        if (in_thrsw_delay_slots(state)) {
                /* There's no way you want to start SFU during the THRSW delay
                 * slots, since the result would land in the other thread.
                 */
                if (sfu_writes) {
                        fail_instr(state,
                                   "SFU write started during THRSW delay slots");
                }

                if (inst->sig.ldvary)
                        fail_instr(state, "LDVARY during THRSW delay slots");
        }

        (void)qpu_magic_waddr_matches; /* XXX */

        /* SFU r4 results come back two instructions later. Don't do r4
         * reads/writes or other SFU lookups until it's done.
         */
        if (state->ip - state->last_sfu_write < 2) {
                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_R4))
                        fail_instr(state, "R4 read too soon after SFU");

                if (v3d_qpu_writes_r4(devinfo, inst))
                        fail_instr(state, "R4 write too soon after SFU");

                if (sfu_writes)
                        fail_instr(state, "SFU write too soon after SFU");
        }

        /* XXX: The docs say VPM can happen with the others, but the simulator
         * disagrees.
         */
        if (tmu_writes +
            sfu_writes +
            vpm_writes +
            tlb_writes +
            tsy_writes +
            inst->sig.ldtmu +
            inst->sig.ldtlb +
            inst->sig.ldvpm +
            inst->sig.ldtlbu > 1) {
                fail_instr(state,
                           "Only one of [TMU, SFU, TSY, TLB read, VPM] allowed");
        }

        if (sfu_writes)
                state->last_sfu_write = state->ip;

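        /* A THRSW can't sit in branch delay slots. A THRSW immediately
         * following another THRSW carries the last-THRSW signal; otherwise
         * it's a new thread switch and must be outside the previous THRSW's
         * delay slots.
         */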
        if (inst->sig.thrsw) {
                if (in_branch_delay_slots(state))
                        fail_instr(state, "THRSW in a branch delay slot.");

                if (state->last_thrsw_ip == state->ip - 1) {
                        /* If it's the second THRSW in a row, then it's just a
                         * last-thrsw signal.
                         */
                        if (state->last_thrsw_found)
                                fail_instr(state, "Two last-THRSW signals");
                        state->last_thrsw_found = true;
                } else {
                        if (in_thrsw_delay_slots(state)) {
                                fail_instr(state,
                                           "THRSW too close to another THRSW.");
                        }
                        state->thrsw_count++;
                        state->last_thrsw_ip = state->ip;
                }
        }

        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
                if (in_branch_delay_slots(state))
                        fail_instr(state, "branch in a branch delay slot.");
                if (in_thrsw_delay_slots(state))
                        fail_instr(state, "branch in a THRSW delay slot.");
                state->last_branch_ip = state->ip;
        }
}

static void
qpu_validate_block(struct v3d_qpu_validate_state *state, struct qblock *block)
{
        vir_for_each_inst(qinst, block) {
                qpu_validate_inst(state, qinst);

                state->last = &qinst->qpu;
                state->ip++;
        }
}

/**
 * Checks for the instruction restrictions from page 37 ("Summary of
 * Instruction Restrictions").
 */
void
qpu_validate(struct v3d_compile *c)
{
        /* We don't want to do validation in release builds, but we want to
         * keep compiling the validation code to make sure it doesn't get
         * broken.
         */
#ifndef DEBUG
        return;
#endif

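        /* Start the "last seen" IPs far enough before the first instruction
         * that the distance-based checks don't fire at the top of the
         * program.
         */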
        struct v3d_qpu_validate_state state = {
                .c = c,
                .last_sfu_write = -10,
                .last_thrsw_ip = -10,
                .last_branch_ip = -10,
                .ip = 0,
        };

        vir_for_each_block(block, c) {
                qpu_validate_block(&state, block);
        }

        if (state.thrsw_count > 1 && !state.last_thrsw_found) {
                fail_instr(&state,
                           "thread switch found without last-THRSW in program");
        }

        if (state.thrsw_count == 0 ||
            (state.last_thrsw_found && state.thrsw_count == 1)) {
                fail_instr(&state, "No program-end THRSW found");
        }
}