intel/compiler: Allow MESA_SHADER_KERNEL
[mesa.git] / src / intel / compiler / brw_ir_performance.cpp
1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28
29 using namespace brw;
30
31 namespace {
   /**
    * Enumeration representing the various asynchronous units that can run
    * computations in parallel on behalf of a shader thread.
    */
   enum unit {
      /** EU front-end. */
      unit_fe,
      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
      unit_fpu,
      /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
      unit_em,
      /** Sampler shared function. */
      unit_sampler,
      /** Pixel Interpolator shared function. */
      unit_pi,
      /** Unified Return Buffer shared function. */
      unit_urb,
      /** Data Port Data Cache shared function. */
      unit_dp_dc,
      /** Data Port Render Cache shared function. */
      unit_dp_rc,
      /** Data Port Constant Cache shared function. */
      unit_dp_cc,
      /** Message Gateway shared function. */
      unit_gateway,
      /** Thread Spawner shared function. */
      unit_spawner,
      /* unit_vme, */
      /* unit_cre, */
      /**
       * Number of asynchronous units currently tracked.  Must remain last
       * among the real units: it sizes the per-unit arrays in struct state.
       */
      num_units,
      /** Dummy unit for instructions that don't consume runtime from the above. */
      unit_null = num_units
   };
66
   /**
    * Enumeration representing a computation result another computation can
    * potentially depend on.
    *
    * Each *0 constant below is the base index of a contiguous range of
    * dependency IDs (one per register, flag, or SBID token), so the order
    * and spacing of the entries is load-bearing: each base is offset from
    * the previous one by the size of the previous range.
    */
   enum dependency_id {
      /* Register part of the GRF. */
      dependency_id_grf0 = 0,
      /* Register part of the MRF.  Only used on Gen4-6. */
      dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
      /* Address register part of the ARF. */
      dependency_id_addr0 = dependency_id_mrf0 + 24,
      /* Accumulator register part of the ARF. */
      dependency_id_accum0 = dependency_id_addr0 + 1,
      /* Flag register part of the ARF. */
      dependency_id_flag0 = dependency_id_accum0 + 12,
      /* SBID token write completion.  Only used on Gen12+. */
      dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
      /* SBID token read completion.  Only used on Gen12+. */
      dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
      /* Number of computation dependencies currently tracked. */
      num_dependency_ids = dependency_id_sbid_rd0 + 16
   };
89
   /**
    * State of our modeling of the program execution.
    */
   struct state {
      /* Value-initialize the timing arrays to zero and start with a unit
       * utilization weight.
       */
      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
      /**
       * Time at which a given unit will be ready to execute the next
       * computation, in clock units.
       */
      unsigned unit_ready[num_units];
      /**
       * Time at which an instruction dependent on a given dependency ID will
       * be ready to execute, in clock units.
       */
      unsigned dep_ready[num_dependency_ids];
      /**
       * Aggregated utilization of a given unit excluding idle cycles,
       * in clock units.
       */
      float unit_busy[num_units];
      /**
       * Factor of the overhead of a computation accounted for in the
       * aggregated utilization calculation.
       */
      float weight;
   };
116
117 /**
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
120 * instructions.
121 */
122 struct instruction_info {
123 instruction_info(const gen_device_info *devinfo, const fs_inst *inst) :
124 devinfo(devinfo), op(inst->opcode),
125 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
126 tx(get_exec_type(inst)), sx(0), ss(0),
127 sc(has_bank_conflict(devinfo, inst) ? sd : 0),
128 desc(inst->desc), sfid(inst->sfid)
129 {
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
132 */
133 if (inst->opcode == SHADER_OPCODE_SEND) {
134 ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
135 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
136 } else {
137 for (unsigned i = 0; i < inst->sources; i++)
138 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
139 }
140
141 /* Convert the execution size to GRF units. */
142 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
143
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
146 */
147 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
148 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
149 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
150 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
151 }
152
153 instruction_info(const gen_device_info *devinfo,
154 const vec4_instruction *inst) :
155 devinfo(devinfo), op(inst->opcode),
156 td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
157 tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
158 desc(inst->desc), sfid(inst->sfid)
159 {
160 /* Compute the maximum source size. */
161 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
162 ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
163
164 /* Convert the execution size to GRF units. */
165 sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
166
167 /* 32x32 integer multiplication has half the usual ALU throughput.
168 * Treat it as double-precision.
169 */
170 if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
171 !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
172 type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
173 tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
174 }
175
176 /** Device information. */
177 const struct gen_device_info *devinfo;
178 /** Instruction opcode. */
179 opcode op;
180 /** Destination type. */
181 brw_reg_type td;
182 /** Destination size in GRF units. */
183 unsigned sd;
184 /** Execution type. */
185 brw_reg_type tx;
186 /** Execution size in GRF units. */
187 unsigned sx;
188 /** Source size. */
189 unsigned ss;
190 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
191 unsigned sc;
192 /** Send message descriptor. */
193 uint32_t desc;
194 /** Send message shared function ID. */
195 uint8_t sfid;
196 };
197
198 /**
199 * Timing information of an instruction used to estimate the performance of
200 * the program.
201 */
202 struct perf_desc {
203 perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
204 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
205
206 /**
207 * Back-end unit its runtime shall be accounted to, in addition to the
208 * EU front-end which is always assumed to be involved.
209 */
210 unit u;
211 /**
212 * Overhead cycles from the time that the EU front-end starts executing
213 * the instruction until it's ready to execute the next instruction.
214 */
215 int df;
216 /**
217 * Overhead cycles from the time that the back-end starts executing the
218 * instruction until it's ready to execute the next instruction.
219 */
220 int db;
221 /**
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its sources have been read from the register file.
224 */
225 int ls;
226 /**
227 * Latency cycles from the time that the back-end starts executing the
228 * instruction until its regular destination has been written to the
229 * register file.
230 */
231 int ld;
232 /**
233 * Latency cycles from the time that the back-end starts executing the
234 * instruction until its accumulator destination has been written to the
235 * ARF file.
236 *
237 * Note that this is an approximation of the real behavior of
238 * accumulating instructions in the hardware: Instead of modeling a pair
239 * of back-to-back accumulating instructions as a first computation with
240 * latency equal to ld followed by another computation with a
241 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242 * model the stall as if it occurred at the top of the pipeline, with
243 * the latency of the accumulator computation offset accordingly.
244 */
245 int la;
246 /**
247 * Latency cycles from the time that the back-end starts executing the
248 * instruction until its flag destination has been written to the ARF
249 * file.
250 */
251 int lf;
252 };
253
254 /**
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
260 */
261 perf_desc
262 calculate_desc(const instruction_info &info, unit u,
263 int df_1, int df_sd, int df_sc,
264 int db_1, int db_sx,
265 int ls_1, int ld_1, int la_1, int lf_1,
266 int l_ss, int l_sd)
267 {
268 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269 db_1 + db_sx * int(info.sx),
270 ls_1 + l_ss * int(info.ss),
271 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272 la_1, lf_1);
273 }
274
275 /**
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
279 *
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
287 * unavailable.
288 */
289 const perf_desc
290 instruction_desc(const instruction_info &info)
291 {
292 const struct gen_device_info *devinfo = info.devinfo;
293
294 switch (info.op) {
295 case BRW_OPCODE_SYNC:
296 case BRW_OPCODE_SEL:
297 case BRW_OPCODE_NOT:
298 case BRW_OPCODE_AND:
299 case BRW_OPCODE_OR:
300 case BRW_OPCODE_XOR:
301 case BRW_OPCODE_SHR:
302 case BRW_OPCODE_SHL:
303 case BRW_OPCODE_DIM:
304 case BRW_OPCODE_ASR:
305 case BRW_OPCODE_CMPN:
306 case BRW_OPCODE_F16TO32:
307 case BRW_OPCODE_BFREV:
308 case BRW_OPCODE_BFI1:
309 case BRW_OPCODE_AVG:
310 case BRW_OPCODE_FRC:
311 case BRW_OPCODE_RNDU:
312 case BRW_OPCODE_RNDD:
313 case BRW_OPCODE_RNDE:
314 case BRW_OPCODE_RNDZ:
315 case BRW_OPCODE_MAC:
316 case BRW_OPCODE_MACH:
317 case BRW_OPCODE_LZD:
318 case BRW_OPCODE_FBH:
319 case BRW_OPCODE_FBL:
320 case BRW_OPCODE_CBIT:
321 case BRW_OPCODE_ADDC:
322 case BRW_OPCODE_ROR:
323 case BRW_OPCODE_ROL:
324 case BRW_OPCODE_SUBB:
325 case BRW_OPCODE_SAD2:
326 case BRW_OPCODE_SADA2:
327 case BRW_OPCODE_LINE:
328 case BRW_OPCODE_NOP:
329 case SHADER_OPCODE_CLUSTER_BROADCAST:
330 case FS_OPCODE_DDX_COARSE:
331 case FS_OPCODE_DDX_FINE:
332 case FS_OPCODE_DDY_COARSE:
333 case FS_OPCODE_PIXEL_X:
334 case FS_OPCODE_PIXEL_Y:
335 case FS_OPCODE_SET_SAMPLE_ID:
336 case VEC4_OPCODE_MOV_BYTES:
337 case VEC4_OPCODE_UNPACK_UNIFORM:
338 case VEC4_OPCODE_DOUBLE_TO_F32:
339 case VEC4_OPCODE_DOUBLE_TO_D32:
340 case VEC4_OPCODE_DOUBLE_TO_U32:
341 case VEC4_OPCODE_TO_DOUBLE:
342 case VEC4_OPCODE_PICK_LOW_32BIT:
343 case VEC4_OPCODE_PICK_HIGH_32BIT:
344 case VEC4_OPCODE_SET_LOW_32BIT:
345 case VEC4_OPCODE_SET_HIGH_32BIT:
346 case GS_OPCODE_SET_DWORD_2:
347 case GS_OPCODE_SET_WRITE_OFFSET:
348 case GS_OPCODE_SET_VERTEX_COUNT:
349 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
350 case GS_OPCODE_SET_CHANNEL_MASKS:
351 case GS_OPCODE_GET_INSTANCE_ID:
352 case GS_OPCODE_SET_PRIMITIVE_ID:
353 case GS_OPCODE_SVB_SET_DST_INDEX:
354 case TCS_OPCODE_SRC0_010_IS_ZERO:
355 case TCS_OPCODE_GET_PRIMITIVE_ID:
356 case TES_OPCODE_GET_PRIMITIVE_ID:
357 if (devinfo->gen >= 11) {
358 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
359 0, 10, 6 /* XXX */, 14, 0, 0);
360 } else if (devinfo->gen >= 8) {
361 if (type_sz(info.tx) > 4)
362 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
363 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
364 else
365 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
366 0, 8, 4, 12, 0, 0);
367 } else if (devinfo->is_haswell) {
368 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
369 0, 10, 6 /* XXX */, 16, 0, 0);
370 } else {
371 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
372 0, 12, 8 /* XXX */, 18, 0, 0);
373 }
374
375 case BRW_OPCODE_MOV:
376 case BRW_OPCODE_CMP:
377 case BRW_OPCODE_ADD:
378 case BRW_OPCODE_MUL:
379 if (devinfo->gen >= 11) {
380 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
381 0, 10, 6, 14, 0, 0);
382 } else if (devinfo->gen >= 8) {
383 if (type_sz(info.tx) > 4)
384 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
385 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
386 else
387 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
388 0, 8, 4, 12, 0, 0);
389 } else if (devinfo->is_haswell) {
390 if (info.tx == BRW_REGISTER_TYPE_F)
391 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
392 0, 12, 8 /* XXX */, 18, 0, 0);
393 else
394 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
395 0, 10, 6 /* XXX */, 16, 0, 0);
396 } else if (devinfo->gen >= 7) {
397 if (info.tx == BRW_REGISTER_TYPE_F)
398 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
399 0, 14, 10 /* XXX */, 20, 0, 0);
400 else
401 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
402 0, 12, 8 /* XXX */, 18, 0, 0);
403 } else {
404 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
405 0, 2 /* XXX */,
406 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
407 0, 0);
408 }
409
410 case BRW_OPCODE_BFE:
411 case BRW_OPCODE_BFI2:
412 case BRW_OPCODE_CSEL:
413 if (devinfo->gen >= 11)
414 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
415 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
416 else if (devinfo->gen >= 8)
417 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
418 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
419 else if (devinfo->is_haswell)
420 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
421 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
422 else if (devinfo->gen >= 7)
423 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
424 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
425 else
426 abort();
427
428 case BRW_OPCODE_MAD:
429 if (devinfo->gen >= 11) {
430 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
431 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
432 } else if (devinfo->gen >= 8) {
433 if (type_sz(info.tx) > 4)
434 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
435 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
436 else
437 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
438 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
439 } else if (devinfo->is_haswell) {
440 if (info.tx == BRW_REGISTER_TYPE_F)
441 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
442 0, 12, 8 /* XXX */, 18, 0, 0);
443 else
444 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
445 0, 10, 6 /* XXX */, 16, 0, 0);
446 } else if (devinfo->gen >= 7) {
447 if (info.tx == BRW_REGISTER_TYPE_F)
448 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
449 0, 14, 10 /* XXX */, 20, 0, 0);
450 else
451 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
452 0, 12, 8 /* XXX */, 18, 0, 0);
453 } else if (devinfo->gen >= 6) {
454 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
455 0, 2 /* XXX */,
456 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
457 0, 0);
458 } else {
459 abort();
460 }
461
462 case BRW_OPCODE_F32TO16:
463 if (devinfo->gen >= 11)
464 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
465 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
466 else if (devinfo->gen >= 8)
467 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
468 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
469 else if (devinfo->is_haswell)
470 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
471 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
472 else if (devinfo->gen >= 7)
473 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
474 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
475 else
476 abort();
477
478 case BRW_OPCODE_DP4:
479 case BRW_OPCODE_DPH:
480 case BRW_OPCODE_DP3:
481 case BRW_OPCODE_DP2:
482 if (devinfo->gen >= 8)
483 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
484 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
485 else if (devinfo->is_haswell)
486 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
487 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
488 else
489 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
490 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
491
492 case SHADER_OPCODE_RCP:
493 case SHADER_OPCODE_RSQ:
494 case SHADER_OPCODE_SQRT:
495 case SHADER_OPCODE_EXP2:
496 case SHADER_OPCODE_LOG2:
497 case SHADER_OPCODE_SIN:
498 case SHADER_OPCODE_COS:
499 case SHADER_OPCODE_POW:
500 case SHADER_OPCODE_INT_QUOTIENT:
501 case SHADER_OPCODE_INT_REMAINDER:
502 if (devinfo->gen >= 6) {
503 switch (info.op) {
504 case SHADER_OPCODE_RCP:
505 case SHADER_OPCODE_RSQ:
506 case SHADER_OPCODE_SQRT:
507 case SHADER_OPCODE_EXP2:
508 case SHADER_OPCODE_LOG2:
509 case SHADER_OPCODE_SIN:
510 case SHADER_OPCODE_COS:
511 if (devinfo->gen >= 8)
512 return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
513 0, 16, 0, 0, 0, 0);
514 else if (devinfo->is_haswell)
515 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
516 0, 12, 0, 0, 0, 0);
517 else
518 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
519 0, 14, 0, 0, 0, 0);
520
521 case SHADER_OPCODE_POW:
522 if (devinfo->gen >= 8)
523 return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
524 0, 24, 0, 0, 0, 0);
525 else if (devinfo->is_haswell)
526 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
527 0, 20, 0, 0, 0, 0);
528 else
529 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
530 0, 22, 0, 0, 0, 0);
531
532 case SHADER_OPCODE_INT_QUOTIENT:
533 case SHADER_OPCODE_INT_REMAINDER:
534 return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
535 0, 28 /* XXX */, 0, 0, 0, 0);
536
537 default:
538 abort();
539 }
540 } else {
541 switch (info.op) {
542 case SHADER_OPCODE_RCP:
543 return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
544 0, 22, 0, 0, 0, 8);
545
546 case SHADER_OPCODE_RSQ:
547 return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
548 0, 44, 0, 0, 0, 8);
549
550 case SHADER_OPCODE_INT_QUOTIENT:
551 case SHADER_OPCODE_SQRT:
552 case SHADER_OPCODE_LOG2:
553 return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
554 0, 66, 0, 0, 0, 8);
555
556 case SHADER_OPCODE_INT_REMAINDER:
557 case SHADER_OPCODE_EXP2:
558 return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
559 0, 88, 0, 0, 0, 8);
560
561 case SHADER_OPCODE_SIN:
562 case SHADER_OPCODE_COS:
563 return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
564 0, 132, 0, 0, 0, 8);
565
566 case SHADER_OPCODE_POW:
567 return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
568 0, 176, 0, 0, 0, 8);
569
570 default:
571 abort();
572 }
573 }
574
575 case BRW_OPCODE_DO:
576 if (devinfo->gen >= 6)
577 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
578 0, 0, 0, 0, 0, 0);
579 else
580 return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
581 0, 0, 0, 0, 0, 0);
582
583 case BRW_OPCODE_IF:
584 case BRW_OPCODE_ELSE:
585 case BRW_OPCODE_ENDIF:
586 case BRW_OPCODE_WHILE:
587 case BRW_OPCODE_BREAK:
588 case BRW_OPCODE_CONTINUE:
589 case FS_OPCODE_DISCARD_JUMP:
590 if (devinfo->gen >= 8)
591 return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
592 0, 0, 0, 0, 0, 0);
593 else if (devinfo->is_haswell)
594 return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
595 0, 0, 0, 0, 0, 0);
596 else
597 return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
598 0, 0, 0, 0, 0, 0);
599
600 case FS_OPCODE_LINTERP:
601 if (devinfo->gen >= 8)
602 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
603 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
604 else if (devinfo->is_haswell)
605 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
606 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
607 else
608 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
609 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
610
611 case BRW_OPCODE_LRP:
612 if (devinfo->gen >= 8)
613 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
614 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
615 else if (devinfo->is_haswell)
616 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
617 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
618 else if (devinfo->gen >= 6)
619 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
620 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
621 else
622 abort();
623
624 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
625 if (devinfo->gen >= 11)
626 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
627 0, 10 /* XXX */, 6 /* XXX */,
628 14 /* XXX */, 0, 0);
629 else if (devinfo->gen >= 8)
630 return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
631 0, 8 /* XXX */, 4 /* XXX */,
632 12 /* XXX */, 0, 0);
633 else if (devinfo->is_haswell)
634 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
635 0, 10 /* XXX */, 6 /* XXX */,
636 16 /* XXX */, 0, 0);
637 else if (devinfo->gen >= 7)
638 return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
639 0, 12 /* XXX */, 8 /* XXX */,
640 18 /* XXX */, 0, 0);
641 else
642 abort();
643
644 case SHADER_OPCODE_MOV_INDIRECT:
645 if (devinfo->gen >= 11)
646 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
647 0, 10 /* XXX */, 6 /* XXX */,
648 14 /* XXX */, 0, 0);
649 else if (devinfo->gen >= 8)
650 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
651 0, 8 /* XXX */, 4 /* XXX */,
652 12 /* XXX */, 0, 0);
653 else if (devinfo->is_haswell)
654 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
655 0, 10 /* XXX */, 6 /* XXX */,
656 16 /* XXX */, 0, 0);
657 else
658 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
659 0, 12 /* XXX */, 8 /* XXX */,
660 18 /* XXX */, 0, 0);
661
662 case SHADER_OPCODE_BROADCAST:
663 if (devinfo->gen >= 11)
664 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
665 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
666 else if (devinfo->gen >= 8)
667 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
668 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
669 else if (devinfo->is_haswell)
670 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
671 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
672 else if (devinfo->gen >= 7)
673 return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
674 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
675 else
676 abort();
677
678 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
679 if (devinfo->gen >= 11)
680 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
681 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
682 else if (devinfo->gen >= 8)
683 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
684 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
685 else if (devinfo->is_haswell)
686 return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
687 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
688 else if (devinfo->gen >= 7)
689 return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
690 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
691 else
692 abort();
693
694 case SHADER_OPCODE_RND_MODE:
695 case SHADER_OPCODE_FLOAT_CONTROL_MODE:
696 if (devinfo->gen >= 11)
697 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
698 4 /* XXX */, 0,
699 0, 0, 0, 0, 0, 0);
700 else if (devinfo->gen >= 8)
701 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
702 4 /* XXX */, 0,
703 0, 0, 0, 0, 0, 0);
704 else if (devinfo->is_haswell)
705 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
706 4 /* XXX */, 0,
707 0, 0, 0, 0, 0, 0);
708 else if (devinfo->gen >= 6)
709 return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
710 4 /* XXX */, 0,
711 0, 0, 0, 0, 0, 0);
712 else
713 abort();
714
715 case SHADER_OPCODE_SHUFFLE:
716 if (devinfo->gen >= 11)
717 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
718 44 /* XXX */, 0,
719 0, 10 /* XXX */, 6 /* XXX */,
720 14 /* XXX */, 0, 0);
721 else if (devinfo->gen >= 8)
722 return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
723 42 /* XXX */, 0,
724 0, 8 /* XXX */, 4 /* XXX */,
725 12 /* XXX */, 0, 0);
726 else if (devinfo->is_haswell)
727 return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
728 0, 44 /* XXX */,
729 0, 10 /* XXX */, 6 /* XXX */,
730 16 /* XXX */, 0, 0);
731 else if (devinfo->gen >= 6)
732 return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
733 0, 46 /* XXX */,
734 0, 12 /* XXX */, 8 /* XXX */,
735 18 /* XXX */, 0, 0);
736 else
737 abort();
738
739 case SHADER_OPCODE_SEL_EXEC:
740 if (devinfo->gen >= 11)
741 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
742 0, 4 /* XXX */,
743 0, 10 /* XXX */, 6 /* XXX */,
744 14 /* XXX */, 0, 0);
745 else if (devinfo->gen >= 8)
746 return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
747 0, 4 /* XXX */,
748 0, 8 /* XXX */, 4 /* XXX */,
749 12 /* XXX */, 0, 0);
750 else if (devinfo->is_haswell)
751 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
752 0, 4 /* XXX */,
753 0, 10 /* XXX */, 6 /* XXX */,
754 16 /* XXX */, 0, 0);
755 else
756 return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
757 0, 4 /* XXX */,
758 0, 12 /* XXX */, 8 /* XXX */,
759 18 /* XXX */, 0, 0);
760
761 case SHADER_OPCODE_QUAD_SWIZZLE:
762 if (devinfo->gen >= 11)
763 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
764 0, 8 /* XXX */,
765 0, 10 /* XXX */, 6 /* XXX */,
766 14 /* XXX */, 0, 0);
767 else if (devinfo->gen >= 8)
768 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
769 0, 8 /* XXX */,
770 0, 8 /* XXX */, 4 /* XXX */,
771 12 /* XXX */, 0, 0);
772 else if (devinfo->is_haswell)
773 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
774 0, 8 /* XXX */,
775 0, 10 /* XXX */, 6 /* XXX */,
776 16 /* XXX */, 0, 0);
777 else
778 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
779 0, 8 /* XXX */,
780 0, 12 /* XXX */, 8 /* XXX */,
781 18 /* XXX */, 0, 0);
782
783 case FS_OPCODE_DDY_FINE:
784 if (devinfo->gen >= 11)
785 return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
786 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
787 else if (devinfo->gen >= 8)
788 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
789 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
790 else if (devinfo->is_haswell)
791 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
792 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
793 else
794 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
795 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
796
797 case FS_OPCODE_LOAD_LIVE_CHANNELS:
798 if (devinfo->gen >= 11)
799 return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
800 2 /* XXX */, 0,
801 0, 0, 0, 10 /* XXX */, 0, 0);
802 else if (devinfo->gen >= 8)
803 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
804 0, 2 /* XXX */,
805 0, 0, 0, 8 /* XXX */, 0, 0);
806 else
807 abort();
808
809 case VEC4_OPCODE_PACK_BYTES:
810 if (devinfo->gen >= 8)
811 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
812 4 /* XXX */, 0,
813 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
814 0, 0);
815 else if (devinfo->is_haswell)
816 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
817 4 /* XXX */, 0,
818 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
819 0, 0);
820 else
821 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
822 4 /* XXX */, 0,
823 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
824 0, 0);
825
826 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
827 if (devinfo->gen >= 8)
828 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
829 4 /* XXX */, 0,
830 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
831 0, 0);
832 else
833 abort();
834
835 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
836 case TCS_OPCODE_GET_INSTANCE_ID:
837 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
838 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
839 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
840 if (devinfo->gen >= 8)
841 return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
842 6 /* XXX */, 0,
843 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
844 0, 0);
845 else if (devinfo->is_haswell)
846 return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
847 6 /* XXX */, 0,
848 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
849 0, 0);
850 else
851 return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
852 6 /* XXX */, 0,
853 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
854 0, 0);
855
856 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
857 case TCS_OPCODE_CREATE_BARRIER_HEADER:
858 if (devinfo->gen >= 8)
859 return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
860 8 /* XXX */, 0,
861 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
862 0, 0);
863 else if (devinfo->is_haswell)
864 return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
865 8 /* XXX */, 0,
866 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
867 0, 0);
868 else if (devinfo->gen >= 6)
869 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
870 8 /* XXX */, 0,
871 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
872 0, 0);
873 else
874 abort();
875
876 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
877 if (devinfo->gen >= 8)
878 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
879 4 /* XXX */, 0,
880 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
881 0, 0);
882 else if (devinfo->is_haswell)
883 return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
884 4 /* XXX */, 0,
885 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
886 0, 0);
887 else if (devinfo->gen >= 7)
888 return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
889 4 /* XXX */, 0,
890 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
891 0, 0);
892 else
893 abort();
894
895 case SHADER_OPCODE_TEX:
896 case FS_OPCODE_TXB:
897 case SHADER_OPCODE_TXD:
898 case SHADER_OPCODE_TXF:
899 case SHADER_OPCODE_TXF_LZ:
900 case SHADER_OPCODE_TXL:
901 case SHADER_OPCODE_TXL_LZ:
902 case SHADER_OPCODE_TXF_CMS:
903 case SHADER_OPCODE_TXF_CMS_W:
904 case SHADER_OPCODE_TXF_UMS:
905 case SHADER_OPCODE_TXF_MCS:
906 case SHADER_OPCODE_TXS:
907 case SHADER_OPCODE_LOD:
908 case SHADER_OPCODE_GET_BUFFER_SIZE:
909 case SHADER_OPCODE_TG4:
910 case SHADER_OPCODE_TG4_OFFSET:
911 case SHADER_OPCODE_SAMPLEINFO:
912 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
913 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
914 8 /* XXX */, 750 /* XXX */, 0, 0,
915 2 /* XXX */, 0);
916
917 case SHADER_OPCODE_URB_READ_SIMD8:
918 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
919 case SHADER_OPCODE_URB_WRITE_SIMD8:
920 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
921 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
922 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
923 case VEC4_OPCODE_URB_READ:
924 case VS_OPCODE_URB_WRITE:
925 case GS_OPCODE_URB_WRITE:
926 case GS_OPCODE_URB_WRITE_ALLOCATE:
927 case GS_OPCODE_THREAD_END:
928 case GS_OPCODE_FF_SYNC:
929 case TCS_OPCODE_URB_WRITE:
930 case TCS_OPCODE_RELEASE_INPUT:
931 case TCS_OPCODE_THREAD_END:
932 return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
933 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
934
935 case SHADER_OPCODE_MEMORY_FENCE:
936 case SHADER_OPCODE_INTERLOCK:
937 switch (info.sfid) {
938 case GEN6_SFID_DATAPORT_RENDER_CACHE:
939 if (devinfo->gen >= 7)
940 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
941 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
942 else
943 abort();
944
945 case GEN7_SFID_DATAPORT_DATA_CACHE:
946 case HSW_SFID_DATAPORT_DATA_CACHE_1:
947 if (devinfo->gen >= 7)
948 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
949 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
950 else
951 abort();
952
953 default:
954 abort();
955 }
956
957 case SHADER_OPCODE_GEN4_SCRATCH_READ:
958 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
959 case SHADER_OPCODE_GEN7_SCRATCH_READ:
960 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
961 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
962
963 case VEC4_OPCODE_UNTYPED_ATOMIC:
964 if (devinfo->gen >= 7)
965 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
966 30 /* XXX */, 400 /* XXX */,
967 10 /* XXX */, 100 /* XXX */, 0, 0,
968 0, 400 /* XXX */);
969 else
970 abort();
971
972 case VEC4_OPCODE_UNTYPED_SURFACE_READ:
973 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
974 if (devinfo->gen >= 7)
975 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
976 0, 20 /* XXX */,
977 10 /* XXX */, 100 /* XXX */, 0, 0,
978 0, 0);
979 else
980 abort();
981
982 case FS_OPCODE_FB_WRITE:
983 case FS_OPCODE_FB_READ:
984 case FS_OPCODE_REP_FB_WRITE:
985 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
986 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
987
988 case GS_OPCODE_SVB_WRITE:
989 if (devinfo->gen >= 6)
990 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
991 0, 450 /* XXX */,
992 10 /* XXX */, 300 /* XXX */, 0, 0,
993 0, 0);
994 else
995 abort();
996
997 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
998 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
999 return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
1000 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1001
1002 case VS_OPCODE_PULL_CONSTANT_LOAD:
1003 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
1004 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1005 8, 750, 0, 0, 2, 0);
1006
1007 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1008 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1009 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1010 if (devinfo->gen >= 7)
1011 return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
1012 0, 90 /* XXX */, 0, 0, 0, 0);
1013 else
1014 abort();
1015
1016 case SHADER_OPCODE_BARRIER:
1017 if (devinfo->gen >= 7)
1018 return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1019 0 /* XXX */, 0,
1020 0, 0, 0, 0, 0, 0);
1021 else
1022 abort();
1023
1024 case CS_OPCODE_CS_TERMINATE:
1025 if (devinfo->gen >= 7)
1026 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1027 10 /* XXX */, 0, 0, 0, 0, 0);
1028 else
1029 abort();
1030
1031 case SHADER_OPCODE_SEND:
1032 switch (info.sfid) {
1033 case GEN6_SFID_DATAPORT_RENDER_CACHE:
1034 if (devinfo->gen >= 7) {
1035 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1036 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
1037 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1038 30 /* XXX */, 450 /* XXX */,
1039 10 /* XXX */, 100 /* XXX */,
1040 0, 0, 0, 400 /* XXX */);
1041 default:
1042 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1043 0, 450 /* XXX */,
1044 10 /* XXX */, 300 /* XXX */, 0, 0,
1045 0, 0);
1046 }
1047 } else if (devinfo->gen >= 6) {
1048 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1049 0, 450 /* XXX */,
1050 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1051 } else {
1052 abort();
1053 }
1054 case BRW_SFID_SAMPLER: {
1055 if (devinfo->gen >= 6)
1056 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1057 8, 750, 0, 0, 2, 0);
1058 else
1059 abort();
1060 }
1061 case GEN7_SFID_DATAPORT_DATA_CACHE:
1062 case HSW_SFID_DATAPORT_DATA_CACHE_1:
1063 if (devinfo->gen >= 8 || devinfo->is_haswell) {
1064 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1065 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1066 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1067 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1068 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1069 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1070 30 /* XXX */, 400 /* XXX */,
1071 10 /* XXX */, 100 /* XXX */, 0, 0,
1072 0, 400 /* XXX */);
1073
1074 default:
1075 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1076 0, 20 /* XXX */,
1077 10 /* XXX */, 100 /* XXX */, 0, 0,
1078 0, 0);
1079 }
1080 } else if (devinfo->gen >= 7) {
1081 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1082 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1083 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1084 30 /* XXX */, 400 /* XXX */,
1085 10 /* XXX */, 100 /* XXX */,
1086 0, 0, 0, 400 /* XXX */);
1087 default:
1088 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1089 0, 20 /* XXX */,
1090 10 /* XXX */, 100 /* XXX */, 0, 0,
1091 0, 0);
1092 }
1093 } else {
1094 abort();
1095 }
1096 default:
1097 abort();
1098 }
1099
1100 case SHADER_OPCODE_UNDEF:
1101 case FS_OPCODE_PLACEHOLDER_HALT:
1102 case FS_OPCODE_SCHEDULING_FENCE:
1103 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1104 0, 0, 0, 0, 0, 0);
1105
1106 default:
1107 abort();
1108 }
1109 }
1110
1111 /**
1112 * Model the performance behavior of a stall on the specified dependency
1113 * ID.
1114 */
1115 void
1116 stall_on_dependency(state &st, dependency_id id)
1117 {
1118 if (id < ARRAY_SIZE(st.dep_ready))
1119 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1120 st.dep_ready[id]);
1121 }
1122
1123 /**
1124 * Model the performance behavior of the front-end and back-end while
1125 * executing an instruction with the specified timing information, assuming
1126 * all dependencies are already clear.
1127 */
1128 void
1129 execute_instruction(state &st, const perf_desc &perf)
1130 {
1131 /* Compute the time at which the front-end will be ready to execute the
1132 * next instruction.
1133 */
1134 st.unit_ready[unit_fe] += perf.df;
1135
1136 if (perf.u < num_units) {
1137 /* Wait for the back-end to be ready to execute this instruction. */
1138 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1139 st.unit_ready[perf.u]);
1140
1141 /* Compute the time at which the back-end will be ready to execute
1142 * the next instruction, and update the back-end utilization.
1143 */
1144 st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1145 st.unit_busy[perf.u] += perf.db * st.weight;
1146 }
1147 }
1148
1149 /**
1150 * Model the performance behavior of a read dependency provided by an
1151 * instruction.
1152 */
1153 void
1154 mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1155 {
1156 if (id < ARRAY_SIZE(st.dep_ready))
1157 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1158 }
1159
1160 /**
1161 * Model the performance behavior of a write dependency provided by an
1162 * instruction.
1163 */
1164 void
1165 mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1166 {
1167 if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1168 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1169 else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1170 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1171 else if (id < ARRAY_SIZE(st.dep_ready))
1172 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1173 }
1174
1175 /**
1176 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1177 */
1178 dependency_id
1179 reg_dependency_id(const gen_device_info *devinfo, const backend_reg &r,
1180 const int delta)
1181 {
1182 if (r.file == VGRF) {
1183 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1184 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1185 return dependency_id(dependency_id_grf0 + i);
1186
1187 } else if (r.file == FIXED_GRF) {
1188 const unsigned i = r.nr + delta;
1189 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1190 return dependency_id(dependency_id_grf0 + i);
1191
1192 } else if (r.file == MRF && devinfo->gen >= 7) {
1193 const unsigned i = GEN7_MRF_HACK_START +
1194 r.nr + r.offset / REG_SIZE + delta;
1195 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1196 return dependency_id(dependency_id_grf0 + i);
1197
1198 } else if (r.file == MRF && devinfo->gen < 7) {
1199 const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1200 r.offset / REG_SIZE + delta;
1201 assert(i < dependency_id_addr0 - dependency_id_mrf0);
1202 return dependency_id(dependency_id_mrf0 + i);
1203
1204 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1205 r.nr < BRW_ARF_ACCUMULATOR) {
1206 assert(delta == 0);
1207 return dependency_id_addr0;
1208
1209 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1210 r.nr < BRW_ARF_FLAG) {
1211 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1212 assert(i < dependency_id_flag0 - dependency_id_accum0);
1213 return dependency_id(dependency_id_accum0 + i);
1214
1215 } else {
1216 return num_dependency_ids;
1217 }
1218 }
1219
1220 /**
1221 * Return the dependency ID of flag register starting at offset \p i.
1222 */
1223 dependency_id
1224 flag_dependency_id(unsigned i)
1225 {
1226 assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
1227 return dependency_id(dependency_id_flag0 + i);
1228 }
1229
1230 /**
1231 * Return the dependency ID corresponding to the SBID read completion
1232 * condition of a Gen12+ SWSB.
1233 */
1234 dependency_id
1235 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1236 {
1237 if (swsb.mode) {
1238 assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1239 return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1240 } else {
1241 return num_dependency_ids;
1242 }
1243 }
1244
1245 /**
1246 * Return the dependency ID corresponding to the SBID write completion
1247 * condition of a Gen12+ SWSB.
1248 */
1249 dependency_id
1250 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1251 {
1252 if (swsb.mode) {
1253 assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1254 return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1255 } else {
1256 return num_dependency_ids;
1257 }
1258 }
1259
1260 /**
1261 * Return the implicit accumulator register accessed by channel \p i of the
1262 * instruction.
1263 */
1264 unsigned
1265 accum_reg_of_channel(const gen_device_info *devinfo,
1266 const backend_instruction *inst,
1267 brw_reg_type tx, unsigned i)
1268 {
1269 assert(inst->reads_accumulator_implicitly() ||
1270 inst->writes_accumulator_implicitly(devinfo));
1271 const unsigned offset = (inst->group + i) * type_sz(tx) *
1272 (devinfo->gen < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1273 return offset / REG_SIZE % 2;
1274 }
1275
1276 /**
1277 * Model the performance behavior of an FS back-end instruction.
1278 */
   void
   issue_fs_inst(state &st, const gen_device_info *devinfo,
                 const backend_instruction *be_inst)
   {
      const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
      /* Look up the timing parameters of this instruction. */
      const instruction_info info(devinfo, inst);
      const perf_desc perf = instruction_desc(info);

      /* Stall on any source dependencies. */
      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->src[i], j));
      }

      if (inst->reads_accumulator_implicitly()) {
         /* Stall on each accumulator register covered by the execution
          * channels of the instruction.
          */
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (is_send(inst) && inst->base_mrf != -1) {
         /* Message payloads passed through the MRF file count as
          * additional sources of the send.
          */
         for (unsigned j = 0; j < inst->mlen; j++)
            stall_on_dependency(
               st, reg_dependency_id(
                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      if (const unsigned mask = inst->flags_read(devinfo)) {
         /* One dependency ID per flag subregister read. */
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }

      /* Stall on any write dependencies (WAW/WAR), unless data dependency
       * checking was explicitly disabled for this instruction.
       */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, inst->dst, j));
         }

         if (inst->writes_accumulator_implicitly(devinfo)) {
            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                           inst->exec_size - 1); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
         }

         if (const unsigned mask = inst->flags_written()) {
            for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
               if (mask & (1 << i))
                  stall_on_dependency(st, flag_dependency_id(i));
            }
         }
      }

      /* Stall on any SBID dependencies (Gen12+ software scoreboard). */
      if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
         stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
      else if (inst->sched.mode & TGL_SBID_SRC)
         stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

      /* Execute the instruction. */
      execute_instruction(st, perf);

      /* Mark any source dependencies.  Only payload sources of a
       * send-from-GRF remain busy after issue, so only those are marked.
       */
      if (inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->is_payload(i)) {
               for (unsigned j = 0; j < regs_read(inst, i); j++)
                  mark_read_dependency(
                     st, perf, reg_dependency_id(devinfo, inst->src[i], j));
            }
         }
      }

      if (is_send(inst) && inst->base_mrf != -1) {
         /* MRF-based message payloads likewise remain busy after issue. */
         for (unsigned j = 0; j < inst->mlen; j++)
            mark_read_dependency(st, perf,
               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      /* Mark any destination dependencies. */
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++) {
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, inst->dst, j));
         }
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written()) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               mark_write_dependency(st, perf, flag_dependency_id(i));
         }
      }

      /* Mark any SBID dependencies: a SET allocates both the read- and
       * write-completion conditions of the SBID.
       */
      if (inst->sched.mode & TGL_SBID_SET) {
         mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
         mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
      }
   }
1395
1396 /**
1397 * Model the performance behavior of a VEC4 back-end instruction.
1398 */
   void
   issue_vec4_instruction(state &st, const gen_device_info *devinfo,
                          const backend_instruction *be_inst)
   {
      const vec4_instruction *inst =
         static_cast<const vec4_instruction *>(be_inst);
      /* Look up the timing parameters of this instruction. */
      const instruction_info info(devinfo, inst);
      const perf_desc perf = instruction_desc(info);

      /* Stall on any source dependencies. */
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->src[i], j));
      }

      if (inst->reads_accumulator_implicitly()) {
         /* Stall on each accumulator register covered by the execution
          * channels of the instruction.
          */
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (inst->base_mrf != -1) {
         /* MRF-based message payloads count as additional sources. */
         for (unsigned j = 0; j < inst->mlen; j++)
            stall_on_dependency(
               st, reg_dependency_id(
                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      /* The vec4 back-end tracks a single flag dependency, unlike the
       * per-subregister masks used for FS instructions.
       */
      if (inst->reads_flag())
         stall_on_dependency(st, dependency_id_flag0);

      /* Stall on any write dependencies (WAW/WAR), unless data dependency
       * checking was explicitly disabled for this instruction.
       */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, inst->dst, j));
         }

         if (inst->writes_accumulator_implicitly(devinfo)) {
            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                           inst->exec_size - 1); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
         }

         if (inst->writes_flag())
            stall_on_dependency(st, dependency_id_flag0);
      }

      /* Execute the instruction. */
      execute_instruction(st, perf);

      /* Mark any source dependencies: the sources of a send-from-GRF
       * remain busy after issue.
       */
      if (inst->is_send_from_grf()) {
         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }

      if (inst->base_mrf != -1) {
         /* MRF-based message payloads likewise remain busy after issue. */
         for (unsigned j = 0; j < inst->mlen; j++)
            mark_read_dependency(st, perf,
               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      /* Mark any destination dependencies. */
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++) {
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, inst->dst, j));
         }
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (inst->writes_flag())
         mark_write_dependency(st, perf, dependency_id_flag0);
   }
1490
1491 /**
1492 * Calculate the maximum possible throughput of the program compatible with
1493 * the cycle-count utilization estimated for each asynchronous unit, in
1494 * threads-per-cycle units.
1495 */
1496 float
1497 calculate_thread_throughput(const state &st, float busy)
1498 {
1499 for (unsigned i = 0; i < num_units; i++)
1500 busy = MAX2(busy, st.unit_busy[i]);
1501
1502 return 1.0 / busy;
1503 }
1504
1505 /**
1506 * Estimate the performance of the specified shader.
1507 */
   void
   calculate_performance(performance &p, const backend_shader *s,
                         void (*issue_instruction)(
                            state &, const gen_device_info *,
                            const backend_instruction *),
                         unsigned dispatch_width)
   {
      /* XXX - Plumbing the trip counts from NIR loop analysis would allow us
       *       to do a better job regarding the loop weights.  And some branch
       *       divergence analysis would allow us to do a better job with
       *       branching weights.
       *
       *       In the meantime use values that roughly match the control flow
       *       weights used elsewhere in the compiler back-end -- Main
       *       difference is the worst-case scenario branch_weight used for
       *       SIMD32 which accounts for the possibility of a dynamically
       *       uniform branch becoming divergent in SIMD32.
       *
       *       Note that we provide slightly more pessimistic weights on
       *       Gen12+ for SIMD32, since the effective warp size on that
       *       platform is 2x the SIMD width due to EU fusion, which increases
       *       the likelihood of divergent control flow in comparison to
       *       previous generations, giving narrower SIMD modes a performance
       *       advantage in several test-cases with non-uniform discard jumps.
       */
      const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
      const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
                                    1.0 : 0.5);
      const float loop_weight = 10;
      /* Number of FS_OPCODE_DISCARD_JUMP instructions seen so far. */
      unsigned discard_count = 0;
      /* Weighted front-end cycle count accumulated across the program. */
      unsigned elapsed = 0;
      state st;

      foreach_block(block, s->cfg) {
         const unsigned elapsed0 = elapsed;

         foreach_inst_in_block(backend_instruction, inst, block) {
            /* Front-end clock before issuing this instruction, used below
             * to compute its weighted contribution.
             */
            const unsigned clock0 = st.unit_ready[unit_fe];

            issue_instruction(st, s->devinfo, inst);

            /* Control-flow closers are undone *before* accounting their
             * cycles so the instruction itself is counted at the outer
             * weight.
             */
            if (inst->opcode == BRW_OPCODE_ENDIF)
               st.weight /= branch_weight;
            else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
               st.weight /= discard_weight;

            elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;

            /* Control-flow openers take effect *after* accounting, scaling
             * the weight of the instructions they enclose.  Only the first
             * discard jump applies the discard weight (see discard_count).
             */
            if (inst->opcode == BRW_OPCODE_IF)
               st.weight *= branch_weight;
            else if (inst->opcode == BRW_OPCODE_DO)
               st.weight *= loop_weight;
            else if (inst->opcode == BRW_OPCODE_WHILE)
               st.weight /= loop_weight;
            else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
               st.weight *= discard_weight;
         }

         p.block_latency[block->num] = elapsed - elapsed0;
      }

      p.latency = elapsed;
      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
   }
1572 }
1573
/* Estimate the performance of a scalar (FS back-end) shader, using its
 * native dispatch width.
 */
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1579
/* Estimate the performance of a vec4 back-end shader.  Vec4 shaders are
 * modeled with a fixed dispatch width of 8.
 */
brw::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
1585
brw::performance::~performance()
{
   /* Release the per-basic-block latency array allocated by the
    * constructors.
    */
   delete[] block_latency;
}