intel/ir: Import shader performance analysis pass.
[mesa.git] / src / intel / compiler / brw_ir_performance.cpp
1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28
29 using namespace brw;
30
31 namespace {
/**
 * Enumeration representing the various asynchronous units that can run
 * computations in parallel on behalf of a shader thread.  Used to index
 * the per-unit ready-time and utilization arrays of struct state.
 */
enum unit {
   /** EU front-end. */
   unit_fe,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   unit_fpu,
   /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
   unit_em,
   /** Sampler shared function. */
   unit_sampler,
   /** Pixel Interpolator shared function. */
   unit_pi,
   /** Unified Return Buffer shared function. */
   unit_urb,
   /** Data Port Data Cache shared function. */
   unit_dp_dc,
   /** Data Port Render Cache shared function. */
   unit_dp_rc,
   /** Data Port Constant Cache shared function. */
   unit_dp_cc,
   /** Message Gateway shared function. */
   unit_gateway,
   /** Thread Spawner shared function. */
   unit_spawner,
   /* unit_vme, */
   /* unit_cre, */
   /** Number of asynchronous units currently tracked. */
   num_units,
   /** Dummy unit for instructions that don't consume runtime from the above.
    *  Aliases num_units, so it must never be used to index the tracking
    *  arrays, which are sized num_units.
    */
   unit_null = num_units
};
66
/**
 * Enumeration representing a computation result another computation can
 * potentially depend on.  Each enumerator is the base of a contiguous
 * range of IDs, one per register or token of that kind, which together
 * form a flat index space of size num_dependency_ids.
 */
enum dependency_id {
   /* Register part of the GRF. */
   dependency_id_grf0 = 0,
   /* Register part of the MRF.  Only used on Gen4-6.  24 MRFs tracked. */
   dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
   /* Address register part of the ARF.  A single register is tracked. */
   dependency_id_addr0 = dependency_id_mrf0 + 24,
   /* Accumulator register part of the ARF.  12 slots tracked. */
   dependency_id_accum0 = dependency_id_addr0 + 1,
   /* Flag register part of the ARF.  8 subregister slots tracked. */
   dependency_id_flag0 = dependency_id_accum0 + 12,
   /* SBID token write completion.  Only used on Gen12+.  16 tokens. */
   dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
   /* SBID token read completion.  Only used on Gen12+.  16 tokens. */
   dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
   /* Number of computation dependencies currently tracked. */
   num_dependency_ids = dependency_id_sbid_rd0 + 16
};
89
90 /**
91 * State of our modeling of the program execution.
92 */
93 struct state {
94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95 /**
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
98 */
99 unsigned unit_ready[num_units];
100 /**
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
103 */
104 unsigned dep_ready[num_dependency_ids];
105 /**
106 * Aggregated utilization of a given unit excluding idle cycles,
107 * in clock units.
108 */
109 float unit_busy[num_units];
110 /**
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
113 */
114 float weight;
115 };
116
/**
 * Information derived from an IR instruction used to compute performance
 * estimates.  Allows the timing calculation to work on both FS and VEC4
 * instructions.
 */
struct instruction_info {
   /** Initialize from a scalar (FS) back-end instruction. */
   instruction_info(const gen_device_info *devinfo, const fs_inst *inst) :
      devinfo(devinfo), op(inst->opcode),
      td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
      tx(get_exec_type(inst)), sx(0), ss(0),
      sc(has_bank_conflict(devinfo, inst) ? sd : 0),
      desc(inst->desc), sfid(inst->sfid)
   {
      /* We typically want the maximum source size, except for split send
       * messages which require the total size.
       */
      if (inst->opcode == SHADER_OPCODE_SEND) {
         /* Sources 2 and 3 are the two payload halves of a split send. */
         ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
              DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
      } else {
         for (unsigned i = 0; i < inst->sources; i++)
            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
      }

      /* Convert the execution size to GRF units. */
      sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

      /* 32x32 integer multiplication has half the usual ALU throughput.
       * Treat it as double-precision.
       */
      if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
          !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
          type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
         tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
   }

   /** Initialize from a vec4 back-end instruction.  Unlike the FS variant,
    *  bank conflicts are not modeled here (sc is always 0).
    */
   instruction_info(const gen_device_info *devinfo,
                    const vec4_instruction *inst) :
      devinfo(devinfo), op(inst->opcode),
      td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
      tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
      desc(inst->desc), sfid(inst->sfid)
   {
      /* Compute the maximum source size. */
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
         ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

      /* Convert the execution size to GRF units. */
      sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

      /* 32x32 integer multiplication has half the usual ALU throughput.
       * Treat it as double-precision.
       */
      if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
          !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
          type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
         tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
   }

   /** Device information. */
   const struct gen_device_info *devinfo;
   /** Instruction opcode. */
   opcode op;
   /** Destination type. */
   brw_reg_type td;
   /** Destination size in GRF units. */
   unsigned sd;
   /** Execution type (possibly widened to 8B for 32x32 integer multiply,
    *  see the constructors).
    */
   brw_reg_type tx;
   /** Execution size in GRF units. */
   unsigned sx;
   /** Source size in GRF units: maximum across sources, or the payload
    *  total for split send messages.
    */
   unsigned ss;
   /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
   unsigned sc;
   /** Send message descriptor. */
   uint32_t desc;
   /** Send message shared function ID. */
   uint8_t sfid;
};
197
/**
 * Timing information of an instruction used to estimate the performance of
 * the program.  All quantities are in clock cycles; see calculate_desc()
 * for how they are derived from the IR.
 */
struct perf_desc {
   perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
      u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}

   /**
    * Back-end unit its runtime shall be accounted to, in addition to the
    * EU front-end which is always assumed to be involved.
    */
   unit u;
   /**
    * Overhead cycles from the time that the EU front-end starts executing
    * the instruction until it's ready to execute the next instruction.
    */
   int df;
   /**
    * Overhead cycles from the time that the back-end starts executing the
    * instruction until it's ready to execute the next instruction.
    */
   int db;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its sources have been read from the register file.
    */
   int ls;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its regular destination has been written to the
    * register file.
    */
   int ld;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its accumulator destination has been written to the
    * ARF file.
    *
    * Note that this is an approximation of the real behavior of
    * accumulating instructions in the hardware: Instead of modeling a pair
    * of back-to-back accumulating instructions as a first computation with
    * latency equal to ld followed by another computation with a
    * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
    * model the stall as if it occurred at the top of the pipeline, with
    * the latency of the accumulator computation offset accordingly.
    */
   int la;
   /**
    * Latency cycles from the time that the back-end starts executing the
    * instruction until its flag destination has been written to the ARF
    * file.
    */
   int lf;
};
253
254 /**
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
260 */
261 perf_desc
262 calculate_desc(const instruction_info &info, unit u,
263 int df_1, int df_sd, int df_sc,
264 int db_1, int db_sx,
265 int ls_1, int ld_1, int la_1, int lf_1,
266 int l_ss, int l_sd)
267 {
268 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269 db_1 + db_sx * int(info.sx),
270 ls_1 + l_ss * int(info.ss),
271 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272 la_1, lf_1);
273 }
274
275 /**
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
279 *
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
287 * unavailable.
288 */
289 const perf_desc
290 instruction_desc(const instruction_info &info)
291 {
292 const struct gen_device_info *devinfo = info.devinfo;
293
294 switch (info.op) {
295 case BRW_OPCODE_SYNC:
296 case BRW_OPCODE_SEL:
297 case BRW_OPCODE_NOT:
298 case BRW_OPCODE_AND:
299 case BRW_OPCODE_OR:
300 case BRW_OPCODE_XOR:
301 case BRW_OPCODE_SHR:
302 case BRW_OPCODE_SHL:
303 case BRW_OPCODE_DIM:
304 case BRW_OPCODE_ASR:
305 case BRW_OPCODE_CMPN:
306 case BRW_OPCODE_F16TO32:
307 case BRW_OPCODE_BFREV:
308 case BRW_OPCODE_BFI1:
309 case BRW_OPCODE_AVG:
310 case BRW_OPCODE_FRC:
311 case BRW_OPCODE_RNDU:
312 case BRW_OPCODE_RNDD:
313 case BRW_OPCODE_RNDE:
314 case BRW_OPCODE_RNDZ:
315 case BRW_OPCODE_MAC:
316 case BRW_OPCODE_MACH:
317 case BRW_OPCODE_LZD:
318 case BRW_OPCODE_FBH:
319 case BRW_OPCODE_FBL:
320 case BRW_OPCODE_CBIT:
321 case BRW_OPCODE_ADDC:
322 case BRW_OPCODE_ROR:
323 case BRW_OPCODE_ROL:
324 case BRW_OPCODE_SUBB:
325 case BRW_OPCODE_SAD2:
326 case BRW_OPCODE_SADA2:
327 case BRW_OPCODE_LINE:
328 case BRW_OPCODE_NOP:
329 case SHADER_OPCODE_CLUSTER_BROADCAST:
330 case FS_OPCODE_DDX_COARSE:
331 case FS_OPCODE_DDX_FINE:
332 case FS_OPCODE_DDY_COARSE:
333 case FS_OPCODE_PIXEL_X:
334 case FS_OPCODE_PIXEL_Y:
335 case FS_OPCODE_SET_SAMPLE_ID:
336 case VEC4_OPCODE_MOV_BYTES:
337 case VEC4_OPCODE_UNPACK_UNIFORM:
338 case VEC4_OPCODE_DOUBLE_TO_F32:
339 case VEC4_OPCODE_DOUBLE_TO_D32:
340 case VEC4_OPCODE_DOUBLE_TO_U32:
341 case VEC4_OPCODE_TO_DOUBLE:
342 case VEC4_OPCODE_PICK_LOW_32BIT:
343 case VEC4_OPCODE_PICK_HIGH_32BIT:
344 case VEC4_OPCODE_SET_LOW_32BIT:
345 case VEC4_OPCODE_SET_HIGH_32BIT:
346 case GS_OPCODE_SET_DWORD_2:
347 case GS_OPCODE_SET_WRITE_OFFSET:
348 case GS_OPCODE_SET_VERTEX_COUNT:
349 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
350 case GS_OPCODE_SET_CHANNEL_MASKS:
351 case GS_OPCODE_GET_INSTANCE_ID:
352 case GS_OPCODE_SET_PRIMITIVE_ID:
353 case GS_OPCODE_SVB_SET_DST_INDEX:
354 case TCS_OPCODE_SRC0_010_IS_ZERO:
355 case TCS_OPCODE_GET_PRIMITIVE_ID:
356 case TES_OPCODE_GET_PRIMITIVE_ID:
357 if (devinfo->gen >= 11) {
358 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
359 0, 10, 6 /* XXX */, 14, 0, 0);
360 } else if (devinfo->gen >= 8) {
361 if (type_sz(info.tx) > 4)
362 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
363 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
364 else
365 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
366 0, 8, 4, 12, 0, 0);
367 } else if (devinfo->is_haswell) {
368 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
369 0, 10, 6 /* XXX */, 16, 0, 0);
370 } else {
371 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
372 0, 12, 8 /* XXX */, 18, 0, 0);
373 }
374
375 case BRW_OPCODE_MOV:
376 case BRW_OPCODE_CMP:
377 case BRW_OPCODE_ADD:
378 case BRW_OPCODE_MUL:
379 if (devinfo->gen >= 11) {
380 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
381 0, 10, 6, 14, 0, 0);
382 } else if (devinfo->gen >= 8) {
383 if (type_sz(info.tx) > 4)
384 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
385 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
386 else
387 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
388 0, 8, 4, 12, 0, 0);
389 } else if (devinfo->is_haswell) {
390 if (info.tx == BRW_REGISTER_TYPE_F)
391 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
392 0, 12, 8 /* XXX */, 18, 0, 0);
393 else
394 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
395 0, 10, 6 /* XXX */, 16, 0, 0);
396 } else if (devinfo->gen >= 7) {
397 if (info.tx == BRW_REGISTER_TYPE_F)
398 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
399 0, 14, 10 /* XXX */, 20, 0, 0);
400 else
401 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
402 0, 12, 8 /* XXX */, 18, 0, 0);
403 } else {
404 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
405 0, 2 /* XXX */,
406 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
407 0, 0);
408 }
409
410 case BRW_OPCODE_BFE:
411 case BRW_OPCODE_BFI2:
412 case BRW_OPCODE_CSEL:
413 if (devinfo->gen >= 11)
414 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
415 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
416 else if (devinfo->gen >= 8)
417 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
418 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
419 else if (devinfo->is_haswell)
420 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
421 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
422 else if (devinfo->gen >= 7)
423 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
424 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
425 else
426 abort();
427
428 case BRW_OPCODE_MAD:
429 if (devinfo->gen >= 11) {
430 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
431 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
432 } else if (devinfo->gen >= 8) {
433 if (type_sz(info.tx) > 4)
434 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
435 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
436 else
437 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
438 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
439 } else if (devinfo->is_haswell) {
440 if (info.tx == BRW_REGISTER_TYPE_F)
441 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
442 0, 12, 8 /* XXX */, 18, 0, 0);
443 else
444 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
445 0, 10, 6 /* XXX */, 16, 0, 0);
446 } else if (devinfo->gen >= 7) {
447 if (info.tx == BRW_REGISTER_TYPE_F)
448 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
449 0, 14, 10 /* XXX */, 20, 0, 0);
450 else
451 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
452 0, 12, 8 /* XXX */, 18, 0, 0);
453 } else if (devinfo->gen >= 6) {
454 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
455 0, 2 /* XXX */,
456 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
457 0, 0);
458 } else {
459 abort();
460 }
461
462 case BRW_OPCODE_F32TO16:
463 if (devinfo->gen >= 11)
464 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
465 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
466 else if (devinfo->gen >= 8)
467 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
468 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
469 else if (devinfo->is_haswell)
470 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
471 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
472 else if (devinfo->gen >= 7)
473 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
474 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
475 else
476 abort();
477
478 case BRW_OPCODE_DP4:
479 case BRW_OPCODE_DPH:
480 case BRW_OPCODE_DP3:
481 case BRW_OPCODE_DP2:
482 if (devinfo->gen >= 8)
483 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
484 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
485 else if (devinfo->is_haswell)
486 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
487 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
488 else
489 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
490 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
491
492 case SHADER_OPCODE_RCP:
493 case SHADER_OPCODE_RSQ:
494 case SHADER_OPCODE_SQRT:
495 case SHADER_OPCODE_EXP2:
496 case SHADER_OPCODE_LOG2:
497 case SHADER_OPCODE_SIN:
498 case SHADER_OPCODE_COS:
499 case SHADER_OPCODE_POW:
500 case SHADER_OPCODE_INT_QUOTIENT:
501 case SHADER_OPCODE_INT_REMAINDER:
502 if (devinfo->gen >= 6) {
503 switch (info.op) {
504 case SHADER_OPCODE_RCP:
505 case SHADER_OPCODE_RSQ:
506 case SHADER_OPCODE_SQRT:
507 case SHADER_OPCODE_EXP2:
508 case SHADER_OPCODE_LOG2:
509 case SHADER_OPCODE_SIN:
510 case SHADER_OPCODE_COS:
511 if (devinfo->gen >= 8)
512 return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
513 0, 16, 0, 0, 0, 0);
514 else if (devinfo->is_haswell)
515 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
516 0, 12, 0, 0, 0, 0);
517 else
518 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
519 0, 14, 0, 0, 0, 0);
520
521 case SHADER_OPCODE_POW:
522 if (devinfo->gen >= 8)
523 return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
524 0, 24, 0, 0, 0, 0);
525 else if (devinfo->is_haswell)
526 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
527 0, 20, 0, 0, 0, 0);
528 else
529 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
530 0, 22, 0, 0, 0, 0);
531
532 case SHADER_OPCODE_INT_QUOTIENT:
533 case SHADER_OPCODE_INT_REMAINDER:
534 return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
535 0, 28 /* XXX */, 0, 0, 0, 0);
536
537 default:
538 abort();
539 }
540 } else {
541 switch (info.op) {
542 case SHADER_OPCODE_RCP:
543 return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
544 0, 22, 0, 0, 0, 8);
545
546 case SHADER_OPCODE_RSQ:
547 return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
548 0, 44, 0, 0, 0, 8);
549
550 case SHADER_OPCODE_INT_QUOTIENT:
551 case SHADER_OPCODE_SQRT:
552 case SHADER_OPCODE_LOG2:
553 return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
554 0, 66, 0, 0, 0, 8);
555
556 case SHADER_OPCODE_INT_REMAINDER:
557 case SHADER_OPCODE_EXP2:
558 return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
559 0, 88, 0, 0, 0, 8);
560
561 case SHADER_OPCODE_SIN:
562 case SHADER_OPCODE_COS:
563 return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
564 0, 132, 0, 0, 0, 8);
565
566 case SHADER_OPCODE_POW:
567 return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
568 0, 176, 0, 0, 0, 8);
569
570 default:
571 abort();
572 }
573 }
574
575 case BRW_OPCODE_DO:
576 if (devinfo->gen >= 6)
577 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
578 0, 0, 0, 0, 0, 0);
579 else
580 return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
581 0, 0, 0, 0, 0, 0);
582
583 case BRW_OPCODE_IF:
584 case BRW_OPCODE_ELSE:
585 case BRW_OPCODE_ENDIF:
586 case BRW_OPCODE_WHILE:
587 case BRW_OPCODE_BREAK:
588 case BRW_OPCODE_CONTINUE:
589 case FS_OPCODE_DISCARD_JUMP:
590 if (devinfo->gen >= 8)
591 return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
592 0, 0, 0, 0, 0, 0);
593 else if (devinfo->is_haswell)
594 return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
595 0, 0, 0, 0, 0, 0);
596 else
597 return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
598 0, 0, 0, 0, 0, 0);
599
600 case FS_OPCODE_LINTERP:
601 if (devinfo->gen >= 8)
602 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
603 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
604 else if (devinfo->is_haswell)
605 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
606 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
607 else
608 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
609 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
610
611 case BRW_OPCODE_LRP:
612 if (devinfo->gen >= 8)
613 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
614 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
615 else if (devinfo->is_haswell)
616 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
617 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
618 else if (devinfo->gen >= 6)
619 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
620 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
621 else
622 abort();
623
624 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
625 if (devinfo->gen >= 11)
626 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
627 0, 10 /* XXX */, 6 /* XXX */,
628 14 /* XXX */, 0, 0);
629 else if (devinfo->gen >= 8)
630 return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
631 0, 8 /* XXX */, 4 /* XXX */,
632 12 /* XXX */, 0, 0);
633 else if (devinfo->is_haswell)
634 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
635 0, 10 /* XXX */, 6 /* XXX */,
636 16 /* XXX */, 0, 0);
637 else if (devinfo->gen >= 7)
638 return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
639 0, 12 /* XXX */, 8 /* XXX */,
640 18 /* XXX */, 0, 0);
641 else
642 abort();
643
644 case SHADER_OPCODE_MOV_INDIRECT:
645 if (devinfo->gen >= 11)
646 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
647 0, 10 /* XXX */, 6 /* XXX */,
648 14 /* XXX */, 0, 0);
649 else if (devinfo->gen >= 8)
650 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
651 0, 8 /* XXX */, 4 /* XXX */,
652 12 /* XXX */, 0, 0);
653 else if (devinfo->is_haswell)
654 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
655 0, 10 /* XXX */, 6 /* XXX */,
656 16 /* XXX */, 0, 0);
657 else
658 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
659 0, 12 /* XXX */, 8 /* XXX */,
660 18 /* XXX */, 0, 0);
661
662 case SHADER_OPCODE_BROADCAST:
663 if (devinfo->gen >= 11)
664 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
665 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
666 else if (devinfo->gen >= 8)
667 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
668 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
669 else if (devinfo->is_haswell)
670 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
671 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
672 else if (devinfo->gen >= 7)
673 return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
674 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
675 else
676 abort();
677
678 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
679 if (devinfo->gen >= 11)
680 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
681 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
682 else if (devinfo->gen >= 8)
683 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
684 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
685 else if (devinfo->is_haswell)
686 return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
687 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
688 else if (devinfo->gen >= 7)
689 return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
690 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
691 else
692 abort();
693
694 case SHADER_OPCODE_RND_MODE:
695 case SHADER_OPCODE_FLOAT_CONTROL_MODE:
696 if (devinfo->gen >= 11)
697 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
698 4 /* XXX */, 0,
699 0, 0, 0, 0, 0, 0);
700 else if (devinfo->gen >= 8)
701 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
702 4 /* XXX */, 0,
703 0, 0, 0, 0, 0, 0);
704 else if (devinfo->is_haswell)
705 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
706 4 /* XXX */, 0,
707 0, 0, 0, 0, 0, 0);
708 else if (devinfo->gen >= 6)
709 return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
710 4 /* XXX */, 0,
711 0, 0, 0, 0, 0, 0);
712 else
713 abort();
714
715 case SHADER_OPCODE_SHUFFLE:
716 if (devinfo->gen >= 11)
717 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
718 44 /* XXX */, 0,
719 0, 10 /* XXX */, 6 /* XXX */,
720 14 /* XXX */, 0, 0);
721 else if (devinfo->gen >= 8)
722 return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
723 42 /* XXX */, 0,
724 0, 8 /* XXX */, 4 /* XXX */,
725 12 /* XXX */, 0, 0);
726 else if (devinfo->is_haswell)
727 return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
728 0, 44 /* XXX */,
729 0, 10 /* XXX */, 6 /* XXX */,
730 16 /* XXX */, 0, 0);
731 else if (devinfo->gen >= 6)
732 return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
733 0, 46 /* XXX */,
734 0, 12 /* XXX */, 8 /* XXX */,
735 18 /* XXX */, 0, 0);
736 else
737 abort();
738
739 case SHADER_OPCODE_SEL_EXEC:
740 if (devinfo->gen >= 11)
741 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
742 0, 4 /* XXX */,
743 0, 10 /* XXX */, 6 /* XXX */,
744 14 /* XXX */, 0, 0);
745 else if (devinfo->gen >= 8)
746 return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
747 0, 4 /* XXX */,
748 0, 8 /* XXX */, 4 /* XXX */,
749 12 /* XXX */, 0, 0);
750 else if (devinfo->is_haswell)
751 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
752 0, 4 /* XXX */,
753 0, 10 /* XXX */, 6 /* XXX */,
754 16 /* XXX */, 0, 0);
755 else
756 return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
757 0, 4 /* XXX */,
758 0, 12 /* XXX */, 8 /* XXX */,
759 18 /* XXX */, 0, 0);
760
761 case SHADER_OPCODE_QUAD_SWIZZLE:
762 if (devinfo->gen >= 11)
763 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
764 0, 8 /* XXX */,
765 0, 10 /* XXX */, 6 /* XXX */,
766 14 /* XXX */, 0, 0);
767 else if (devinfo->gen >= 8)
768 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
769 0, 8 /* XXX */,
770 0, 8 /* XXX */, 4 /* XXX */,
771 12 /* XXX */, 0, 0);
772 else if (devinfo->is_haswell)
773 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
774 0, 8 /* XXX */,
775 0, 10 /* XXX */, 6 /* XXX */,
776 16 /* XXX */, 0, 0);
777 else
778 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
779 0, 8 /* XXX */,
780 0, 12 /* XXX */, 8 /* XXX */,
781 18 /* XXX */, 0, 0);
782
783 case FS_OPCODE_DDY_FINE:
784 if (devinfo->gen >= 11)
785 return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
786 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
787 else if (devinfo->gen >= 8)
788 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
789 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
790 else if (devinfo->is_haswell)
791 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
792 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
793 else
794 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
795 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
796
797 case FS_OPCODE_LOAD_LIVE_CHANNELS:
798 if (devinfo->gen >= 11)
799 return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
800 2 /* XXX */, 0,
801 0, 0, 0, 10 /* XXX */, 0, 0);
802 else if (devinfo->gen >= 8)
803 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
804 0, 2 /* XXX */,
805 0, 0, 0, 8 /* XXX */, 0, 0);
806 else
807 abort();
808
809 case VEC4_OPCODE_PACK_BYTES:
810 if (devinfo->gen >= 8)
811 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
812 4 /* XXX */, 0,
813 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
814 0, 0);
815 else if (devinfo->is_haswell)
816 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
817 4 /* XXX */, 0,
818 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
819 0, 0);
820 else
821 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
822 4 /* XXX */, 0,
823 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
824 0, 0);
825
826 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
827 if (devinfo->gen >= 8)
828 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
829 4 /* XXX */, 0,
830 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
831 0, 0);
832 else
833 abort();
834
835 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
836 case TCS_OPCODE_GET_INSTANCE_ID:
837 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
838 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
839 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
840 if (devinfo->gen >= 8)
841 return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
842 6 /* XXX */, 0,
843 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
844 0, 0);
845 else if (devinfo->is_haswell)
846 return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
847 6 /* XXX */, 0,
848 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
849 0, 0);
850 else
851 return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
852 6 /* XXX */, 0,
853 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
854 0, 0);
855
856 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
857 case TCS_OPCODE_CREATE_BARRIER_HEADER:
858 if (devinfo->gen >= 8)
859 return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
860 8 /* XXX */, 0,
861 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
862 0, 0);
863 else if (devinfo->is_haswell)
864 return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
865 8 /* XXX */, 0,
866 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
867 0, 0);
868 else if (devinfo->gen >= 6)
869 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
870 8 /* XXX */, 0,
871 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
872 0, 0);
873 else
874 abort();
875
876 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
877 if (devinfo->gen >= 8)
878 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
879 4 /* XXX */, 0,
880 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
881 0, 0);
882 else if (devinfo->is_haswell)
883 return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
884 4 /* XXX */, 0,
885 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
886 0, 0);
887 else if (devinfo->gen >= 7)
888 return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
889 4 /* XXX */, 0,
890 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
891 0, 0);
892 else
893 abort();
894
895 case SHADER_OPCODE_TEX:
896 case FS_OPCODE_TXB:
897 case SHADER_OPCODE_TXD:
898 case SHADER_OPCODE_TXF:
899 case SHADER_OPCODE_TXF_LZ:
900 case SHADER_OPCODE_TXL:
901 case SHADER_OPCODE_TXL_LZ:
902 case SHADER_OPCODE_TXF_CMS:
903 case SHADER_OPCODE_TXF_CMS_W:
904 case SHADER_OPCODE_TXF_UMS:
905 case SHADER_OPCODE_TXF_MCS:
906 case SHADER_OPCODE_TXS:
907 case SHADER_OPCODE_LOD:
908 case SHADER_OPCODE_GET_BUFFER_SIZE:
909 case SHADER_OPCODE_TG4:
910 case SHADER_OPCODE_TG4_OFFSET:
911 case SHADER_OPCODE_SAMPLEINFO:
912 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
913 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
914 8 /* XXX */, 750 /* XXX */, 0, 0,
915 2 /* XXX */, 0);
916
917 case SHADER_OPCODE_URB_READ_SIMD8:
918 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
919 case SHADER_OPCODE_URB_WRITE_SIMD8:
920 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
921 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
922 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
923 case VEC4_OPCODE_URB_READ:
924 case VS_OPCODE_URB_WRITE:
925 case GS_OPCODE_URB_WRITE:
926 case GS_OPCODE_URB_WRITE_ALLOCATE:
927 case GS_OPCODE_THREAD_END:
928 case GS_OPCODE_FF_SYNC:
929 case TCS_OPCODE_URB_WRITE:
930 case TCS_OPCODE_RELEASE_INPUT:
931 case TCS_OPCODE_THREAD_END:
932 return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
933 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
934
935 case SHADER_OPCODE_MEMORY_FENCE:
936 case SHADER_OPCODE_INTERLOCK:
937 if (devinfo->gen >= 7)
938 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
939 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
940 else
941 abort();
942
943 case SHADER_OPCODE_GEN4_SCRATCH_READ:
944 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
945 case SHADER_OPCODE_GEN7_SCRATCH_READ:
946 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
947 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
948
949 case VEC4_OPCODE_UNTYPED_ATOMIC:
950 if (devinfo->gen >= 7)
951 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
952 30 /* XXX */, 400 /* XXX */,
953 10 /* XXX */, 100 /* XXX */, 0, 0,
954 0, 400 /* XXX */);
955 else
956 abort();
957
958 case VEC4_OPCODE_UNTYPED_SURFACE_READ:
959 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
960 if (devinfo->gen >= 7)
961 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
962 0, 20 /* XXX */,
963 10 /* XXX */, 100 /* XXX */, 0, 0,
964 0, 0);
965 else
966 abort();
967
968 case FS_OPCODE_FB_WRITE:
969 case FS_OPCODE_FB_READ:
970 case FS_OPCODE_REP_FB_WRITE:
971 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
972 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
973
974 case GS_OPCODE_SVB_WRITE:
975 if (devinfo->gen >= 6)
976 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
977 0, 450 /* XXX */,
978 10 /* XXX */, 300 /* XXX */, 0, 0,
979 0, 0);
980 else
981 abort();
982
983 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
984 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
985 return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
986 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
987
988 case VS_OPCODE_PULL_CONSTANT_LOAD:
989 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
990 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
991 8, 750, 0, 0, 2, 0);
992
993 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
994 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
995 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
996 if (devinfo->gen >= 7)
997 return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
998 0, 90 /* XXX */, 0, 0, 0, 0);
999 else
1000 abort();
1001
1002 case SHADER_OPCODE_BARRIER:
1003 if (devinfo->gen >= 7)
1004 return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1005 0 /* XXX */, 0,
1006 0, 0, 0, 0, 0, 0);
1007 else
1008 abort();
1009
1010 case CS_OPCODE_CS_TERMINATE:
1011 if (devinfo->gen >= 7)
1012 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1013 10 /* XXX */, 0, 0, 0, 0, 0);
1014 else
1015 abort();
1016
1017 case SHADER_OPCODE_SEND:
1018 switch (info.sfid) {
1019 case GEN6_SFID_DATAPORT_RENDER_CACHE:
1020 if (devinfo->gen >= 7) {
1021 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1022 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
1023 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1024 30 /* XXX */, 450 /* XXX */,
1025 10 /* XXX */, 100 /* XXX */,
1026 0, 0, 0, 400 /* XXX */);
1027 default:
1028 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1029 0, 450 /* XXX */,
1030 10 /* XXX */, 300 /* XXX */, 0, 0,
1031 0, 0);
1032 }
1033 } else if (devinfo->gen >= 6) {
1034 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1035 0, 450 /* XXX */,
1036 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1037 } else {
1038 abort();
1039 }
1040 case BRW_SFID_SAMPLER: {
1041 if (devinfo->gen >= 6)
1042 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1043 8, 750, 0, 0, 2, 0);
1044 else
1045 abort();
1046 }
1047 case GEN7_SFID_DATAPORT_DATA_CACHE:
1048 case HSW_SFID_DATAPORT_DATA_CACHE_1:
1049 if (devinfo->gen >= 8 || devinfo->is_haswell) {
1050 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1051 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1052 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1053 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1054 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1055 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1056 30 /* XXX */, 400 /* XXX */,
1057 10 /* XXX */, 100 /* XXX */, 0, 0,
1058 0, 400 /* XXX */);
1059
1060 default:
1061 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1062 0, 20 /* XXX */,
1063 10 /* XXX */, 100 /* XXX */, 0, 0,
1064 0, 0);
1065 }
1066 } else if (devinfo->gen >= 7) {
1067 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1068 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1069 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1070 30 /* XXX */, 400 /* XXX */,
1071 10 /* XXX */, 100 /* XXX */,
1072 0, 0, 0, 400 /* XXX */);
1073 default:
1074 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1075 0, 20 /* XXX */,
1076 10 /* XXX */, 100 /* XXX */, 0, 0,
1077 0, 0);
1078 }
1079 } else {
1080 abort();
1081 }
1082 default:
1083 abort();
1084 }
1085
1086 case SHADER_OPCODE_UNDEF:
1087 case FS_OPCODE_PLACEHOLDER_HALT:
1088 case FS_OPCODE_SCHEDULING_FENCE:
1089 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1090 0, 0, 0, 0, 0, 0);
1091
1092 default:
1093 abort();
1094 }
1095 }
1096
1097 /**
1098 * Model the performance behavior of a stall on the specified dependency
1099 * ID.
1100 */
1101 void
1102 stall_on_dependency(state &st, dependency_id id)
1103 {
1104 if (id < ARRAY_SIZE(st.dep_ready))
1105 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1106 st.dep_ready[id]);
1107 }
1108
1109 /**
1110 * Model the performance behavior of the front-end and back-end while
1111 * executing an instruction with the specified timing information, assuming
1112 * all dependencies are already clear.
1113 */
1114 void
1115 execute_instruction(state &st, const perf_desc &perf)
1116 {
1117 /* Compute the time at which the front-end will be ready to execute the
1118 * next instruction.
1119 */
1120 st.unit_ready[unit_fe] += perf.df;
1121
1122 if (perf.u < num_units) {
1123 /* Wait for the back-end to be ready to execute this instruction. */
1124 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1125 st.unit_ready[perf.u]);
1126
1127 /* Compute the time at which the back-end will be ready to execute
1128 * the next instruction, and update the back-end utilization.
1129 */
1130 st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1131 st.unit_busy[perf.u] += perf.db * st.weight;
1132 }
1133 }
1134
1135 /**
1136 * Model the performance behavior of a read dependency provided by an
1137 * instruction.
1138 */
1139 void
1140 mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1141 {
1142 if (id < ARRAY_SIZE(st.dep_ready))
1143 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1144 }
1145
1146 /**
1147 * Model the performance behavior of a write dependency provided by an
1148 * instruction.
1149 */
1150 void
1151 mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1152 {
1153 if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1154 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1155 else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1156 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1157 else if (id < ARRAY_SIZE(st.dep_ready))
1158 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1159 }
1160
1161 /**
1162 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1163 */
   dependency_id
   reg_dependency_id(const gen_device_info *devinfo, const backend_reg &r,
                     const int delta)
   {
      if (r.file == VGRF) {
         /* Virtual GRFs are tracked at REG_SIZE granularity within the GRF
          * dependency-ID range.
          */
         const unsigned i = r.nr + r.offset / REG_SIZE + delta;
         assert(i < dependency_id_mrf0 - dependency_id_grf0);
         return dependency_id(dependency_id_grf0 + i);

      } else if (r.file == FIXED_GRF) {
         const unsigned i = r.nr + delta;
         assert(i < dependency_id_mrf0 - dependency_id_grf0);
         return dependency_id(dependency_id_grf0 + i);

      } else if (r.file == MRF && devinfo->gen >= 7) {
         /* On Gen7+ MRFs are emulated with GRFs starting at
          * GEN7_MRF_HACK_START, so track them in the GRF ID range.
          */
         const unsigned i = GEN7_MRF_HACK_START +
                            r.nr + r.offset / REG_SIZE + delta;
         assert(i < dependency_id_mrf0 - dependency_id_grf0);
         return dependency_id(dependency_id_grf0 + i);

      } else if (r.file == MRF && devinfo->gen < 7) {
         /* Strip the COMPR4 flag bit encoded in the MRF number before
          * computing the index into the MRF ID range.
          */
         const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
                            r.offset / REG_SIZE + delta;
         assert(i < dependency_id_addr0 - dependency_id_mrf0);
         return dependency_id(dependency_id_mrf0 + i);

      } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
                 r.nr < BRW_ARF_ACCUMULATOR) {
         /* Only a single address-register dependency slot is modeled. */
         assert(delta == 0);
         return dependency_id_addr0;

      } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
                 r.nr < BRW_ARF_FLAG) {
         const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
         assert(i < dependency_id_flag0 - dependency_id_accum0);
         return dependency_id(dependency_id_accum0 + i);

      } else {
         /* Any other file carries no tracked dependency -- return the
          * out-of-range sentinel, which the mark/stall helpers ignore.
          */
         return num_dependency_ids;
      }
   }
1205
1206 /**
1207 * Return the dependency ID of flag register starting at offset \p i.
1208 */
1209 dependency_id
1210 flag_dependency_id(unsigned i)
1211 {
1212 assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
1213 return dependency_id(dependency_id_flag0 + i);
1214 }
1215
1216 /**
1217 * Return the dependency ID corresponding to the SBID read completion
1218 * condition of a Gen12+ SWSB.
1219 */
1220 dependency_id
1221 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1222 {
1223 if (swsb.mode) {
1224 assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1225 return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1226 } else {
1227 return num_dependency_ids;
1228 }
1229 }
1230
1231 /**
1232 * Return the dependency ID corresponding to the SBID write completion
1233 * condition of a Gen12+ SWSB.
1234 */
1235 dependency_id
1236 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1237 {
1238 if (swsb.mode) {
1239 assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1240 return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1241 } else {
1242 return num_dependency_ids;
1243 }
1244 }
1245
1246 /**
1247 * Return the implicit accumulator register accessed by channel \p i of the
1248 * instruction.
1249 */
1250 unsigned
1251 accum_reg_of_channel(const gen_device_info *devinfo,
1252 const backend_instruction *inst,
1253 brw_reg_type tx, unsigned i)
1254 {
1255 assert(inst->reads_accumulator_implicitly() ||
1256 inst->writes_accumulator_implicitly(devinfo));
1257 const unsigned offset = (inst->group + i) * type_sz(tx) *
1258 (devinfo->gen < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1259 return offset / REG_SIZE % 2;
1260 }
1261
1262 /**
1263 * Model the performance behavior of an FS back-end instruction.
1264 */
   void
   issue_fs_inst(state &st, const gen_device_info *devinfo,
                 const backend_instruction *be_inst)
   {
      const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
      const instruction_info info(devinfo, inst);
      const perf_desc perf = instruction_desc(info);

      /* Stall on any source dependencies. */
      for (unsigned i = 0; i < inst->sources; i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->src[i], j));
      }

      /* Stall on the implicitly-read accumulator registers, iterating over
       * the acc0/acc1 range actually covered by the execution channels.
       */
      if (inst->reads_accumulator_implicitly()) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      /* Stall on the MRF message payload of a send, if it has one. */
      if (is_send(inst) && inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            stall_on_dependency(
               st, reg_dependency_id(
                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      /* Stall on every flag subregister read, one bit per subregister in
       * the mask.
       */
      if (const unsigned mask = inst->flags_read(devinfo)) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }

      /* Stall on any write dependencies.  Skipped when software scoreboard
       * checking is disabled for this instruction (no_dd_check).
       */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, inst->dst, j));
         }

         if (inst->writes_accumulator_implicitly(devinfo)) {
            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                           inst->exec_size - 1); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
         }

         if (const unsigned mask = inst->flags_written()) {
            for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
               if (mask & (1 << i))
                  stall_on_dependency(st, flag_dependency_id(i));
            }
         }
      }

      /* Stall on any SBID dependencies (Gen12+ SWSB annotations).  A SET or
       * DST annotation waits for write completion, a SRC annotation only
       * for read completion.
       */
      if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
         stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
      else if (inst->sched.mode & TGL_SBID_SRC)
         stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

      /* Execute the instruction. */
      execute_instruction(st, perf);

      /* Mark any source dependencies.  Only the GRF payload sources of a
       * send need a read-completion timestamp.
       */
      if (inst->is_send_from_grf()) {
         for (unsigned i = 0; i < inst->sources; i++) {
            if (inst->is_payload(i)) {
               for (unsigned j = 0; j < regs_read(inst, i); j++)
                  mark_read_dependency(
                     st, perf, reg_dependency_id(devinfo, inst->src[i], j));
            }
         }
      }

      if (is_send(inst) && inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            mark_read_dependency(st, perf,
               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      /* Mark any destination dependencies. */
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++) {
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, inst->dst, j));
         }
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written()) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               mark_write_dependency(st, perf, flag_dependency_id(i));
         }
      }

      /* Mark any SBID dependencies.  A SET annotation allocates the SBID,
       * so record both its read and write completion times.
       */
      if (inst->sched.mode & TGL_SBID_SET) {
         mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
         mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
      }
   }
1381
1382 /**
1383 * Model the performance behavior of a VEC4 back-end instruction.
1384 */
   void
   issue_vec4_instruction(state &st, const gen_device_info *devinfo,
                          const backend_instruction *be_inst)
   {
      const vec4_instruction *inst =
         static_cast<const vec4_instruction *>(be_inst);
      const instruction_info info(devinfo, inst);
      const perf_desc perf = instruction_desc(info);

      /* Stall on any source dependencies.  Unlike the FS path this iterates
       * over the fixed-size source array of the vec4 IR.
       */
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->src[i], j));
      }

      /* Stall on the implicitly-read accumulator registers, iterating over
       * the acc0/acc1 range actually covered by the execution channels.
       */
      if (inst->reads_accumulator_implicitly()) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      /* Stall on the MRF message payload, if any. */
      if (inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            stall_on_dependency(
               st, reg_dependency_id(
                  devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      /* The vec4 IR only models a single flag register dependency slot. */
      if (inst->reads_flag())
         stall_on_dependency(st, dependency_id_flag0);

      /* Stall on any write dependencies.  Skipped when software scoreboard
       * checking is disabled for this instruction (no_dd_check).
       */
      if (!inst->no_dd_check) {
         if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
            for (unsigned j = 0; j < regs_written(inst); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, inst->dst, j));
         }

         if (inst->writes_accumulator_implicitly(devinfo)) {
            for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
                 j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                           inst->exec_size - 1); j++)
               stall_on_dependency(
                  st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
         }

         if (inst->writes_flag())
            stall_on_dependency(st, dependency_id_flag0);
      }

      /* Execute the instruction. */
      execute_instruction(st, perf);

      /* Mark any source dependencies of a send-from-GRF message. */
      if (inst->is_send_from_grf()) {
         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }

      if (inst->base_mrf != -1) {
         for (unsigned j = 0; j < inst->mlen; j++)
            mark_read_dependency(st, perf,
               reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
      }

      /* Mark any destination dependencies. */
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++) {
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, inst->dst, j));
         }
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            mark_write_dependency(st, perf,
                                  reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (inst->writes_flag())
         mark_write_dependency(st, perf, dependency_id_flag0);
   }
1476
1477 /**
1478 * Calculate the maximum possible throughput of the program compatible with
1479 * the cycle-count utilization estimated for each asynchronous unit, in
1480 * threads-per-cycle units.
1481 */
1482 float
1483 calculate_thread_throughput(const state &st, float busy)
1484 {
1485 for (unsigned i = 0; i < num_units; i++)
1486 busy = MAX2(busy, st.unit_busy[i]);
1487
1488 return 1.0 / busy;
1489 }
1490
1491 /**
1492 * Estimate the performance of the specified shader.
1493 */
   void
   calculate_performance(performance &p, const backend_shader *s,
                         void (*issue_instruction)(
                            state &, const gen_device_info *,
                            const backend_instruction *),
                         unsigned dispatch_width)
   {
      /* XXX - Plumbing the trip counts from NIR loop analysis would allow us
       *       to do a better job regarding the loop weights.  And some branch
       *       divergence analysis would allow us to do a better job with
       *       branching weights.
       *
       *       In the meantime use values that roughly match the control flow
       *       weights used elsewhere in the compiler back-end -- Main
       *       difference is the worst-case scenario branch_weight used for
       *       SIMD32 which accounts for the possibility of a dynamically
       *       uniform branch becoming divergent in SIMD32.
       */
      const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
      const float loop_weight = 10;
      unsigned elapsed = 0;
      state st;

      foreach_block(block, s->cfg) {
         const unsigned elapsed0 = elapsed;

         foreach_inst_in_block(backend_instruction, inst, block) {
            /* Snapshot the front-end clock so this instruction's issue cost
             * can be measured as a delta after it executes.
             */
            const unsigned clock0 = st.unit_ready[unit_fe];

            issue_instruction(st, s->devinfo, inst);

            /* Undo the branch weight *before* accumulating so the ENDIF
             * itself is accounted at the weight of the enclosing block.
             */
            if (inst->opcode == BRW_OPCODE_ENDIF)
               st.weight /= branch_weight;

            /* Accumulate the front-end cycles consumed by this instruction,
             * scaled by the estimated execution weight of its block.
             */
            elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;

            /* Adjust the weight *after* accumulating so the IF/DO/WHILE
             * instructions themselves count at the outer weight, while
             * everything until the matching reconvergence point is scaled.
             */
            if (inst->opcode == BRW_OPCODE_IF)
               st.weight *= branch_weight;
            else if (inst->opcode == BRW_OPCODE_DO)
               st.weight *= loop_weight;
            else if (inst->opcode == BRW_OPCODE_WHILE)
               st.weight /= loop_weight;
         }

         /* Weighted cycle estimate attributable to this basic block alone. */
         p.block_latency[block->num] = elapsed - elapsed0;
      }

      p.latency = elapsed;
      p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
   }
1544 }
1545
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   /* Model the scalar (FS) back-end program at its actual dispatch width. */
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1551
brw::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   /* vec4 programs are modeled at a fixed 8-channel dispatch width. */
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
1557
brw::performance::~performance()
{
   /* Release the per-block latency array allocated by the constructors. */
   delete[] block_latency;
}