// Source: mesa.git — src/intel/compiler/brw_ir_performance.cpp
// (gitweb extraction header; accompanying commit: "Added a few more stubs so that control reaches DestroyDevice().")
1 /*
2 * Copyright © 2020 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_eu.h"
25 #include "brw_fs.h"
26 #include "brw_vec4.h"
27 #include "brw_cfg.h"
28
29 using namespace brw;
30
31 namespace {
   /**
    * Enumeration representing the various asynchronous units that can run
    * computations in parallel on behalf of a shader thread.
    */
   enum unit {
      /** EU front-end. */
      unit_fe,
      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
      unit_fpu,
      /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
      unit_em,
      /** Sampler shared function. */
      unit_sampler,
      /** Pixel Interpolator shared function. */
      unit_pi,
      /** Unified Return Buffer shared function. */
      unit_urb,
      /** Data Port Data Cache shared function. */
      unit_dp_dc,
      /** Data Port Render Cache shared function. */
      unit_dp_rc,
      /** Data Port Constant Cache shared function. */
      unit_dp_cc,
      /** Message Gateway shared function. */
      unit_gateway,
      /** Thread Spawner shared function. */
      unit_spawner,
      /* unit_vme, */
      /* unit_cre, */
      /**
       * Number of asynchronous units currently tracked.  New real units must
       * be added above this entry so the per-unit arrays stay correctly sized.
       */
      num_units,
      /** Dummy unit for instructions that don't consume runtime from the above. */
      unit_null = num_units
   };
66
   /**
    * Enumeration representing a computation result another computation can
    * potentially depend on.  Each register (or token) that can carry a
    * dependency gets its own consecutive ID, so the IDs form a flat index
    * space usable as an array subscript (see state::dep_ready).
    */
   enum dependency_id {
      /* Register part of the GRF.  BRW_MAX_GRF consecutive IDs. */
      dependency_id_grf0 = 0,
      /* Register part of the MRF.  Only used on Gen4-6; 24 slots reserved. */
      dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
      /* Address register part of the ARF.  A single slot. */
      dependency_id_addr0 = dependency_id_mrf0 + 24,
      /* Accumulator register part of the ARF.  12 slots reserved. */
      dependency_id_accum0 = dependency_id_addr0 + 1,
      /* Flag register part of the ARF.  8 slots reserved. */
      dependency_id_flag0 = dependency_id_accum0 + 12,
      /* SBID token write completion.  Only used on Gen12+; 16 tokens. */
      dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
      /* SBID token read completion.  Only used on Gen12+; 16 tokens. */
      dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
      /* Number of computation dependencies currently tracked. */
      num_dependency_ids = dependency_id_sbid_rd0 + 16
   };
89
90 /**
91 * State of our modeling of the program execution.
92 */
93 struct state {
94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
95 /**
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
98 */
99 unsigned unit_ready[num_units];
100 /**
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
103 */
104 unsigned dep_ready[num_dependency_ids];
105 /**
106 * Aggregated utilization of a given unit excluding idle cycles,
107 * in clock units.
108 */
109 float unit_busy[num_units];
110 /**
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
113 */
114 float weight;
115 };
116
   /**
    * Information derived from an IR instruction used to compute performance
    * estimates.  Allows the timing calculation to work on both FS and VEC4
    * instructions.
    *
    * All sizes (sd, sx, ss, sc) are expressed in GRF register units so the
    * linear timing model in calculate_desc() can combine them uniformly.
    */
   struct instruction_info {
      /** Gather timing-relevant fields from a scalar (FS) back-end instruction. */
      instruction_info(const gen_device_info *devinfo, const fs_inst *inst) :
         devinfo(devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0),
         sc(has_bank_conflict(devinfo, inst) ? sd : 0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* We typically want the maximum source size, except for split send
          * messages which require the total size.
          * (For SHADER_OPCODE_SEND, sources 2 and 3 presumably carry the two
          * message payloads — confirm against the send lowering code.)
          */
         if (inst->opcode == SHADER_OPCODE_SEND) {
            ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
                 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
         } else {
            for (unsigned i = 0; i < inst->sources; i++)
               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
         }

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision by widening the execution type to an
          * 8-byte integer of matching signedness.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
      }

      /** Gather timing-relevant fields from a VEC4 back-end instruction. */
      instruction_info(const gen_device_info *devinfo,
                       const vec4_instruction *inst) :
         devinfo(devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* Compute the maximum source size. */
         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision (same special case as the FS
          * constructor above).
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
      }

      /** Device information. */
      const struct gen_device_info *devinfo;
      /** Instruction opcode. */
      opcode op;
      /** Destination type. */
      brw_reg_type td;
      /** Destination size in GRF units. */
      unsigned sd;
      /** Execution type. */
      brw_reg_type tx;
      /** Execution size in GRF units. */
      unsigned sx;
      /**
       * Source size in GRF units: maximum over all sources, except for split
       * send messages where it's the total payload size.
       */
      unsigned ss;
      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
      unsigned sc;
      /** Send message descriptor. */
      uint32_t desc;
      /** Send message shared function ID. */
      uint8_t sfid;
   };
197
198 /**
199 * Timing information of an instruction used to estimate the performance of
200 * the program.
201 */
202 struct perf_desc {
203 perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
204 u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}
205
206 /**
207 * Back-end unit its runtime shall be accounted to, in addition to the
208 * EU front-end which is always assumed to be involved.
209 */
210 unit u;
211 /**
212 * Overhead cycles from the time that the EU front-end starts executing
213 * the instruction until it's ready to execute the next instruction.
214 */
215 int df;
216 /**
217 * Overhead cycles from the time that the back-end starts executing the
218 * instruction until it's ready to execute the next instruction.
219 */
220 int db;
221 /**
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its sources have been read from the register file.
224 */
225 int ls;
226 /**
227 * Latency cycles from the time that the back-end starts executing the
228 * instruction until its regular destination has been written to the
229 * register file.
230 */
231 int ld;
232 /**
233 * Latency cycles from the time that the back-end starts executing the
234 * instruction until its accumulator destination has been written to the
235 * ARF file.
236 *
237 * Note that this is an approximation of the real behavior of
238 * accumulating instructions in the hardware: Instead of modeling a pair
239 * of back-to-back accumulating instructions as a first computation with
240 * latency equal to ld followed by another computation with a
241 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242 * model the stall as if it occurred at the top of the pipeline, with
243 * the latency of the accumulator computation offset accordingly.
244 */
245 int la;
246 /**
247 * Latency cycles from the time that the back-end starts executing the
248 * instruction until its flag destination has been written to the ARF
249 * file.
250 */
251 int lf;
252 };
253
254 /**
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
260 */
261 perf_desc
262 calculate_desc(const instruction_info &info, unit u,
263 int df_1, int df_sd, int df_sc,
264 int db_1, int db_sx,
265 int ls_1, int ld_1, int la_1, int lf_1,
266 int l_ss, int l_sd)
267 {
268 return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
269 db_1 + db_sx * int(info.sx),
270 ls_1 + l_ss * int(info.ss),
271 ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
272 la_1, lf_1);
273 }
274
275 /**
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
279 *
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
287 * unavailable.
288 */
289 const perf_desc
290 instruction_desc(const instruction_info &info)
291 {
292 const struct gen_device_info *devinfo = info.devinfo;
293
294 switch (info.op) {
295 case BRW_OPCODE_SYNC:
296 case BRW_OPCODE_SEL:
297 case BRW_OPCODE_NOT:
298 case BRW_OPCODE_AND:
299 case BRW_OPCODE_OR:
300 case BRW_OPCODE_XOR:
301 case BRW_OPCODE_SHR:
302 case BRW_OPCODE_SHL:
303 case BRW_OPCODE_DIM:
304 case BRW_OPCODE_ASR:
305 case BRW_OPCODE_CMPN:
306 case BRW_OPCODE_F16TO32:
307 case BRW_OPCODE_BFREV:
308 case BRW_OPCODE_BFI1:
309 case BRW_OPCODE_AVG:
310 case BRW_OPCODE_FRC:
311 case BRW_OPCODE_RNDU:
312 case BRW_OPCODE_RNDD:
313 case BRW_OPCODE_RNDE:
314 case BRW_OPCODE_RNDZ:
315 case BRW_OPCODE_MAC:
316 case BRW_OPCODE_MACH:
317 case BRW_OPCODE_LZD:
318 case BRW_OPCODE_FBH:
319 case BRW_OPCODE_FBL:
320 case BRW_OPCODE_CBIT:
321 case BRW_OPCODE_ADDC:
322 case BRW_OPCODE_ROR:
323 case BRW_OPCODE_ROL:
324 case BRW_OPCODE_SUBB:
325 case BRW_OPCODE_SAD2:
326 case BRW_OPCODE_SADA2:
327 case BRW_OPCODE_LINE:
328 case BRW_OPCODE_NOP:
329 case SHADER_OPCODE_CLUSTER_BROADCAST:
330 case FS_OPCODE_DDX_COARSE:
331 case FS_OPCODE_DDX_FINE:
332 case FS_OPCODE_DDY_COARSE:
333 case FS_OPCODE_PIXEL_X:
334 case FS_OPCODE_PIXEL_Y:
335 case FS_OPCODE_SET_SAMPLE_ID:
336 case VEC4_OPCODE_MOV_BYTES:
337 case VEC4_OPCODE_UNPACK_UNIFORM:
338 case VEC4_OPCODE_DOUBLE_TO_F32:
339 case VEC4_OPCODE_DOUBLE_TO_D32:
340 case VEC4_OPCODE_DOUBLE_TO_U32:
341 case VEC4_OPCODE_TO_DOUBLE:
342 case VEC4_OPCODE_PICK_LOW_32BIT:
343 case VEC4_OPCODE_PICK_HIGH_32BIT:
344 case VEC4_OPCODE_SET_LOW_32BIT:
345 case VEC4_OPCODE_SET_HIGH_32BIT:
346 case GS_OPCODE_SET_DWORD_2:
347 case GS_OPCODE_SET_WRITE_OFFSET:
348 case GS_OPCODE_SET_VERTEX_COUNT:
349 case GS_OPCODE_PREPARE_CHANNEL_MASKS:
350 case GS_OPCODE_SET_CHANNEL_MASKS:
351 case GS_OPCODE_GET_INSTANCE_ID:
352 case GS_OPCODE_SET_PRIMITIVE_ID:
353 case GS_OPCODE_SVB_SET_DST_INDEX:
354 case TCS_OPCODE_SRC0_010_IS_ZERO:
355 case TCS_OPCODE_GET_PRIMITIVE_ID:
356 case TES_OPCODE_GET_PRIMITIVE_ID:
357 if (devinfo->gen >= 11) {
358 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
359 0, 10, 6 /* XXX */, 14, 0, 0);
360 } else if (devinfo->gen >= 8) {
361 if (type_sz(info.tx) > 4)
362 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
363 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
364 else
365 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
366 0, 8, 4, 12, 0, 0);
367 } else if (devinfo->is_haswell) {
368 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
369 0, 10, 6 /* XXX */, 16, 0, 0);
370 } else {
371 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
372 0, 12, 8 /* XXX */, 18, 0, 0);
373 }
374
375 case BRW_OPCODE_MOV:
376 case BRW_OPCODE_CMP:
377 case BRW_OPCODE_ADD:
378 case BRW_OPCODE_MUL:
379 case SHADER_OPCODE_MOV_RELOC_IMM:
380 if (devinfo->gen >= 11) {
381 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
382 0, 10, 6, 14, 0, 0);
383 } else if (devinfo->gen >= 8) {
384 if (type_sz(info.tx) > 4)
385 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
386 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
387 else
388 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
389 0, 8, 4, 12, 0, 0);
390 } else if (devinfo->is_haswell) {
391 if (info.tx == BRW_REGISTER_TYPE_F)
392 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
393 0, 12, 8 /* XXX */, 18, 0, 0);
394 else
395 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
396 0, 10, 6 /* XXX */, 16, 0, 0);
397 } else if (devinfo->gen >= 7) {
398 if (info.tx == BRW_REGISTER_TYPE_F)
399 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
400 0, 14, 10 /* XXX */, 20, 0, 0);
401 else
402 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
403 0, 12, 8 /* XXX */, 18, 0, 0);
404 } else {
405 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
406 0, 2 /* XXX */,
407 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
408 0, 0);
409 }
410
411 case BRW_OPCODE_BFE:
412 case BRW_OPCODE_BFI2:
413 case BRW_OPCODE_CSEL:
414 if (devinfo->gen >= 11)
415 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
416 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
417 else if (devinfo->gen >= 8)
418 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
419 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
420 else if (devinfo->is_haswell)
421 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
422 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
423 else if (devinfo->gen >= 7)
424 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
425 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
426 else
427 abort();
428
429 case BRW_OPCODE_MAD:
430 if (devinfo->gen >= 11) {
431 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
432 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
433 } else if (devinfo->gen >= 8) {
434 if (type_sz(info.tx) > 4)
435 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
436 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
437 else
438 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
439 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
440 } else if (devinfo->is_haswell) {
441 if (info.tx == BRW_REGISTER_TYPE_F)
442 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
443 0, 12, 8 /* XXX */, 18, 0, 0);
444 else
445 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
446 0, 10, 6 /* XXX */, 16, 0, 0);
447 } else if (devinfo->gen >= 7) {
448 if (info.tx == BRW_REGISTER_TYPE_F)
449 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
450 0, 14, 10 /* XXX */, 20, 0, 0);
451 else
452 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
453 0, 12, 8 /* XXX */, 18, 0, 0);
454 } else if (devinfo->gen >= 6) {
455 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */,
456 0, 2 /* XXX */,
457 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
458 0, 0);
459 } else {
460 abort();
461 }
462
463 case BRW_OPCODE_F32TO16:
464 if (devinfo->gen >= 11)
465 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
466 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
467 else if (devinfo->gen >= 8)
468 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
469 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
470 else if (devinfo->is_haswell)
471 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
472 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
473 else if (devinfo->gen >= 7)
474 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
475 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
476 else
477 abort();
478
479 case BRW_OPCODE_DP4:
480 case BRW_OPCODE_DPH:
481 case BRW_OPCODE_DP3:
482 case BRW_OPCODE_DP2:
483 if (devinfo->gen >= 8)
484 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
485 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
486 else if (devinfo->is_haswell)
487 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
488 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
489 else
490 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
491 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
492
493 case SHADER_OPCODE_RCP:
494 case SHADER_OPCODE_RSQ:
495 case SHADER_OPCODE_SQRT:
496 case SHADER_OPCODE_EXP2:
497 case SHADER_OPCODE_LOG2:
498 case SHADER_OPCODE_SIN:
499 case SHADER_OPCODE_COS:
500 case SHADER_OPCODE_POW:
501 case SHADER_OPCODE_INT_QUOTIENT:
502 case SHADER_OPCODE_INT_REMAINDER:
503 if (devinfo->gen >= 6) {
504 switch (info.op) {
505 case SHADER_OPCODE_RCP:
506 case SHADER_OPCODE_RSQ:
507 case SHADER_OPCODE_SQRT:
508 case SHADER_OPCODE_EXP2:
509 case SHADER_OPCODE_LOG2:
510 case SHADER_OPCODE_SIN:
511 case SHADER_OPCODE_COS:
512 if (devinfo->gen >= 8)
513 return calculate_desc(info, unit_em, -2, 4, 0, 0, 4,
514 0, 16, 0, 0, 0, 0);
515 else if (devinfo->is_haswell)
516 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
517 0, 12, 0, 0, 0, 0);
518 else
519 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2,
520 0, 14, 0, 0, 0, 0);
521
522 case SHADER_OPCODE_POW:
523 if (devinfo->gen >= 8)
524 return calculate_desc(info, unit_em, -2, 4, 0, 0, 8,
525 0, 24, 0, 0, 0, 0);
526 else if (devinfo->is_haswell)
527 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
528 0, 20, 0, 0, 0, 0);
529 else
530 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4,
531 0, 22, 0, 0, 0, 0);
532
533 case SHADER_OPCODE_INT_QUOTIENT:
534 case SHADER_OPCODE_INT_REMAINDER:
535 return calculate_desc(info, unit_em, 2, 0, 0, 26, 0,
536 0, 28 /* XXX */, 0, 0, 0, 0);
537
538 default:
539 abort();
540 }
541 } else {
542 switch (info.op) {
543 case SHADER_OPCODE_RCP:
544 return calculate_desc(info, unit_em, 2, 0, 0, 0, 8,
545 0, 22, 0, 0, 0, 8);
546
547 case SHADER_OPCODE_RSQ:
548 return calculate_desc(info, unit_em, 2, 0, 0, 0, 16,
549 0, 44, 0, 0, 0, 8);
550
551 case SHADER_OPCODE_INT_QUOTIENT:
552 case SHADER_OPCODE_SQRT:
553 case SHADER_OPCODE_LOG2:
554 return calculate_desc(info, unit_em, 2, 0, 0, 0, 24,
555 0, 66, 0, 0, 0, 8);
556
557 case SHADER_OPCODE_INT_REMAINDER:
558 case SHADER_OPCODE_EXP2:
559 return calculate_desc(info, unit_em, 2, 0, 0, 0, 32,
560 0, 88, 0, 0, 0, 8);
561
562 case SHADER_OPCODE_SIN:
563 case SHADER_OPCODE_COS:
564 return calculate_desc(info, unit_em, 2, 0, 0, 0, 48,
565 0, 132, 0, 0, 0, 8);
566
567 case SHADER_OPCODE_POW:
568 return calculate_desc(info, unit_em, 2, 0, 0, 0, 64,
569 0, 176, 0, 0, 0, 8);
570
571 default:
572 abort();
573 }
574 }
575
576 case BRW_OPCODE_DO:
577 if (devinfo->gen >= 6)
578 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
579 0, 0, 0, 0, 0, 0);
580 else
581 return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0,
582 0, 0, 0, 0, 0, 0);
583
584 case BRW_OPCODE_IF:
585 case BRW_OPCODE_ELSE:
586 case BRW_OPCODE_ENDIF:
587 case BRW_OPCODE_WHILE:
588 case BRW_OPCODE_BREAK:
589 case BRW_OPCODE_CONTINUE:
590 case FS_OPCODE_DISCARD_JUMP:
591 if (devinfo->gen >= 8)
592 return calculate_desc(info, unit_null, 8, 0, 0, 0, 0,
593 0, 0, 0, 0, 0, 0);
594 else if (devinfo->is_haswell)
595 return calculate_desc(info, unit_null, 6, 0, 0, 0, 0,
596 0, 0, 0, 0, 0, 0);
597 else
598 return calculate_desc(info, unit_null, 2, 0, 0, 0, 0,
599 0, 0, 0, 0, 0, 0);
600
601 case FS_OPCODE_LINTERP:
602 if (devinfo->gen >= 8)
603 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4,
604 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
605 else if (devinfo->is_haswell)
606 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
607 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
608 else
609 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
610 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
611
612 case BRW_OPCODE_LRP:
613 if (devinfo->gen >= 8)
614 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4,
615 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
616 else if (devinfo->is_haswell)
617 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
618 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
619 else if (devinfo->gen >= 6)
620 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2,
621 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
622 else
623 abort();
624
625 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
626 if (devinfo->gen >= 11)
627 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
628 0, 10 /* XXX */, 6 /* XXX */,
629 14 /* XXX */, 0, 0);
630 else if (devinfo->gen >= 8)
631 return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6,
632 0, 8 /* XXX */, 4 /* XXX */,
633 12 /* XXX */, 0, 0);
634 else if (devinfo->is_haswell)
635 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6,
636 0, 10 /* XXX */, 6 /* XXX */,
637 16 /* XXX */, 0, 0);
638 else if (devinfo->gen >= 7)
639 return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6,
640 0, 12 /* XXX */, 8 /* XXX */,
641 18 /* XXX */, 0, 0);
642 else
643 abort();
644
645 case SHADER_OPCODE_MOV_INDIRECT:
646 if (devinfo->gen >= 11)
647 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
648 0, 10 /* XXX */, 6 /* XXX */,
649 14 /* XXX */, 0, 0);
650 else if (devinfo->gen >= 8)
651 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
652 0, 8 /* XXX */, 4 /* XXX */,
653 12 /* XXX */, 0, 0);
654 else if (devinfo->is_haswell)
655 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
656 0, 10 /* XXX */, 6 /* XXX */,
657 16 /* XXX */, 0, 0);
658 else
659 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0,
660 0, 12 /* XXX */, 8 /* XXX */,
661 18 /* XXX */, 0, 0);
662
663 case SHADER_OPCODE_BROADCAST:
664 if (devinfo->gen >= 11)
665 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0,
666 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
667 else if (devinfo->gen >= 8)
668 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
669 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
670 else if (devinfo->is_haswell)
671 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0,
672 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
673 else if (devinfo->gen >= 7)
674 return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0,
675 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
676 else
677 abort();
678
679 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
680 if (devinfo->gen >= 11)
681 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
682 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
683 else if (devinfo->gen >= 8)
684 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0,
685 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
686 else if (devinfo->is_haswell)
687 return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0,
688 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
689 else if (devinfo->gen >= 7)
690 return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0,
691 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
692 else
693 abort();
694
695 case SHADER_OPCODE_RND_MODE:
696 case SHADER_OPCODE_FLOAT_CONTROL_MODE:
697 if (devinfo->gen >= 11)
698 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
699 4 /* XXX */, 0,
700 0, 0, 0, 0, 0, 0);
701 else if (devinfo->gen >= 8)
702 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0,
703 4 /* XXX */, 0,
704 0, 0, 0, 0, 0, 0);
705 else if (devinfo->is_haswell)
706 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0,
707 4 /* XXX */, 0,
708 0, 0, 0, 0, 0, 0);
709 else if (devinfo->gen >= 6)
710 return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0,
711 4 /* XXX */, 0,
712 0, 0, 0, 0, 0, 0);
713 else
714 abort();
715
716 case SHADER_OPCODE_SHUFFLE:
717 if (devinfo->gen >= 11)
718 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
719 44 /* XXX */, 0,
720 0, 10 /* XXX */, 6 /* XXX */,
721 14 /* XXX */, 0, 0);
722 else if (devinfo->gen >= 8)
723 return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0,
724 42 /* XXX */, 0,
725 0, 8 /* XXX */, 4 /* XXX */,
726 12 /* XXX */, 0, 0);
727 else if (devinfo->is_haswell)
728 return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0,
729 0, 44 /* XXX */,
730 0, 10 /* XXX */, 6 /* XXX */,
731 16 /* XXX */, 0, 0);
732 else if (devinfo->gen >= 6)
733 return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0,
734 0, 46 /* XXX */,
735 0, 12 /* XXX */, 8 /* XXX */,
736 18 /* XXX */, 0, 0);
737 else
738 abort();
739
740 case SHADER_OPCODE_SEL_EXEC:
741 if (devinfo->gen >= 11)
742 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
743 0, 4 /* XXX */,
744 0, 10 /* XXX */, 6 /* XXX */,
745 14 /* XXX */, 0, 0);
746 else if (devinfo->gen >= 8)
747 return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0,
748 0, 4 /* XXX */,
749 0, 8 /* XXX */, 4 /* XXX */,
750 12 /* XXX */, 0, 0);
751 else if (devinfo->is_haswell)
752 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0,
753 0, 4 /* XXX */,
754 0, 10 /* XXX */, 6 /* XXX */,
755 16 /* XXX */, 0, 0);
756 else
757 return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0,
758 0, 4 /* XXX */,
759 0, 12 /* XXX */, 8 /* XXX */,
760 18 /* XXX */, 0, 0);
761
762 case SHADER_OPCODE_QUAD_SWIZZLE:
763 if (devinfo->gen >= 11)
764 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
765 0, 8 /* XXX */,
766 0, 10 /* XXX */, 6 /* XXX */,
767 14 /* XXX */, 0, 0);
768 else if (devinfo->gen >= 8)
769 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
770 0, 8 /* XXX */,
771 0, 8 /* XXX */, 4 /* XXX */,
772 12 /* XXX */, 0, 0);
773 else if (devinfo->is_haswell)
774 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
775 0, 8 /* XXX */,
776 0, 10 /* XXX */, 6 /* XXX */,
777 16 /* XXX */, 0, 0);
778 else
779 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0,
780 0, 8 /* XXX */,
781 0, 12 /* XXX */, 8 /* XXX */,
782 18 /* XXX */, 0, 0);
783
784 case FS_OPCODE_DDY_FINE:
785 if (devinfo->gen >= 11)
786 return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4,
787 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
788 else if (devinfo->gen >= 8)
789 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
790 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
791 else if (devinfo->is_haswell)
792 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
793 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
794 else
795 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2,
796 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
797
798 case FS_OPCODE_LOAD_LIVE_CHANNELS:
799 if (devinfo->gen >= 11)
800 return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0,
801 2 /* XXX */, 0,
802 0, 0, 0, 10 /* XXX */, 0, 0);
803 else if (devinfo->gen >= 8)
804 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0,
805 0, 2 /* XXX */,
806 0, 0, 0, 8 /* XXX */, 0, 0);
807 else
808 abort();
809
810 case VEC4_OPCODE_PACK_BYTES:
811 if (devinfo->gen >= 8)
812 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
813 4 /* XXX */, 0,
814 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
815 0, 0);
816 else if (devinfo->is_haswell)
817 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
818 4 /* XXX */, 0,
819 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
820 0, 0);
821 else
822 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0,
823 4 /* XXX */, 0,
824 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
825 0, 0);
826
827 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
828 if (devinfo->gen >= 8)
829 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
830 4 /* XXX */, 0,
831 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
832 0, 0);
833 else
834 abort();
835
836 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
837 case TCS_OPCODE_GET_INSTANCE_ID:
838 case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
839 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
840 case TES_OPCODE_CREATE_INPUT_READ_HEADER:
841 if (devinfo->gen >= 8)
842 return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0,
843 6 /* XXX */, 0,
844 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
845 0, 0);
846 else if (devinfo->is_haswell)
847 return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0,
848 6 /* XXX */, 0,
849 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
850 0, 0);
851 else
852 return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0,
853 6 /* XXX */, 0,
854 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
855 0, 0);
856
857 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
858 case TCS_OPCODE_CREATE_BARRIER_HEADER:
859 if (devinfo->gen >= 8)
860 return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0,
861 8 /* XXX */, 0,
862 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
863 0, 0);
864 else if (devinfo->is_haswell)
865 return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0,
866 8 /* XXX */, 0,
867 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
868 0, 0);
869 else if (devinfo->gen >= 6)
870 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0,
871 8 /* XXX */, 0,
872 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
873 0, 0);
874 else
875 abort();
876
877 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
878 if (devinfo->gen >= 8)
879 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0,
880 4 /* XXX */, 0,
881 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
882 0, 0);
883 else if (devinfo->is_haswell)
884 return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0,
885 4 /* XXX */, 0,
886 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
887 0, 0);
888 else if (devinfo->gen >= 7)
889 return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0,
890 4 /* XXX */, 0,
891 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
892 0, 0);
893 else
894 abort();
895
896 case SHADER_OPCODE_TEX:
897 case FS_OPCODE_TXB:
898 case SHADER_OPCODE_TXD:
899 case SHADER_OPCODE_TXF:
900 case SHADER_OPCODE_TXF_LZ:
901 case SHADER_OPCODE_TXL:
902 case SHADER_OPCODE_TXL_LZ:
903 case SHADER_OPCODE_TXF_CMS:
904 case SHADER_OPCODE_TXF_CMS_W:
905 case SHADER_OPCODE_TXF_UMS:
906 case SHADER_OPCODE_TXF_MCS:
907 case SHADER_OPCODE_TXS:
908 case SHADER_OPCODE_LOD:
909 case SHADER_OPCODE_GET_BUFFER_SIZE:
910 case SHADER_OPCODE_TG4:
911 case SHADER_OPCODE_TG4_OFFSET:
912 case SHADER_OPCODE_SAMPLEINFO:
913 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
914 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */,
915 8 /* XXX */, 750 /* XXX */, 0, 0,
916 2 /* XXX */, 0);
917
918 case SHADER_OPCODE_URB_READ_SIMD8:
919 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
920 case SHADER_OPCODE_URB_WRITE_SIMD8:
921 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
922 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
923 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
924 case VEC4_OPCODE_URB_READ:
925 case VS_OPCODE_URB_WRITE:
926 case GS_OPCODE_URB_WRITE:
927 case GS_OPCODE_URB_WRITE_ALLOCATE:
928 case GS_OPCODE_THREAD_END:
929 case GS_OPCODE_FF_SYNC:
930 case TCS_OPCODE_URB_WRITE:
931 case TCS_OPCODE_RELEASE_INPUT:
932 case TCS_OPCODE_THREAD_END:
933 return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */,
934 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
935
936 case SHADER_OPCODE_MEMORY_FENCE:
937 case SHADER_OPCODE_INTERLOCK:
938 switch (info.sfid) {
939 case GEN6_SFID_DATAPORT_RENDER_CACHE:
940 if (devinfo->gen >= 7)
941 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0,
942 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
943 else
944 abort();
945
946 case GEN7_SFID_DATAPORT_DATA_CACHE:
947 case HSW_SFID_DATAPORT_DATA_CACHE_1:
948 if (devinfo->gen >= 7)
949 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0,
950 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
951 else
952 abort();
953
954 default:
955 abort();
956 }
957
958 case SHADER_OPCODE_GEN4_SCRATCH_READ:
959 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
960 case SHADER_OPCODE_GEN7_SCRATCH_READ:
961 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */,
962 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
963
964 case VEC4_OPCODE_UNTYPED_ATOMIC:
965 if (devinfo->gen >= 7)
966 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
967 30 /* XXX */, 400 /* XXX */,
968 10 /* XXX */, 100 /* XXX */, 0, 0,
969 0, 400 /* XXX */);
970 else
971 abort();
972
973 case VEC4_OPCODE_UNTYPED_SURFACE_READ:
974 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
975 if (devinfo->gen >= 7)
976 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
977 0, 20 /* XXX */,
978 10 /* XXX */, 100 /* XXX */, 0, 0,
979 0, 0);
980 else
981 abort();
982
983 case FS_OPCODE_FB_WRITE:
984 case FS_OPCODE_FB_READ:
985 case FS_OPCODE_REP_FB_WRITE:
986 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */,
987 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
988
989 case GS_OPCODE_SVB_WRITE:
990 if (devinfo->gen >= 6)
991 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
992 0, 450 /* XXX */,
993 10 /* XXX */, 300 /* XXX */, 0, 0,
994 0, 0);
995 else
996 abort();
997
998 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
999 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1000 return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */,
1001 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1002
1003 case VS_OPCODE_PULL_CONSTANT_LOAD:
1004 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
1005 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1006 8, 750, 0, 0, 2, 0);
1007
1008 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1009 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1010 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1011 if (devinfo->gen >= 7)
1012 return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0,
1013 0, 90 /* XXX */, 0, 0, 0, 0);
1014 else
1015 abort();
1016
1017 case SHADER_OPCODE_BARRIER:
1018 if (devinfo->gen >= 7)
1019 return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0,
1020 0 /* XXX */, 0,
1021 0, 0, 0, 0, 0, 0);
1022 else
1023 abort();
1024
1025 case CS_OPCODE_CS_TERMINATE:
1026 if (devinfo->gen >= 7)
1027 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0,
1028 10 /* XXX */, 0, 0, 0, 0, 0);
1029 else
1030 abort();
1031
1032 case SHADER_OPCODE_SEND:
1033 switch (info.sfid) {
1034 case GEN6_SFID_DATAPORT_RENDER_CACHE:
1035 if (devinfo->gen >= 7) {
1036 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1037 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP:
1038 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1039 30 /* XXX */, 450 /* XXX */,
1040 10 /* XXX */, 100 /* XXX */,
1041 0, 0, 0, 400 /* XXX */);
1042 default:
1043 return calculate_desc(info, unit_dp_rc, 2, 0, 0,
1044 0, 450 /* XXX */,
1045 10 /* XXX */, 300 /* XXX */, 0, 0,
1046 0, 0);
1047 }
1048 } else if (devinfo->gen >= 6) {
1049 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0,
1050 0, 450 /* XXX */,
1051 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1052 } else {
1053 abort();
1054 }
1055 case BRW_SFID_SAMPLER: {
1056 if (devinfo->gen >= 6)
1057 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16,
1058 8, 750, 0, 0, 2, 0);
1059 else
1060 abort();
1061 }
1062 case GEN7_SFID_DATAPORT_DATA_CACHE:
1063 case HSW_SFID_DATAPORT_DATA_CACHE_1:
1064 if (devinfo->gen >= 8 || devinfo->is_haswell) {
1065 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1066 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
1067 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
1068 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
1069 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
1070 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1071 30 /* XXX */, 400 /* XXX */,
1072 10 /* XXX */, 100 /* XXX */, 0, 0,
1073 0, 400 /* XXX */);
1074
1075 default:
1076 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1077 0, 20 /* XXX */,
1078 10 /* XXX */, 100 /* XXX */, 0, 0,
1079 0, 0);
1080 }
1081 } else if (devinfo->gen >= 7) {
1082 switch (brw_dp_desc_msg_type(devinfo, info.desc)) {
1083 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
1084 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1085 30 /* XXX */, 400 /* XXX */,
1086 10 /* XXX */, 100 /* XXX */,
1087 0, 0, 0, 400 /* XXX */);
1088 default:
1089 return calculate_desc(info, unit_dp_dc, 2, 0, 0,
1090 0, 20 /* XXX */,
1091 10 /* XXX */, 100 /* XXX */, 0, 0,
1092 0, 0);
1093 }
1094 } else {
1095 abort();
1096 }
1097 default:
1098 abort();
1099 }
1100
1101 case SHADER_OPCODE_UNDEF:
1102 case FS_OPCODE_PLACEHOLDER_HALT:
1103 case FS_OPCODE_SCHEDULING_FENCE:
1104 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0,
1105 0, 0, 0, 0, 0, 0);
1106
1107 default:
1108 abort();
1109 }
1110 }
1111
1112 /**
1113 * Model the performance behavior of a stall on the specified dependency
1114 * ID.
1115 */
1116 void
1117 stall_on_dependency(state &st, dependency_id id)
1118 {
1119 if (id < ARRAY_SIZE(st.dep_ready))
1120 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1121 st.dep_ready[id]);
1122 }
1123
1124 /**
1125 * Model the performance behavior of the front-end and back-end while
1126 * executing an instruction with the specified timing information, assuming
1127 * all dependencies are already clear.
1128 */
1129 void
1130 execute_instruction(state &st, const perf_desc &perf)
1131 {
1132 /* Compute the time at which the front-end will be ready to execute the
1133 * next instruction.
1134 */
1135 st.unit_ready[unit_fe] += perf.df;
1136
1137 if (perf.u < num_units) {
1138 /* Wait for the back-end to be ready to execute this instruction. */
1139 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
1140 st.unit_ready[perf.u]);
1141
1142 /* Compute the time at which the back-end will be ready to execute
1143 * the next instruction, and update the back-end utilization.
1144 */
1145 st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;
1146 st.unit_busy[perf.u] += perf.db * st.weight;
1147 }
1148 }
1149
1150 /**
1151 * Model the performance behavior of a read dependency provided by an
1152 * instruction.
1153 */
1154 void
1155 mark_read_dependency(state &st, const perf_desc &perf, dependency_id id)
1156 {
1157 if (id < ARRAY_SIZE(st.dep_ready))
1158 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls;
1159 }
1160
1161 /**
1162 * Model the performance behavior of a write dependency provided by an
1163 * instruction.
1164 */
1165 void
1166 mark_write_dependency(state &st, const perf_desc &perf, dependency_id id)
1167 {
1168 if (id >= dependency_id_accum0 && id < dependency_id_flag0)
1169 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la;
1170 else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0)
1171 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf;
1172 else if (id < ARRAY_SIZE(st.dep_ready))
1173 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld;
1174 }
1175
1176 /**
1177 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1178 */
1179 dependency_id
1180 reg_dependency_id(const gen_device_info *devinfo, const backend_reg &r,
1181 const int delta)
1182 {
1183 if (r.file == VGRF) {
1184 const unsigned i = r.nr + r.offset / REG_SIZE + delta;
1185 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1186 return dependency_id(dependency_id_grf0 + i);
1187
1188 } else if (r.file == FIXED_GRF) {
1189 const unsigned i = r.nr + delta;
1190 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1191 return dependency_id(dependency_id_grf0 + i);
1192
1193 } else if (r.file == MRF && devinfo->gen >= 7) {
1194 const unsigned i = GEN7_MRF_HACK_START +
1195 r.nr + r.offset / REG_SIZE + delta;
1196 assert(i < dependency_id_mrf0 - dependency_id_grf0);
1197 return dependency_id(dependency_id_grf0 + i);
1198
1199 } else if (r.file == MRF && devinfo->gen < 7) {
1200 const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
1201 r.offset / REG_SIZE + delta;
1202 assert(i < dependency_id_addr0 - dependency_id_mrf0);
1203 return dependency_id(dependency_id_mrf0 + i);
1204
1205 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
1206 r.nr < BRW_ARF_ACCUMULATOR) {
1207 assert(delta == 0);
1208 return dependency_id_addr0;
1209
1210 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
1211 r.nr < BRW_ARF_FLAG) {
1212 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
1213 assert(i < dependency_id_flag0 - dependency_id_accum0);
1214 return dependency_id(dependency_id_accum0 + i);
1215
1216 } else {
1217 return num_dependency_ids;
1218 }
1219 }
1220
1221 /**
1222 * Return the dependency ID of flag register starting at offset \p i.
1223 */
1224 dependency_id
1225 flag_dependency_id(unsigned i)
1226 {
1227 assert(i < dependency_id_sbid_wr0 - dependency_id_flag0);
1228 return dependency_id(dependency_id_flag0 + i);
1229 }
1230
1231 /**
1232 * Return the dependency ID corresponding to the SBID read completion
1233 * condition of a Gen12+ SWSB.
1234 */
1235 dependency_id
1236 tgl_swsb_rd_dependency_id(tgl_swsb swsb)
1237 {
1238 if (swsb.mode) {
1239 assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0);
1240 return dependency_id(dependency_id_sbid_rd0 + swsb.sbid);
1241 } else {
1242 return num_dependency_ids;
1243 }
1244 }
1245
1246 /**
1247 * Return the dependency ID corresponding to the SBID write completion
1248 * condition of a Gen12+ SWSB.
1249 */
1250 dependency_id
1251 tgl_swsb_wr_dependency_id(tgl_swsb swsb)
1252 {
1253 if (swsb.mode) {
1254 assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0);
1255 return dependency_id(dependency_id_sbid_wr0 + swsb.sbid);
1256 } else {
1257 return num_dependency_ids;
1258 }
1259 }
1260
1261 /**
1262 * Return the implicit accumulator register accessed by channel \p i of the
1263 * instruction.
1264 */
1265 unsigned
1266 accum_reg_of_channel(const gen_device_info *devinfo,
1267 const backend_instruction *inst,
1268 brw_reg_type tx, unsigned i)
1269 {
1270 assert(inst->reads_accumulator_implicitly() ||
1271 inst->writes_accumulator_implicitly(devinfo));
1272 const unsigned offset = (inst->group + i) * type_sz(tx) *
1273 (devinfo->gen < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
1274 return offset / REG_SIZE % 2;
1275 }
1276
1277 /**
1278 * Model the performance behavior of an FS back-end instruction.
1279 */
1280 void
1281 issue_fs_inst(state &st, const gen_device_info *devinfo,
1282 const backend_instruction *be_inst)
1283 {
1284 const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
1285 const instruction_info info(devinfo, inst);
1286 const perf_desc perf = instruction_desc(info);
1287
1288 /* Stall on any source dependencies. */
1289 for (unsigned i = 0; i < inst->sources; i++) {
1290 for (unsigned j = 0; j < regs_read(inst, i); j++)
1291 stall_on_dependency(
1292 st, reg_dependency_id(devinfo, inst->src[i], j));
1293 }
1294
1295 if (inst->reads_accumulator_implicitly()) {
1296 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1297 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1298 inst->exec_size - 1); j++)
1299 stall_on_dependency(
1300 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1301 }
1302
1303 if (is_send(inst) && inst->base_mrf != -1) {
1304 for (unsigned j = 0; j < inst->mlen; j++)
1305 stall_on_dependency(
1306 st, reg_dependency_id(
1307 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1308 }
1309
1310 if (const unsigned mask = inst->flags_read(devinfo)) {
1311 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1312 if (mask & (1 << i))
1313 stall_on_dependency(st, flag_dependency_id(i));
1314 }
1315 }
1316
1317 /* Stall on any write dependencies. */
1318 if (!inst->no_dd_check) {
1319 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1320 for (unsigned j = 0; j < regs_written(inst); j++)
1321 stall_on_dependency(
1322 st, reg_dependency_id(devinfo, inst->dst, j));
1323 }
1324
1325 if (inst->writes_accumulator_implicitly(devinfo)) {
1326 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1327 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1328 inst->exec_size - 1); j++)
1329 stall_on_dependency(
1330 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1331 }
1332
1333 if (const unsigned mask = inst->flags_written()) {
1334 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1335 if (mask & (1 << i))
1336 stall_on_dependency(st, flag_dependency_id(i));
1337 }
1338 }
1339 }
1340
1341 /* Stall on any SBID dependencies. */
1342 if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
1343 stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
1344 else if (inst->sched.mode & TGL_SBID_SRC)
1345 stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));
1346
1347 /* Execute the instruction. */
1348 execute_instruction(st, perf);
1349
1350 /* Mark any source dependencies. */
1351 if (inst->is_send_from_grf()) {
1352 for (unsigned i = 0; i < inst->sources; i++) {
1353 if (inst->is_payload(i)) {
1354 for (unsigned j = 0; j < regs_read(inst, i); j++)
1355 mark_read_dependency(
1356 st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1357 }
1358 }
1359 }
1360
1361 if (is_send(inst) && inst->base_mrf != -1) {
1362 for (unsigned j = 0; j < inst->mlen; j++)
1363 mark_read_dependency(st, perf,
1364 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1365 }
1366
1367 /* Mark any destination dependencies. */
1368 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1369 for (unsigned j = 0; j < regs_written(inst); j++) {
1370 mark_write_dependency(st, perf,
1371 reg_dependency_id(devinfo, inst->dst, j));
1372 }
1373 }
1374
1375 if (inst->writes_accumulator_implicitly(devinfo)) {
1376 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1377 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1378 inst->exec_size - 1); j++)
1379 mark_write_dependency(st, perf,
1380 reg_dependency_id(devinfo, brw_acc_reg(8), j));
1381 }
1382
1383 if (const unsigned mask = inst->flags_written()) {
1384 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
1385 if (mask & (1 << i))
1386 mark_write_dependency(st, perf, flag_dependency_id(i));
1387 }
1388 }
1389
1390 /* Mark any SBID dependencies. */
1391 if (inst->sched.mode & TGL_SBID_SET) {
1392 mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
1393 mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
1394 }
1395 }
1396
1397 /**
1398 * Model the performance behavior of a VEC4 back-end instruction.
1399 */
1400 void
1401 issue_vec4_instruction(state &st, const gen_device_info *devinfo,
1402 const backend_instruction *be_inst)
1403 {
1404 const vec4_instruction *inst =
1405 static_cast<const vec4_instruction *>(be_inst);
1406 const instruction_info info(devinfo, inst);
1407 const perf_desc perf = instruction_desc(info);
1408
1409 /* Stall on any source dependencies. */
1410 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1411 for (unsigned j = 0; j < regs_read(inst, i); j++)
1412 stall_on_dependency(
1413 st, reg_dependency_id(devinfo, inst->src[i], j));
1414 }
1415
1416 if (inst->reads_accumulator_implicitly()) {
1417 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1418 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1419 inst->exec_size - 1); j++)
1420 stall_on_dependency(
1421 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1422 }
1423
1424 if (inst->base_mrf != -1) {
1425 for (unsigned j = 0; j < inst->mlen; j++)
1426 stall_on_dependency(
1427 st, reg_dependency_id(
1428 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1429 }
1430
1431 if (inst->reads_flag())
1432 stall_on_dependency(st, dependency_id_flag0);
1433
1434 /* Stall on any write dependencies. */
1435 if (!inst->no_dd_check) {
1436 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1437 for (unsigned j = 0; j < regs_written(inst); j++)
1438 stall_on_dependency(
1439 st, reg_dependency_id(devinfo, inst->dst, j));
1440 }
1441
1442 if (inst->writes_accumulator_implicitly(devinfo)) {
1443 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1444 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1445 inst->exec_size - 1); j++)
1446 stall_on_dependency(
1447 st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
1448 }
1449
1450 if (inst->writes_flag())
1451 stall_on_dependency(st, dependency_id_flag0);
1452 }
1453
1454 /* Execute the instruction. */
1455 execute_instruction(st, perf);
1456
1457 /* Mark any source dependencies. */
1458 if (inst->is_send_from_grf()) {
1459 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
1460 for (unsigned j = 0; j < regs_read(inst, i); j++)
1461 mark_read_dependency(
1462 st, perf, reg_dependency_id(devinfo, inst->src[i], j));
1463 }
1464 }
1465
1466 if (inst->base_mrf != -1) {
1467 for (unsigned j = 0; j < inst->mlen; j++)
1468 mark_read_dependency(st, perf,
1469 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
1470 }
1471
1472 /* Mark any destination dependencies. */
1473 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
1474 for (unsigned j = 0; j < regs_written(inst); j++) {
1475 mark_write_dependency(st, perf,
1476 reg_dependency_id(devinfo, inst->dst, j));
1477 }
1478 }
1479
1480 if (inst->writes_accumulator_implicitly(devinfo)) {
1481 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
1482 j <= accum_reg_of_channel(devinfo, inst, info.tx,
1483 inst->exec_size - 1); j++)
1484 mark_write_dependency(st, perf,
1485 reg_dependency_id(devinfo, brw_acc_reg(8), j));
1486 }
1487
1488 if (inst->writes_flag())
1489 mark_write_dependency(st, perf, dependency_id_flag0);
1490 }
1491
1492 /**
1493 * Calculate the maximum possible throughput of the program compatible with
1494 * the cycle-count utilization estimated for each asynchronous unit, in
1495 * threads-per-cycle units.
1496 */
1497 float
1498 calculate_thread_throughput(const state &st, float busy)
1499 {
1500 for (unsigned i = 0; i < num_units; i++)
1501 busy = MAX2(busy, st.unit_busy[i]);
1502
1503 return 1.0 / busy;
1504 }
1505
1506 /**
1507 * Estimate the performance of the specified shader.
1508 */
1509 void
1510 calculate_performance(performance &p, const backend_shader *s,
1511 void (*issue_instruction)(
1512 state &, const gen_device_info *,
1513 const backend_instruction *),
1514 unsigned dispatch_width)
1515 {
1516 /* XXX - Plumbing the trip counts from NIR loop analysis would allow us
1517 * to do a better job regarding the loop weights. And some branch
1518 * divergence analysis would allow us to do a better job with
1519 * branching weights.
1520 *
1521 * In the meantime use values that roughly match the control flow
1522 * weights used elsewhere in the compiler back-end -- Main
1523 * difference is the worst-case scenario branch_weight used for
1524 * SIMD32 which accounts for the possibility of a dynamically
1525 * uniform branch becoming divergent in SIMD32.
1526 *
1527 * Note that we provide slightly more pessimistic weights on
1528 * Gen12+ for SIMD32, since the effective warp size on that
1529 * platform is 2x the SIMD width due to EU fusion, which increases
1530 * the likelihood of divergent control flow in comparison to
1531 * previous generations, giving narrower SIMD modes a performance
1532 * advantage in several test-cases with non-uniform discard jumps.
1533 */
1534 const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
1535 const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
1536 1.0 : 0.5);
1537 const float loop_weight = 10;
1538 unsigned discard_count = 0;
1539 unsigned elapsed = 0;
1540 state st;
1541
1542 foreach_block(block, s->cfg) {
1543 const unsigned elapsed0 = elapsed;
1544
1545 foreach_inst_in_block(backend_instruction, inst, block) {
1546 const unsigned clock0 = st.unit_ready[unit_fe];
1547
1548 issue_instruction(st, s->devinfo, inst);
1549
1550 if (inst->opcode == BRW_OPCODE_ENDIF)
1551 st.weight /= branch_weight;
1552 else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
1553 st.weight /= discard_weight;
1554
1555 elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;
1556
1557 if (inst->opcode == BRW_OPCODE_IF)
1558 st.weight *= branch_weight;
1559 else if (inst->opcode == BRW_OPCODE_DO)
1560 st.weight *= loop_weight;
1561 else if (inst->opcode == BRW_OPCODE_WHILE)
1562 st.weight /= loop_weight;
1563 else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
1564 st.weight *= discard_weight;
1565 }
1566
1567 p.block_latency[block->num] = elapsed - elapsed0;
1568 }
1569
1570 p.latency = elapsed;
1571 p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
1572 }
1573 }
1574
/**
 * Estimate the performance of the specified scalar (FS) shader.  One
 * latency slot is allocated per basic block of the program's CFG.
 */
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
1580
/**
 * Estimate the performance of the specified vec4 shader.  The vec4
 * back-end always executes 8-wide, hence the fixed dispatch width.
 */
brw::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
1586
brw::performance::~performance()
{
   /* block_latency is a raw owning array allocated by the constructors.
    * NOTE(review): with a raw owning pointer, copy construction/assignment
    * would double-free -- presumably they are disallowed in the class
    * declaration; verify in the header.
    */
   delete[] block_latency;
}