/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
/**
 * Enumeration representing the various asynchronous units that can run
 * computations in parallel on behalf of a shader thread.
 *
 * NOTE(review): the enumerator list was reconstructed from the unit names
 * referenced later in this file (unit_fe, unit_fpu, ..., unit_null) and the
 * surviving per-enumerator comments — confirm the ordering against upstream.
 */
enum unit {
   /** EU front-end. */
   unit_fe,
   /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
   unit_fpu,
   /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
   unit_em,
   /** Sampler shared function. */
   unit_sampler,
   /** Pixel Interpolator shared function. */
   unit_pi,
   /** Unified Return Buffer shared function. */
   unit_urb,
   /** Data Port Data Cache shared function. */
   unit_dp_dc,
   /** Data Port Render Cache shared function. */
   unit_dp_rc,
   /** Data Port Constant Cache shared function. */
   unit_dp_cc,
   /** Message Gateway shared function. */
   unit_gateway,
   /** Thread Spawner shared function. */
   unit_spawner,
   /** Number of asynchronous units currently tracked. */
   num_units,
   /** Dummy unit for instructions that don't consume runtime from the above. */
   unit_null = num_units
};
68 * Enumeration representing a computation result another computation can
69 * potentially depend on.
72 /* Register part of the GRF. */
73 dependency_id_grf0
= 0,
74 /* Register part of the MRF. Only used on Gen4-6. */
75 dependency_id_mrf0
= dependency_id_grf0
+ BRW_MAX_GRF
,
76 /* Address register part of the ARF. */
77 dependency_id_addr0
= dependency_id_mrf0
+ 24,
78 /* Accumulator register part of the ARF. */
79 dependency_id_accum0
= dependency_id_addr0
+ 1,
80 /* Flag register part of the ARF. */
81 dependency_id_flag0
= dependency_id_accum0
+ 12,
82 /* SBID token write completion. Only used on Gen12+. */
83 dependency_id_sbid_wr0
= dependency_id_flag0
+ 8,
84 /* SBID token read completion. Only used on Gen12+. */
85 dependency_id_sbid_rd0
= dependency_id_sbid_wr0
+ 16,
86 /* Number of computation dependencies currently tracked. */
87 num_dependency_ids
= dependency_id_sbid_rd0
+ 16
91 * State of our modeling of the program execution.
94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
99 unsigned unit_ready
[num_units
];
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
104 unsigned dep_ready
[num_dependency_ids
];
106 * Aggregated utilization of a given unit excluding idle cycles,
109 float unit_busy
[num_units
];
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
122 struct instruction_info
{
123 instruction_info(const gen_device_info
*devinfo
, const fs_inst
*inst
) :
124 devinfo(devinfo
), op(inst
->opcode
),
125 td(inst
->dst
.type
), sd(DIV_ROUND_UP(inst
->size_written
, REG_SIZE
)),
126 tx(get_exec_type(inst
)), sx(0), ss(0),
127 sc(has_bank_conflict(devinfo
, inst
) ? sd
: 0),
128 desc(inst
->desc
), sfid(inst
->sfid
)
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
133 if (inst
->opcode
== SHADER_OPCODE_SEND
) {
134 ss
= DIV_ROUND_UP(inst
->size_read(2), REG_SIZE
) +
135 DIV_ROUND_UP(inst
->size_read(3), REG_SIZE
);
137 for (unsigned i
= 0; i
< inst
->sources
; i
++)
138 ss
= MAX2(ss
, DIV_ROUND_UP(inst
->size_read(i
), REG_SIZE
));
141 /* Convert the execution size to GRF units. */
142 sx
= DIV_ROUND_UP(inst
->exec_size
* type_sz(tx
), REG_SIZE
);
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
147 if ((inst
->opcode
== BRW_OPCODE_MUL
|| inst
->opcode
== BRW_OPCODE_MAD
) &&
148 !brw_reg_type_is_floating_point(tx
) && type_sz(tx
) == 4 &&
149 type_sz(inst
->src
[0].type
) == type_sz(inst
->src
[1].type
))
150 tx
= brw_int_type(8, tx
== BRW_REGISTER_TYPE_D
);
153 instruction_info(const gen_device_info
*devinfo
,
154 const vec4_instruction
*inst
) :
155 devinfo(devinfo
), op(inst
->opcode
),
156 td(inst
->dst
.type
), sd(DIV_ROUND_UP(inst
->size_written
, REG_SIZE
)),
157 tx(get_exec_type(inst
)), sx(0), ss(0), sc(0),
158 desc(inst
->desc
), sfid(inst
->sfid
)
160 /* Compute the maximum source size. */
161 for (unsigned i
= 0; i
< ARRAY_SIZE(inst
->src
); i
++)
162 ss
= MAX2(ss
, DIV_ROUND_UP(inst
->size_read(i
), REG_SIZE
));
164 /* Convert the execution size to GRF units. */
165 sx
= DIV_ROUND_UP(inst
->exec_size
* type_sz(tx
), REG_SIZE
);
167 /* 32x32 integer multiplication has half the usual ALU throughput.
168 * Treat it as double-precision.
170 if ((inst
->opcode
== BRW_OPCODE_MUL
|| inst
->opcode
== BRW_OPCODE_MAD
) &&
171 !brw_reg_type_is_floating_point(tx
) && type_sz(tx
) == 4 &&
172 type_sz(inst
->src
[0].type
) == type_sz(inst
->src
[1].type
))
173 tx
= brw_int_type(8, tx
== BRW_REGISTER_TYPE_D
);
176 /** Device information. */
177 const struct gen_device_info
*devinfo
;
178 /** Instruction opcode. */
180 /** Destination type. */
182 /** Destination size in GRF units. */
184 /** Execution type. */
186 /** Execution size in GRF units. */
190 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
192 /** Send message descriptor. */
194 /** Send message shared function ID. */
199 * Timing information of an instruction used to estimate the performance of
203 perf_desc(unit u
, int df
, int db
, int ls
, int ld
, int la
, int lf
) :
204 u(u
), df(df
), db(db
), ls(ls
), ld(ld
), la(la
), lf(lf
) {}
207 * Back-end unit its runtime shall be accounted to, in addition to the
208 * EU front-end which is always assumed to be involved.
212 * Overhead cycles from the time that the EU front-end starts executing
213 * the instruction until it's ready to execute the next instruction.
217 * Overhead cycles from the time that the back-end starts executing the
218 * instruction until it's ready to execute the next instruction.
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its sources have been read from the register file.
227 * Latency cycles from the time that the back-end starts executing the
228 * instruction until its regular destination has been written to the
233 * Latency cycles from the time that the back-end starts executing the
234 * instruction until its accumulator destination has been written to the
237 * Note that this is an approximation of the real behavior of
238 * accumulating instructions in the hardware: Instead of modeling a pair
239 * of back-to-back accumulating instructions as a first computation with
240 * latency equal to ld followed by another computation with a
241 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242 * model the stall as if it occurred at the top of the pipeline, with
243 * the latency of the accumulator computation offset accordingly.
247 * Latency cycles from the time that the back-end starts executing the
248 * instruction until its flag destination has been written to the ARF
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
262 calculate_desc(const instruction_info
&info
, unit u
,
263 int df_1
, int df_sd
, int df_sc
,
265 int ls_1
, int ld_1
, int la_1
, int lf_1
,
268 return perf_desc(u
, df_1
+ df_sd
* int(info
.sd
) + df_sc
* int(info
.sc
),
269 db_1
+ db_sx
* int(info
.sx
),
270 ls_1
+ l_ss
* int(info
.ss
),
271 ld_1
+ l_ss
* int(info
.ss
) + l_sd
* int(info
.sd
),
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
290 instruction_desc(const instruction_info
&info
)
292 const struct gen_device_info
*devinfo
= info
.devinfo
;
295 case BRW_OPCODE_SYNC
:
305 case BRW_OPCODE_CMPN
:
306 case BRW_OPCODE_F16TO32
:
307 case BRW_OPCODE_BFREV
:
308 case BRW_OPCODE_BFI1
:
311 case BRW_OPCODE_RNDU
:
312 case BRW_OPCODE_RNDD
:
313 case BRW_OPCODE_RNDE
:
314 case BRW_OPCODE_RNDZ
:
316 case BRW_OPCODE_MACH
:
320 case BRW_OPCODE_CBIT
:
321 case BRW_OPCODE_ADDC
:
324 case BRW_OPCODE_SUBB
:
325 case BRW_OPCODE_SAD2
:
326 case BRW_OPCODE_SADA2
:
327 case BRW_OPCODE_LINE
:
329 case SHADER_OPCODE_CLUSTER_BROADCAST
:
330 case FS_OPCODE_DDX_COARSE
:
331 case FS_OPCODE_DDX_FINE
:
332 case FS_OPCODE_DDY_COARSE
:
333 case FS_OPCODE_PIXEL_X
:
334 case FS_OPCODE_PIXEL_Y
:
335 case FS_OPCODE_SET_SAMPLE_ID
:
336 case VEC4_OPCODE_MOV_BYTES
:
337 case VEC4_OPCODE_UNPACK_UNIFORM
:
338 case VEC4_OPCODE_DOUBLE_TO_F32
:
339 case VEC4_OPCODE_DOUBLE_TO_D32
:
340 case VEC4_OPCODE_DOUBLE_TO_U32
:
341 case VEC4_OPCODE_TO_DOUBLE
:
342 case VEC4_OPCODE_PICK_LOW_32BIT
:
343 case VEC4_OPCODE_PICK_HIGH_32BIT
:
344 case VEC4_OPCODE_SET_LOW_32BIT
:
345 case VEC4_OPCODE_SET_HIGH_32BIT
:
346 case GS_OPCODE_SET_DWORD_2
:
347 case GS_OPCODE_SET_WRITE_OFFSET
:
348 case GS_OPCODE_SET_VERTEX_COUNT
:
349 case GS_OPCODE_PREPARE_CHANNEL_MASKS
:
350 case GS_OPCODE_SET_CHANNEL_MASKS
:
351 case GS_OPCODE_GET_INSTANCE_ID
:
352 case GS_OPCODE_SET_PRIMITIVE_ID
:
353 case GS_OPCODE_SVB_SET_DST_INDEX
:
354 case TCS_OPCODE_SRC0_010_IS_ZERO
:
355 case TCS_OPCODE_GET_PRIMITIVE_ID
:
356 case TES_OPCODE_GET_PRIMITIVE_ID
:
357 if (devinfo
->gen
>= 11) {
358 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
359 0, 10, 6 /* XXX */, 14, 0, 0);
360 } else if (devinfo
->gen
>= 8) {
361 if (type_sz(info
.tx
) > 4)
362 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
363 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
365 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
367 } else if (devinfo
->is_haswell
) {
368 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
369 0, 10, 6 /* XXX */, 16, 0, 0);
371 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
372 0, 12, 8 /* XXX */, 18, 0, 0);
379 case SHADER_OPCODE_MOV_RELOC_IMM
:
380 if (devinfo
->gen
>= 11) {
381 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
383 } else if (devinfo
->gen
>= 8) {
384 if (type_sz(info
.tx
) > 4)
385 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
386 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
388 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
390 } else if (devinfo
->is_haswell
) {
391 if (info
.tx
== BRW_REGISTER_TYPE_F
)
392 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
393 0, 12, 8 /* XXX */, 18, 0, 0);
395 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
396 0, 10, 6 /* XXX */, 16, 0, 0);
397 } else if (devinfo
->gen
>= 7) {
398 if (info
.tx
== BRW_REGISTER_TYPE_F
)
399 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
400 0, 14, 10 /* XXX */, 20, 0, 0);
402 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
403 0, 12, 8 /* XXX */, 18, 0, 0);
405 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 0,
407 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
412 case BRW_OPCODE_BFI2
:
413 case BRW_OPCODE_CSEL
:
414 if (devinfo
->gen
>= 11)
415 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
416 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
417 else if (devinfo
->gen
>= 8)
418 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
419 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
420 else if (devinfo
->is_haswell
)
421 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
422 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
423 else if (devinfo
->gen
>= 7)
424 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
425 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
430 if (devinfo
->gen
>= 11) {
431 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
432 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
433 } else if (devinfo
->gen
>= 8) {
434 if (type_sz(info
.tx
) > 4)
435 return calculate_desc(info
, unit_fpu
, 0, 4, 1, 0, 4,
436 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
438 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
439 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
440 } else if (devinfo
->is_haswell
) {
441 if (info
.tx
== BRW_REGISTER_TYPE_F
)
442 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
443 0, 12, 8 /* XXX */, 18, 0, 0);
445 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
446 0, 10, 6 /* XXX */, 16, 0, 0);
447 } else if (devinfo
->gen
>= 7) {
448 if (info
.tx
== BRW_REGISTER_TYPE_F
)
449 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
450 0, 14, 10 /* XXX */, 20, 0, 0);
452 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
453 0, 12, 8 /* XXX */, 18, 0, 0);
454 } else if (devinfo
->gen
>= 6) {
455 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 1 /* XXX */,
457 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
463 case BRW_OPCODE_F32TO16
:
464 if (devinfo
->gen
>= 11)
465 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
466 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
467 else if (devinfo
->gen
>= 8)
468 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
469 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
470 else if (devinfo
->is_haswell
)
471 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
472 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
473 else if (devinfo
->gen
>= 7)
474 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
475 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
483 if (devinfo
->gen
>= 8)
484 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
485 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
486 else if (devinfo
->is_haswell
)
487 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
488 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
490 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
491 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
493 case SHADER_OPCODE_RCP
:
494 case SHADER_OPCODE_RSQ
:
495 case SHADER_OPCODE_SQRT
:
496 case SHADER_OPCODE_EXP2
:
497 case SHADER_OPCODE_LOG2
:
498 case SHADER_OPCODE_SIN
:
499 case SHADER_OPCODE_COS
:
500 case SHADER_OPCODE_POW
:
501 case SHADER_OPCODE_INT_QUOTIENT
:
502 case SHADER_OPCODE_INT_REMAINDER
:
503 if (devinfo
->gen
>= 6) {
505 case SHADER_OPCODE_RCP
:
506 case SHADER_OPCODE_RSQ
:
507 case SHADER_OPCODE_SQRT
:
508 case SHADER_OPCODE_EXP2
:
509 case SHADER_OPCODE_LOG2
:
510 case SHADER_OPCODE_SIN
:
511 case SHADER_OPCODE_COS
:
512 if (devinfo
->gen
>= 8)
513 return calculate_desc(info
, unit_em
, -2, 4, 0, 0, 4,
515 else if (devinfo
->is_haswell
)
516 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 2,
519 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 2,
522 case SHADER_OPCODE_POW
:
523 if (devinfo
->gen
>= 8)
524 return calculate_desc(info
, unit_em
, -2, 4, 0, 0, 8,
526 else if (devinfo
->is_haswell
)
527 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 4,
530 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 4,
533 case SHADER_OPCODE_INT_QUOTIENT
:
534 case SHADER_OPCODE_INT_REMAINDER
:
535 return calculate_desc(info
, unit_em
, 2, 0, 0, 26, 0,
536 0, 28 /* XXX */, 0, 0, 0, 0);
543 case SHADER_OPCODE_RCP
:
544 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 8,
547 case SHADER_OPCODE_RSQ
:
548 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 16,
551 case SHADER_OPCODE_INT_QUOTIENT
:
552 case SHADER_OPCODE_SQRT
:
553 case SHADER_OPCODE_LOG2
:
554 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 24,
557 case SHADER_OPCODE_INT_REMAINDER
:
558 case SHADER_OPCODE_EXP2
:
559 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 32,
562 case SHADER_OPCODE_SIN
:
563 case SHADER_OPCODE_COS
:
564 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 48,
567 case SHADER_OPCODE_POW
:
568 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 64,
577 if (devinfo
->gen
>= 6)
578 return calculate_desc(info
, unit_null
, 0, 0, 0, 0, 0,
581 return calculate_desc(info
, unit_null
, 2 /* XXX */, 0, 0, 0, 0,
585 case BRW_OPCODE_ELSE
:
586 case BRW_OPCODE_ENDIF
:
587 case BRW_OPCODE_WHILE
:
588 case BRW_OPCODE_BREAK
:
589 case BRW_OPCODE_CONTINUE
:
590 case FS_OPCODE_DISCARD_JUMP
:
591 if (devinfo
->gen
>= 8)
592 return calculate_desc(info
, unit_null
, 8, 0, 0, 0, 0,
594 else if (devinfo
->is_haswell
)
595 return calculate_desc(info
, unit_null
, 6, 0, 0, 0, 0,
598 return calculate_desc(info
, unit_null
, 2, 0, 0, 0, 0,
601 case FS_OPCODE_LINTERP
:
602 if (devinfo
->gen
>= 8)
603 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
604 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
605 else if (devinfo
->is_haswell
)
606 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
607 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
609 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
610 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
613 if (devinfo
->gen
>= 8)
614 return calculate_desc(info
, unit_fpu
, 0, 4, 1, 0, 4,
615 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
616 else if (devinfo
->is_haswell
)
617 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
618 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
619 else if (devinfo
->gen
>= 6)
620 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
621 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
625 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
626 if (devinfo
->gen
>= 11)
627 return calculate_desc(info
, unit_fpu
, 20, 6, 0, 0, 6,
628 0, 10 /* XXX */, 6 /* XXX */,
630 else if (devinfo
->gen
>= 8)
631 return calculate_desc(info
, unit_fpu
, 16, 6, 0, 0, 6,
632 0, 8 /* XXX */, 4 /* XXX */,
634 else if (devinfo
->is_haswell
)
635 return calculate_desc(info
, unit_fpu
, 20, 6, 0, 0, 6,
636 0, 10 /* XXX */, 6 /* XXX */,
638 else if (devinfo
->gen
>= 7)
639 return calculate_desc(info
, unit_fpu
, 24, 6, 0, 0, 6,
640 0, 12 /* XXX */, 8 /* XXX */,
645 case SHADER_OPCODE_MOV_INDIRECT
:
646 if (devinfo
->gen
>= 11)
647 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
648 0, 10 /* XXX */, 6 /* XXX */,
650 else if (devinfo
->gen
>= 8)
651 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
652 0, 8 /* XXX */, 4 /* XXX */,
654 else if (devinfo
->is_haswell
)
655 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
656 0, 10 /* XXX */, 6 /* XXX */,
659 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
660 0, 12 /* XXX */, 8 /* XXX */,
663 case SHADER_OPCODE_BROADCAST
:
664 if (devinfo
->gen
>= 11)
665 return calculate_desc(info
, unit_fpu
, 20 /* XXX */, 0, 0, 4, 0,
666 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
667 else if (devinfo
->gen
>= 8)
668 return calculate_desc(info
, unit_fpu
, 18, 0, 0, 4, 0,
669 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
670 else if (devinfo
->is_haswell
)
671 return calculate_desc(info
, unit_fpu
, 18, 0, 0, 4, 0,
672 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
673 else if (devinfo
->gen
>= 7)
674 return calculate_desc(info
, unit_fpu
, 20, 0, 0, 4, 0,
675 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
679 case SHADER_OPCODE_FIND_LIVE_CHANNEL
:
680 if (devinfo
->gen
>= 11)
681 return calculate_desc(info
, unit_fpu
, 2, 0, 0, 2, 0,
682 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
683 else if (devinfo
->gen
>= 8)
684 return calculate_desc(info
, unit_fpu
, 2, 0, 0, 2, 0,
685 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
686 else if (devinfo
->is_haswell
)
687 return calculate_desc(info
, unit_fpu
, 36, 0, 0, 6, 0,
688 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
689 else if (devinfo
->gen
>= 7)
690 return calculate_desc(info
, unit_fpu
, 40, 0, 0, 6, 0,
691 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
695 case SHADER_OPCODE_RND_MODE
:
696 case SHADER_OPCODE_FLOAT_CONTROL_MODE
:
697 if (devinfo
->gen
>= 11)
698 return calculate_desc(info
, unit_fpu
, 24 /* XXX */, 0, 0,
701 else if (devinfo
->gen
>= 8)
702 return calculate_desc(info
, unit_fpu
, 20 /* XXX */, 0, 0,
705 else if (devinfo
->is_haswell
)
706 return calculate_desc(info
, unit_fpu
, 24 /* XXX */, 0, 0,
709 else if (devinfo
->gen
>= 6)
710 return calculate_desc(info
, unit_fpu
, 28 /* XXX */, 0, 0,
716 case SHADER_OPCODE_SHUFFLE
:
717 if (devinfo
->gen
>= 11)
718 return calculate_desc(info
, unit_fpu
, 44 /* XXX */, 0, 0,
720 0, 10 /* XXX */, 6 /* XXX */,
722 else if (devinfo
->gen
>= 8)
723 return calculate_desc(info
, unit_fpu
, 42 /* XXX */, 0, 0,
725 0, 8 /* XXX */, 4 /* XXX */,
727 else if (devinfo
->is_haswell
)
728 return calculate_desc(info
, unit_fpu
, 0, 44 /* XXX */, 0,
730 0, 10 /* XXX */, 6 /* XXX */,
732 else if (devinfo
->gen
>= 6)
733 return calculate_desc(info
, unit_fpu
, 0, 46 /* XXX */, 0,
735 0, 12 /* XXX */, 8 /* XXX */,
740 case SHADER_OPCODE_SEL_EXEC
:
741 if (devinfo
->gen
>= 11)
742 return calculate_desc(info
, unit_fpu
, 10 /* XXX */, 4 /* XXX */, 0,
744 0, 10 /* XXX */, 6 /* XXX */,
746 else if (devinfo
->gen
>= 8)
747 return calculate_desc(info
, unit_fpu
, 8 /* XXX */, 4 /* XXX */, 0,
749 0, 8 /* XXX */, 4 /* XXX */,
751 else if (devinfo
->is_haswell
)
752 return calculate_desc(info
, unit_fpu
, 10 /* XXX */, 4 /* XXX */, 0,
754 0, 10 /* XXX */, 6 /* XXX */,
757 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 4 /* XXX */, 0,
759 0, 12 /* XXX */, 8 /* XXX */,
762 case SHADER_OPCODE_QUAD_SWIZZLE
:
763 if (devinfo
->gen
>= 11)
764 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
766 0, 10 /* XXX */, 6 /* XXX */,
768 else if (devinfo
->gen
>= 8)
769 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
771 0, 8 /* XXX */, 4 /* XXX */,
773 else if (devinfo
->is_haswell
)
774 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
776 0, 10 /* XXX */, 6 /* XXX */,
779 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
781 0, 12 /* XXX */, 8 /* XXX */,
784 case FS_OPCODE_DDY_FINE
:
785 if (devinfo
->gen
>= 11)
786 return calculate_desc(info
, unit_fpu
, 0, 14, 0, 0, 4,
787 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
788 else if (devinfo
->gen
>= 8)
789 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
790 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
791 else if (devinfo
->is_haswell
)
792 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
793 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
795 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
796 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
798 case FS_OPCODE_LOAD_LIVE_CHANNELS
:
799 if (devinfo
->gen
>= 11)
800 return calculate_desc(info
, unit_fpu
, 2 /* XXX */, 0, 0,
802 0, 0, 0, 10 /* XXX */, 0, 0);
803 else if (devinfo
->gen
>= 8)
804 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 0,
806 0, 0, 0, 8 /* XXX */, 0, 0);
810 case VEC4_OPCODE_PACK_BYTES
:
811 if (devinfo
->gen
>= 8)
812 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
814 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
816 else if (devinfo
->is_haswell
)
817 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
819 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
822 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
824 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
827 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9
:
828 if (devinfo
->gen
>= 8)
829 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 0, 0,
831 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
836 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2
:
837 case TCS_OPCODE_GET_INSTANCE_ID
:
838 case TCS_OPCODE_SET_INPUT_URB_OFFSETS
:
839 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS
:
840 case TES_OPCODE_CREATE_INPUT_READ_HEADER
:
841 if (devinfo
->gen
>= 8)
842 return calculate_desc(info
, unit_fpu
, 22 /* XXX */, 0, 0,
844 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
846 else if (devinfo
->is_haswell
)
847 return calculate_desc(info
, unit_fpu
, 26 /* XXX */, 0, 0,
849 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
852 return calculate_desc(info
, unit_fpu
, 30 /* XXX */, 0, 0,
854 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
857 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES
:
858 case TCS_OPCODE_CREATE_BARRIER_HEADER
:
859 if (devinfo
->gen
>= 8)
860 return calculate_desc(info
, unit_fpu
, 32 /* XXX */, 0, 0,
862 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
864 else if (devinfo
->is_haswell
)
865 return calculate_desc(info
, unit_fpu
, 38 /* XXX */, 0, 0,
867 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
869 else if (devinfo
->gen
>= 6)
870 return calculate_desc(info
, unit_fpu
, 44 /* XXX */, 0, 0,
872 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
877 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET
:
878 if (devinfo
->gen
>= 8)
879 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 0, 0,
881 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
883 else if (devinfo
->is_haswell
)
884 return calculate_desc(info
, unit_fpu
, 14 /* XXX */, 0, 0,
886 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
888 else if (devinfo
->gen
>= 7)
889 return calculate_desc(info
, unit_fpu
, 16 /* XXX */, 0, 0,
891 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
896 case SHADER_OPCODE_TEX
:
898 case SHADER_OPCODE_TXD
:
899 case SHADER_OPCODE_TXF
:
900 case SHADER_OPCODE_TXF_LZ
:
901 case SHADER_OPCODE_TXL
:
902 case SHADER_OPCODE_TXL_LZ
:
903 case SHADER_OPCODE_TXF_CMS
:
904 case SHADER_OPCODE_TXF_CMS_W
:
905 case SHADER_OPCODE_TXF_UMS
:
906 case SHADER_OPCODE_TXF_MCS
:
907 case SHADER_OPCODE_TXS
:
908 case SHADER_OPCODE_LOD
:
909 case SHADER_OPCODE_GET_BUFFER_SIZE
:
910 case SHADER_OPCODE_TG4
:
911 case SHADER_OPCODE_TG4_OFFSET
:
912 case SHADER_OPCODE_SAMPLEINFO
:
913 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4
:
914 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16 /* XXX */,
915 8 /* XXX */, 750 /* XXX */, 0, 0,
918 case SHADER_OPCODE_URB_READ_SIMD8
:
919 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT
:
920 case SHADER_OPCODE_URB_WRITE_SIMD8
:
921 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT
:
922 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED
:
923 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT
:
924 case VEC4_OPCODE_URB_READ
:
925 case VS_OPCODE_URB_WRITE
:
926 case GS_OPCODE_URB_WRITE
:
927 case GS_OPCODE_URB_WRITE_ALLOCATE
:
928 case GS_OPCODE_THREAD_END
:
929 case GS_OPCODE_FF_SYNC
:
930 case TCS_OPCODE_URB_WRITE
:
931 case TCS_OPCODE_RELEASE_INPUT
:
932 case TCS_OPCODE_THREAD_END
:
933 return calculate_desc(info
, unit_urb
, 2, 0, 0, 0, 6 /* XXX */,
934 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
936 case SHADER_OPCODE_MEMORY_FENCE
:
937 case SHADER_OPCODE_INTERLOCK
:
939 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
940 if (devinfo
->gen
>= 7)
941 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0, 30 /* XXX */, 0,
942 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
946 case GEN7_SFID_DATAPORT_DATA_CACHE
:
947 case HSW_SFID_DATAPORT_DATA_CACHE_1
:
948 if (devinfo
->gen
>= 7)
949 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0, 30 /* XXX */, 0,
950 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
958 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
959 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
960 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
961 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0, 0, 8 /* XXX */,
962 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
964 case VEC4_OPCODE_UNTYPED_ATOMIC
:
965 if (devinfo
->gen
>= 7)
966 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
967 30 /* XXX */, 400 /* XXX */,
968 10 /* XXX */, 100 /* XXX */, 0, 0,
973 case VEC4_OPCODE_UNTYPED_SURFACE_READ
:
974 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE
:
975 if (devinfo
->gen
>= 7)
976 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
978 10 /* XXX */, 100 /* XXX */, 0, 0,
983 case FS_OPCODE_FB_WRITE
:
984 case FS_OPCODE_FB_READ
:
985 case FS_OPCODE_REP_FB_WRITE
:
986 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0, 0, 450 /* XXX */,
987 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
989 case GS_OPCODE_SVB_WRITE
:
990 if (devinfo
->gen
>= 6)
991 return calculate_desc(info
, unit_dp_rc
, 2 /* XXX */, 0, 0,
993 10 /* XXX */, 300 /* XXX */, 0, 0,
998 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
:
999 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
1000 return calculate_desc(info
, unit_dp_cc
, 2, 0, 0, 0, 16 /* XXX */,
1001 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1003 case VS_OPCODE_PULL_CONSTANT_LOAD
:
1004 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7
:
1005 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16,
1006 8, 750, 0, 0, 2, 0);
1008 case FS_OPCODE_INTERPOLATE_AT_SAMPLE
:
1009 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET
:
1010 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET
:
1011 if (devinfo
->gen
>= 7)
1012 return calculate_desc(info
, unit_pi
, 2, 0, 0, 14 /* XXX */, 0,
1013 0, 90 /* XXX */, 0, 0, 0, 0);
1017 case SHADER_OPCODE_BARRIER
:
1018 if (devinfo
->gen
>= 7)
1019 return calculate_desc(info
, unit_gateway
, 90 /* XXX */, 0, 0,
1025 case CS_OPCODE_CS_TERMINATE
:
1026 if (devinfo
->gen
>= 7)
1027 return calculate_desc(info
, unit_spawner
, 2, 0, 0, 0 /* XXX */, 0,
1028 10 /* XXX */, 0, 0, 0, 0, 0);
1032 case SHADER_OPCODE_SEND
:
1033 switch (info
.sfid
) {
1034 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
1035 if (devinfo
->gen
>= 7) {
1036 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1037 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP
:
1038 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0,
1039 30 /* XXX */, 450 /* XXX */,
1040 10 /* XXX */, 100 /* XXX */,
1041 0, 0, 0, 400 /* XXX */);
1043 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0,
1045 10 /* XXX */, 300 /* XXX */, 0, 0,
1048 } else if (devinfo
->gen
>= 6) {
1049 return calculate_desc(info
, unit_dp_rc
, 2 /* XXX */, 0, 0,
1051 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1055 case BRW_SFID_SAMPLER
: {
1056 if (devinfo
->gen
>= 6)
1057 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16,
1058 8, 750, 0, 0, 2, 0);
1062 case GEN7_SFID_DATAPORT_DATA_CACHE
:
1063 case HSW_SFID_DATAPORT_DATA_CACHE_1
:
1064 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
1065 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1066 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
:
1067 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2
:
1068 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2
:
1069 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP
:
1070 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1071 30 /* XXX */, 400 /* XXX */,
1072 10 /* XXX */, 100 /* XXX */, 0, 0,
1076 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1078 10 /* XXX */, 100 /* XXX */, 0, 0,
1081 } else if (devinfo
->gen
>= 7) {
1082 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1083 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP
:
1084 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1085 30 /* XXX */, 400 /* XXX */,
1086 10 /* XXX */, 100 /* XXX */,
1087 0, 0, 0, 400 /* XXX */);
1089 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1091 10 /* XXX */, 100 /* XXX */, 0, 0,
1101 case SHADER_OPCODE_UNDEF
:
1102 case FS_OPCODE_PLACEHOLDER_HALT
:
1103 case FS_OPCODE_SCHEDULING_FENCE
:
1104 return calculate_desc(info
, unit_null
, 0, 0, 0, 0, 0,
1113 * Model the performance behavior of a stall on the specified dependency
1117 stall_on_dependency(state
&st
, dependency_id id
)
1119 if (id
< ARRAY_SIZE(st
.dep_ready
))
1120 st
.unit_ready
[unit_fe
] = MAX2(st
.unit_ready
[unit_fe
],
/**
 * Model the performance behavior of the front-end and back-end while
 * executing an instruction with the specified timing information, assuming
 * all dependencies are already clear.
 */
void
execute_instruction(state &st, const perf_desc &perf)
{
   /* Compute the time at which the front-end will be ready to execute the
    * next instruction.
    */
   st.unit_ready[unit_fe] += perf.df;

   /* perf.u == num_units (or beyond) marks an instruction with no back-end
    * unit to occupy, in which case only the front-end advances.
    */
   if (perf.u < num_units) {
      /* Wait for the back-end to be ready to execute this instruction. */
      st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe],
                                    st.unit_ready[perf.u]);

      /* Compute the time at which the back-end will be ready to execute
       * the next instruction, and update the back-end utilization.
       */
      st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db;

      /* Utilization is weighted by the current control-flow weight so that
       * loop bodies and divergent branches count proportionally more/less.
       */
      st.unit_busy[perf.u] += perf.db * st.weight;
   }
}
1151 * Model the performance behavior of a read dependency provided by an
1155 mark_read_dependency(state
&st
, const perf_desc
&perf
, dependency_id id
)
1157 if (id
< ARRAY_SIZE(st
.dep_ready
))
1158 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.ls
;
1162 * Model the performance behavior of a write dependency provided by an
1166 mark_write_dependency(state
&st
, const perf_desc
&perf
, dependency_id id
)
1168 if (id
>= dependency_id_accum0
&& id
< dependency_id_flag0
)
1169 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.la
;
1170 else if (id
>= dependency_id_flag0
&& id
< dependency_id_sbid_wr0
)
1171 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.lf
;
1172 else if (id
< ARRAY_SIZE(st
.dep_ready
))
1173 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.ld
;
1177 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1180 reg_dependency_id(const gen_device_info
*devinfo
, const backend_reg
&r
,
1183 if (r
.file
== VGRF
) {
1184 const unsigned i
= r
.nr
+ r
.offset
/ REG_SIZE
+ delta
;
1185 assert(i
< dependency_id_mrf0
- dependency_id_grf0
);
1186 return dependency_id(dependency_id_grf0
+ i
);
1188 } else if (r
.file
== FIXED_GRF
) {
1189 const unsigned i
= r
.nr
+ delta
;
1190 assert(i
< dependency_id_mrf0
- dependency_id_grf0
);
1191 return dependency_id(dependency_id_grf0
+ i
);
1193 } else if (r
.file
== MRF
&& devinfo
->gen
>= 7) {
1194 const unsigned i
= GEN7_MRF_HACK_START
+
1195 r
.nr
+ r
.offset
/ REG_SIZE
+ delta
;
1196 assert(i
< dependency_id_mrf0
- dependency_id_grf0
);
1197 return dependency_id(dependency_id_grf0
+ i
);
1199 } else if (r
.file
== MRF
&& devinfo
->gen
< 7) {
1200 const unsigned i
= (r
.nr
& ~BRW_MRF_COMPR4
) +
1201 r
.offset
/ REG_SIZE
+ delta
;
1202 assert(i
< dependency_id_addr0
- dependency_id_mrf0
);
1203 return dependency_id(dependency_id_mrf0
+ i
);
1205 } else if (r
.file
== ARF
&& r
.nr
>= BRW_ARF_ADDRESS
&&
1206 r
.nr
< BRW_ARF_ACCUMULATOR
) {
1208 return dependency_id_addr0
;
1210 } else if (r
.file
== ARF
&& r
.nr
>= BRW_ARF_ACCUMULATOR
&&
1211 r
.nr
< BRW_ARF_FLAG
) {
1212 const unsigned i
= r
.nr
- BRW_ARF_ACCUMULATOR
+ delta
;
1213 assert(i
< dependency_id_flag0
- dependency_id_accum0
);
1214 return dependency_id(dependency_id_accum0
+ i
);
1217 return num_dependency_ids
;
1222 * Return the dependency ID of flag register starting at offset \p i.
1225 flag_dependency_id(unsigned i
)
1227 assert(i
< dependency_id_sbid_wr0
- dependency_id_flag0
);
1228 return dependency_id(dependency_id_flag0
+ i
);
1232 * Return the dependency ID corresponding to the SBID read completion
1233 * condition of a Gen12+ SWSB.
1236 tgl_swsb_rd_dependency_id(tgl_swsb swsb
)
1239 assert(swsb
.sbid
< num_dependency_ids
- dependency_id_sbid_rd0
);
1240 return dependency_id(dependency_id_sbid_rd0
+ swsb
.sbid
);
1242 return num_dependency_ids
;
1247 * Return the dependency ID corresponding to the SBID write completion
1248 * condition of a Gen12+ SWSB.
1251 tgl_swsb_wr_dependency_id(tgl_swsb swsb
)
1254 assert(swsb
.sbid
< dependency_id_sbid_rd0
- dependency_id_sbid_wr0
);
1255 return dependency_id(dependency_id_sbid_wr0
+ swsb
.sbid
);
1257 return num_dependency_ids
;
1262 * Return the implicit accumulator register accessed by channel \p i of the
1266 accum_reg_of_channel(const gen_device_info
*devinfo
,
1267 const backend_instruction
*inst
,
1268 brw_reg_type tx
, unsigned i
)
1270 assert(inst
->reads_accumulator_implicitly() ||
1271 inst
->writes_accumulator_implicitly(devinfo
));
1272 const unsigned offset
= (inst
->group
+ i
) * type_sz(tx
) *
1273 (devinfo
->gen
< 7 || brw_reg_type_is_floating_point(tx
) ? 1 : 2);
1274 return offset
/ REG_SIZE
% 2;
1278 * Model the performance behavior of an FS back-end instruction.
1281 issue_fs_inst(state
&st
, const gen_device_info
*devinfo
,
1282 const backend_instruction
*be_inst
)
1284 const fs_inst
*inst
= static_cast<const fs_inst
*>(be_inst
);
1285 const instruction_info
info(devinfo
, inst
);
1286 const perf_desc perf
= instruction_desc(info
);
1288 /* Stall on any source dependencies. */
1289 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
1290 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
1291 stall_on_dependency(
1292 st
, reg_dependency_id(devinfo
, inst
->src
[i
], j
));
1295 if (inst
->reads_accumulator_implicitly()) {
1296 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1297 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1298 inst
->exec_size
- 1); j
++)
1299 stall_on_dependency(
1300 st
, reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1303 if (is_send(inst
) && inst
->base_mrf
!= -1) {
1304 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
1305 stall_on_dependency(
1306 st
, reg_dependency_id(
1307 devinfo
, brw_uvec_mrf(8, inst
->base_mrf
, 0), j
));
1310 if (const unsigned mask
= inst
->flags_read(devinfo
)) {
1311 for (unsigned i
= 0; i
< sizeof(mask
) * CHAR_BIT
; i
++) {
1312 if (mask
& (1 << i
))
1313 stall_on_dependency(st
, flag_dependency_id(i
));
1317 /* Stall on any write dependencies. */
1318 if (!inst
->no_dd_check
) {
1319 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
1320 for (unsigned j
= 0; j
< regs_written(inst
); j
++)
1321 stall_on_dependency(
1322 st
, reg_dependency_id(devinfo
, inst
->dst
, j
));
1325 if (inst
->writes_accumulator_implicitly(devinfo
)) {
1326 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1327 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1328 inst
->exec_size
- 1); j
++)
1329 stall_on_dependency(
1330 st
, reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1333 if (const unsigned mask
= inst
->flags_written()) {
1334 for (unsigned i
= 0; i
< sizeof(mask
) * CHAR_BIT
; i
++) {
1335 if (mask
& (1 << i
))
1336 stall_on_dependency(st
, flag_dependency_id(i
));
1341 /* Stall on any SBID dependencies. */
1342 if (inst
->sched
.mode
& (TGL_SBID_SET
| TGL_SBID_DST
))
1343 stall_on_dependency(st
, tgl_swsb_wr_dependency_id(inst
->sched
));
1344 else if (inst
->sched
.mode
& TGL_SBID_SRC
)
1345 stall_on_dependency(st
, tgl_swsb_rd_dependency_id(inst
->sched
));
1347 /* Execute the instruction. */
1348 execute_instruction(st
, perf
);
1350 /* Mark any source dependencies. */
1351 if (inst
->is_send_from_grf()) {
1352 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
1353 if (inst
->is_payload(i
)) {
1354 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
1355 mark_read_dependency(
1356 st
, perf
, reg_dependency_id(devinfo
, inst
->src
[i
], j
));
1361 if (is_send(inst
) && inst
->base_mrf
!= -1) {
1362 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
1363 mark_read_dependency(st
, perf
,
1364 reg_dependency_id(devinfo
, brw_uvec_mrf(8, inst
->base_mrf
, 0), j
));
1367 /* Mark any destination dependencies. */
1368 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
1369 for (unsigned j
= 0; j
< regs_written(inst
); j
++) {
1370 mark_write_dependency(st
, perf
,
1371 reg_dependency_id(devinfo
, inst
->dst
, j
));
1375 if (inst
->writes_accumulator_implicitly(devinfo
)) {
1376 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1377 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1378 inst
->exec_size
- 1); j
++)
1379 mark_write_dependency(st
, perf
,
1380 reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1383 if (const unsigned mask
= inst
->flags_written()) {
1384 for (unsigned i
= 0; i
< sizeof(mask
) * CHAR_BIT
; i
++) {
1385 if (mask
& (1 << i
))
1386 mark_write_dependency(st
, perf
, flag_dependency_id(i
));
1390 /* Mark any SBID dependencies. */
1391 if (inst
->sched
.mode
& TGL_SBID_SET
) {
1392 mark_read_dependency(st
, perf
, tgl_swsb_rd_dependency_id(inst
->sched
));
1393 mark_write_dependency(st
, perf
, tgl_swsb_wr_dependency_id(inst
->sched
));
/**
 * Model the performance behavior of a VEC4 back-end instruction.
 *
 * Mirrors issue_fs_inst() for the vec4 IR: stall on the instruction's
 * dependencies, execute it, then mark the dependencies it provides.
 * Differences from the FS path: sources are the fixed-size src[] array,
 * flag dependencies are tracked coarsely via reads_flag()/writes_flag()
 * against flag 0 only, and there is no Gen12+ SBID handling (vec4 is not
 * used on those platforms).
 */
void
issue_vec4_instruction(state &st, const gen_device_info *devinfo,
                       const backend_instruction *be_inst)
{
   const vec4_instruction *inst =
      static_cast<const vec4_instruction *>(be_inst);
   const instruction_info info(devinfo, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   if (inst->reads_accumulator_implicitly()) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   if (inst->reads_flag())
      stall_on_dependency(st, dependency_id_flag0);

   /* Stall on any write dependencies. */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (inst->writes_flag())
         stall_on_dependency(st, dependency_id_flag0);
   }

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies. */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            mark_read_dependency(
               st, perf, reg_dependency_id(devinfo, inst->src[i], j));
      }
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(st, perf,
            reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (inst->writes_flag())
      mark_write_dependency(st, perf, dependency_id_flag0);
}
1493 * Calculate the maximum possible throughput of the program compatible with
1494 * the cycle-count utilization estimated for each asynchronous unit, in
1495 * threads-per-cycle units.
1498 calculate_thread_throughput(const state
&st
, float busy
)
1500 for (unsigned i
= 0; i
< num_units
; i
++)
1501 busy
= MAX2(busy
, st
.unit_busy
[i
]);
/**
 * Estimate the performance of the specified shader.
 *
 * Walks every basic block of \p s->cfg, issuing each instruction through
 * \p issue_instruction into a fresh simulation state, while maintaining a
 * control-flow weight that scales the cycle cost of code inside branches,
 * loops and post-discard regions.  Fills in p.block_latency[], p.latency
 * and p.throughput.
 */
void
calculate_performance(performance &p, const backend_shader *s,
                      void (*issue_instruction)(
                         state &, const gen_device_info *,
                         const backend_instruction *),
                      unsigned dispatch_width)
{
   /* XXX - Plumbing the trip counts from NIR loop analysis would allow us
    *       to do a better job regarding the loop weights.  And some branch
    *       divergence analysis would allow us to do a better job with
    *       branching weights.
    *
    *       In the meantime use values that roughly match the control flow
    *       weights used elsewhere in the compiler back-end -- Main
    *       difference is the worst-case scenario branch_weight used for
    *       SIMD32 which accounts for the possibility of a dynamically
    *       uniform branch becoming divergent in SIMD32.
    *
    *       Note that we provide slightly more pessimistic weights on
    *       Gen12+ for SIMD32, since the effective warp size on that
    *       platform is 2x the SIMD width due to EU fusion, which increases
    *       the likelihood of divergent control flow in comparison to
    *       previous generations, giving narrower SIMD modes a performance
    *       advantage in several test-cases with non-uniform discard jumps.
    */
   const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
   const float discard_weight = (dispatch_width > 16 || s->devinfo->gen < 12 ?
                                 1.0 : 0.5);
   const float loop_weight = 10;
   unsigned discard_count = 0;
   unsigned elapsed = 0;
   state st;

   foreach_block(block, s->cfg) {
      const unsigned elapsed0 = elapsed;

      foreach_inst_in_block(backend_instruction, inst, block) {
         const unsigned clock0 = st.unit_ready[unit_fe];

         issue_instruction(st, s->devinfo, inst);

         /* Weight is restored *before* charging ENDIF/HALT so the closing
          * instruction itself is costed at the outer weight.
          */
         if (inst->opcode == BRW_OPCODE_ENDIF)
            st.weight /= branch_weight;
         else if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT && discard_count)
            st.weight /= discard_weight;

         elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;

         /* Weight is applied *after* charging IF/DO so only the body pays
          * the scaled cost.
          */
         if (inst->opcode == BRW_OPCODE_IF)
            st.weight *= branch_weight;
         else if (inst->opcode == BRW_OPCODE_DO)
            st.weight *= loop_weight;
         else if (inst->opcode == BRW_OPCODE_WHILE)
            st.weight /= loop_weight;
         else if (inst->opcode == FS_OPCODE_DISCARD_JUMP && !discard_count++)
            st.weight *= discard_weight;
      }

      p.block_latency[block->num] = elapsed - elapsed0;
   }

   p.latency = elapsed;
   p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
}
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   /* FS programs are simulated at the visitor's actual dispatch width. */
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
brw::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   /* VEC4 programs always execute 8 channels wide. */
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
brw::performance::~performance()
{
   /* block_latency was allocated with new[] in the constructors. */
   delete[] block_latency;
}