/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
33 * Enumeration representing the various asynchronous units that can run
34 * computations in parallel on behalf of a shader thread.
39 /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
41 /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
43 /** Sampler shared function. */
45 /** Pixel Interpolator shared function. */
47 /** Unified Return Buffer shared function. */
49 /** Data Port Data Cache shared function. */
51 /** Data Port Render Cache shared function. */
53 /** Data Port Constant Cache shared function. */
55 /** Message Gateway shared function. */
57 /** Thread Spawner shared function. */
61 /** Number of asynchronous units currently tracked. */
63 /** Dummy unit for instructions that don't consume runtime from the above. */
68 * Enumeration representing a computation result another computation can
69 * potentially depend on.
72 /* Register part of the GRF. */
73 dependency_id_grf0
= 0,
74 /* Register part of the MRF. Only used on Gen4-6. */
75 dependency_id_mrf0
= dependency_id_grf0
+ BRW_MAX_GRF
,
76 /* Address register part of the ARF. */
77 dependency_id_addr0
= dependency_id_mrf0
+ 24,
78 /* Accumulator register part of the ARF. */
79 dependency_id_accum0
= dependency_id_addr0
+ 1,
80 /* Flag register part of the ARF. */
81 dependency_id_flag0
= dependency_id_accum0
+ 12,
82 /* SBID token write completion. Only used on Gen12+. */
83 dependency_id_sbid_wr0
= dependency_id_flag0
+ 8,
84 /* SBID token read completion. Only used on Gen12+. */
85 dependency_id_sbid_rd0
= dependency_id_sbid_wr0
+ 16,
86 /* Number of computation dependencies currently tracked. */
87 num_dependency_ids
= dependency_id_sbid_rd0
+ 16
91 * State of our modeling of the program execution.
94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
99 unsigned unit_ready
[num_units
];
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
104 unsigned dep_ready
[num_dependency_ids
];
106 * Aggregated utilization of a given unit excluding idle cycles,
109 float unit_busy
[num_units
];
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
122 struct instruction_info
{
123 instruction_info(const gen_device_info
*devinfo
, const fs_inst
*inst
) :
124 devinfo(devinfo
), op(inst
->opcode
),
125 td(inst
->dst
.type
), sd(DIV_ROUND_UP(inst
->size_written
, REG_SIZE
)),
126 tx(get_exec_type(inst
)), sx(0), ss(0),
127 sc(has_bank_conflict(devinfo
, inst
) ? sd
: 0),
128 desc(inst
->desc
), sfid(inst
->sfid
)
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
133 if (inst
->opcode
== SHADER_OPCODE_SEND
) {
134 ss
= DIV_ROUND_UP(inst
->size_read(2), REG_SIZE
) +
135 DIV_ROUND_UP(inst
->size_read(3), REG_SIZE
);
137 for (unsigned i
= 0; i
< inst
->sources
; i
++)
138 ss
= MAX2(ss
, DIV_ROUND_UP(inst
->size_read(i
), REG_SIZE
));
141 /* Convert the execution size to GRF units. */
142 sx
= DIV_ROUND_UP(inst
->exec_size
* type_sz(tx
), REG_SIZE
);
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
147 if ((inst
->opcode
== BRW_OPCODE_MUL
|| inst
->opcode
== BRW_OPCODE_MAD
) &&
148 !brw_reg_type_is_floating_point(tx
) && type_sz(tx
) == 4 &&
149 type_sz(inst
->src
[0].type
) == type_sz(inst
->src
[1].type
))
150 tx
= brw_int_type(8, tx
== BRW_REGISTER_TYPE_D
);
153 instruction_info(const gen_device_info
*devinfo
,
154 const vec4_instruction
*inst
) :
155 devinfo(devinfo
), op(inst
->opcode
),
156 td(inst
->dst
.type
), sd(DIV_ROUND_UP(inst
->size_written
, REG_SIZE
)),
157 tx(get_exec_type(inst
)), sx(0), ss(0), sc(0),
158 desc(inst
->desc
), sfid(inst
->sfid
)
160 /* Compute the maximum source size. */
161 for (unsigned i
= 0; i
< ARRAY_SIZE(inst
->src
); i
++)
162 ss
= MAX2(ss
, DIV_ROUND_UP(inst
->size_read(i
), REG_SIZE
));
164 /* Convert the execution size to GRF units. */
165 sx
= DIV_ROUND_UP(inst
->exec_size
* type_sz(tx
), REG_SIZE
);
167 /* 32x32 integer multiplication has half the usual ALU throughput.
168 * Treat it as double-precision.
170 if ((inst
->opcode
== BRW_OPCODE_MUL
|| inst
->opcode
== BRW_OPCODE_MAD
) &&
171 !brw_reg_type_is_floating_point(tx
) && type_sz(tx
) == 4 &&
172 type_sz(inst
->src
[0].type
) == type_sz(inst
->src
[1].type
))
173 tx
= brw_int_type(8, tx
== BRW_REGISTER_TYPE_D
);
176 /** Device information. */
177 const struct gen_device_info
*devinfo
;
178 /** Instruction opcode. */
180 /** Destination type. */
182 /** Destination size in GRF units. */
184 /** Execution type. */
186 /** Execution size in GRF units. */
190 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
192 /** Send message descriptor. */
194 /** Send message shared function ID. */
199 * Timing information of an instruction used to estimate the performance of
203 perf_desc(unit u
, int df
, int db
, int ls
, int ld
, int la
, int lf
) :
204 u(u
), df(df
), db(db
), ls(ls
), ld(ld
), la(la
), lf(lf
) {}
207 * Back-end unit its runtime shall be accounted to, in addition to the
208 * EU front-end which is always assumed to be involved.
212 * Overhead cycles from the time that the EU front-end starts executing
213 * the instruction until it's ready to execute the next instruction.
217 * Overhead cycles from the time that the back-end starts executing the
218 * instruction until it's ready to execute the next instruction.
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its sources have been read from the register file.
227 * Latency cycles from the time that the back-end starts executing the
228 * instruction until its regular destination has been written to the
233 * Latency cycles from the time that the back-end starts executing the
234 * instruction until its accumulator destination has been written to the
237 * Note that this is an approximation of the real behavior of
238 * accumulating instructions in the hardware: Instead of modeling a pair
239 * of back-to-back accumulating instructions as a first computation with
240 * latency equal to ld followed by another computation with a
241 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242 * model the stall as if it occurred at the top of the pipeline, with
243 * the latency of the accumulator computation offset accordingly.
247 * Latency cycles from the time that the back-end starts executing the
248 * instruction until its flag destination has been written to the ARF
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
262 calculate_desc(const instruction_info
&info
, unit u
,
263 int df_1
, int df_sd
, int df_sc
,
265 int ls_1
, int ld_1
, int la_1
, int lf_1
,
268 return perf_desc(u
, df_1
+ df_sd
* int(info
.sd
) + df_sc
* int(info
.sc
),
269 db_1
+ db_sx
* int(info
.sx
),
270 ls_1
+ l_ss
* int(info
.ss
),
271 ld_1
+ l_ss
* int(info
.ss
) + l_sd
* int(info
.sd
),
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
290 instruction_desc(const instruction_info
&info
)
292 const struct gen_device_info
*devinfo
= info
.devinfo
;
295 case BRW_OPCODE_SYNC
:
305 case BRW_OPCODE_CMPN
:
306 case BRW_OPCODE_F16TO32
:
307 case BRW_OPCODE_BFREV
:
308 case BRW_OPCODE_BFI1
:
311 case BRW_OPCODE_RNDU
:
312 case BRW_OPCODE_RNDD
:
313 case BRW_OPCODE_RNDE
:
314 case BRW_OPCODE_RNDZ
:
316 case BRW_OPCODE_MACH
:
320 case BRW_OPCODE_CBIT
:
321 case BRW_OPCODE_ADDC
:
324 case BRW_OPCODE_SUBB
:
325 case BRW_OPCODE_SAD2
:
326 case BRW_OPCODE_SADA2
:
327 case BRW_OPCODE_LINE
:
329 case SHADER_OPCODE_CLUSTER_BROADCAST
:
330 case FS_OPCODE_DDX_COARSE
:
331 case FS_OPCODE_DDX_FINE
:
332 case FS_OPCODE_DDY_COARSE
:
333 case FS_OPCODE_PIXEL_X
:
334 case FS_OPCODE_PIXEL_Y
:
335 case FS_OPCODE_SET_SAMPLE_ID
:
336 case VEC4_OPCODE_MOV_BYTES
:
337 case VEC4_OPCODE_UNPACK_UNIFORM
:
338 case VEC4_OPCODE_DOUBLE_TO_F32
:
339 case VEC4_OPCODE_DOUBLE_TO_D32
:
340 case VEC4_OPCODE_DOUBLE_TO_U32
:
341 case VEC4_OPCODE_TO_DOUBLE
:
342 case VEC4_OPCODE_PICK_LOW_32BIT
:
343 case VEC4_OPCODE_PICK_HIGH_32BIT
:
344 case VEC4_OPCODE_SET_LOW_32BIT
:
345 case VEC4_OPCODE_SET_HIGH_32BIT
:
346 case GS_OPCODE_SET_DWORD_2
:
347 case GS_OPCODE_SET_WRITE_OFFSET
:
348 case GS_OPCODE_SET_VERTEX_COUNT
:
349 case GS_OPCODE_PREPARE_CHANNEL_MASKS
:
350 case GS_OPCODE_SET_CHANNEL_MASKS
:
351 case GS_OPCODE_GET_INSTANCE_ID
:
352 case GS_OPCODE_SET_PRIMITIVE_ID
:
353 case GS_OPCODE_SVB_SET_DST_INDEX
:
354 case TCS_OPCODE_SRC0_010_IS_ZERO
:
355 case TCS_OPCODE_GET_PRIMITIVE_ID
:
356 case TES_OPCODE_GET_PRIMITIVE_ID
:
357 if (devinfo
->gen
>= 11) {
358 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
359 0, 10, 6 /* XXX */, 14, 0, 0);
360 } else if (devinfo
->gen
>= 8) {
361 if (type_sz(info
.tx
) > 4)
362 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
363 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
365 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
367 } else if (devinfo
->is_haswell
) {
368 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
369 0, 10, 6 /* XXX */, 16, 0, 0);
371 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
372 0, 12, 8 /* XXX */, 18, 0, 0);
379 if (devinfo
->gen
>= 11) {
380 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
382 } else if (devinfo
->gen
>= 8) {
383 if (type_sz(info
.tx
) > 4)
384 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
385 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
387 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
389 } else if (devinfo
->is_haswell
) {
390 if (info
.tx
== BRW_REGISTER_TYPE_F
)
391 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
392 0, 12, 8 /* XXX */, 18, 0, 0);
394 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
395 0, 10, 6 /* XXX */, 16, 0, 0);
396 } else if (devinfo
->gen
>= 7) {
397 if (info
.tx
== BRW_REGISTER_TYPE_F
)
398 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
399 0, 14, 10 /* XXX */, 20, 0, 0);
401 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
402 0, 12, 8 /* XXX */, 18, 0, 0);
404 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 0,
406 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
411 case BRW_OPCODE_BFI2
:
412 case BRW_OPCODE_CSEL
:
413 if (devinfo
->gen
>= 11)
414 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
415 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
416 else if (devinfo
->gen
>= 8)
417 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
418 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
419 else if (devinfo
->is_haswell
)
420 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
421 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
422 else if (devinfo
->gen
>= 7)
423 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
424 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
429 if (devinfo
->gen
>= 11) {
430 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
431 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
432 } else if (devinfo
->gen
>= 8) {
433 if (type_sz(info
.tx
) > 4)
434 return calculate_desc(info
, unit_fpu
, 0, 4, 1, 0, 4,
435 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
437 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
438 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
439 } else if (devinfo
->is_haswell
) {
440 if (info
.tx
== BRW_REGISTER_TYPE_F
)
441 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
442 0, 12, 8 /* XXX */, 18, 0, 0);
444 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
445 0, 10, 6 /* XXX */, 16, 0, 0);
446 } else if (devinfo
->gen
>= 7) {
447 if (info
.tx
== BRW_REGISTER_TYPE_F
)
448 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
449 0, 14, 10 /* XXX */, 20, 0, 0);
451 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
452 0, 12, 8 /* XXX */, 18, 0, 0);
453 } else if (devinfo
->gen
>= 6) {
454 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 1 /* XXX */,
456 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
462 case BRW_OPCODE_F32TO16
:
463 if (devinfo
->gen
>= 11)
464 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
465 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
466 else if (devinfo
->gen
>= 8)
467 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
468 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
469 else if (devinfo
->is_haswell
)
470 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
471 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
472 else if (devinfo
->gen
>= 7)
473 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
474 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
482 if (devinfo
->gen
>= 8)
483 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
484 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
485 else if (devinfo
->is_haswell
)
486 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
487 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
489 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
490 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
492 case SHADER_OPCODE_RCP
:
493 case SHADER_OPCODE_RSQ
:
494 case SHADER_OPCODE_SQRT
:
495 case SHADER_OPCODE_EXP2
:
496 case SHADER_OPCODE_LOG2
:
497 case SHADER_OPCODE_SIN
:
498 case SHADER_OPCODE_COS
:
499 case SHADER_OPCODE_POW
:
500 case SHADER_OPCODE_INT_QUOTIENT
:
501 case SHADER_OPCODE_INT_REMAINDER
:
502 if (devinfo
->gen
>= 6) {
504 case SHADER_OPCODE_RCP
:
505 case SHADER_OPCODE_RSQ
:
506 case SHADER_OPCODE_SQRT
:
507 case SHADER_OPCODE_EXP2
:
508 case SHADER_OPCODE_LOG2
:
509 case SHADER_OPCODE_SIN
:
510 case SHADER_OPCODE_COS
:
511 if (devinfo
->gen
>= 8)
512 return calculate_desc(info
, unit_em
, -2, 4, 0, 0, 4,
514 else if (devinfo
->is_haswell
)
515 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 2,
518 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 2,
521 case SHADER_OPCODE_POW
:
522 if (devinfo
->gen
>= 8)
523 return calculate_desc(info
, unit_em
, -2, 4, 0, 0, 8,
525 else if (devinfo
->is_haswell
)
526 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 4,
529 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 4,
532 case SHADER_OPCODE_INT_QUOTIENT
:
533 case SHADER_OPCODE_INT_REMAINDER
:
534 return calculate_desc(info
, unit_em
, 2, 0, 0, 26, 0,
535 0, 28 /* XXX */, 0, 0, 0, 0);
542 case SHADER_OPCODE_RCP
:
543 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 8,
546 case SHADER_OPCODE_RSQ
:
547 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 16,
550 case SHADER_OPCODE_INT_QUOTIENT
:
551 case SHADER_OPCODE_SQRT
:
552 case SHADER_OPCODE_LOG2
:
553 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 24,
556 case SHADER_OPCODE_INT_REMAINDER
:
557 case SHADER_OPCODE_EXP2
:
558 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 32,
561 case SHADER_OPCODE_SIN
:
562 case SHADER_OPCODE_COS
:
563 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 48,
566 case SHADER_OPCODE_POW
:
567 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 64,
576 if (devinfo
->gen
>= 6)
577 return calculate_desc(info
, unit_null
, 0, 0, 0, 0, 0,
580 return calculate_desc(info
, unit_null
, 2 /* XXX */, 0, 0, 0, 0,
584 case BRW_OPCODE_ELSE
:
585 case BRW_OPCODE_ENDIF
:
586 case BRW_OPCODE_WHILE
:
587 case BRW_OPCODE_BREAK
:
588 case BRW_OPCODE_CONTINUE
:
589 case FS_OPCODE_DISCARD_JUMP
:
590 if (devinfo
->gen
>= 8)
591 return calculate_desc(info
, unit_null
, 8, 0, 0, 0, 0,
593 else if (devinfo
->is_haswell
)
594 return calculate_desc(info
, unit_null
, 6, 0, 0, 0, 0,
597 return calculate_desc(info
, unit_null
, 2, 0, 0, 0, 0,
600 case FS_OPCODE_LINTERP
:
601 if (devinfo
->gen
>= 8)
602 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
603 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
604 else if (devinfo
->is_haswell
)
605 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
606 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
608 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
609 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
612 if (devinfo
->gen
>= 8)
613 return calculate_desc(info
, unit_fpu
, 0, 4, 1, 0, 4,
614 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
615 else if (devinfo
->is_haswell
)
616 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
617 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
618 else if (devinfo
->gen
>= 6)
619 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
620 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
624 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
625 if (devinfo
->gen
>= 11)
626 return calculate_desc(info
, unit_fpu
, 20, 6, 0, 0, 6,
627 0, 10 /* XXX */, 6 /* XXX */,
629 else if (devinfo
->gen
>= 8)
630 return calculate_desc(info
, unit_fpu
, 16, 6, 0, 0, 6,
631 0, 8 /* XXX */, 4 /* XXX */,
633 else if (devinfo
->is_haswell
)
634 return calculate_desc(info
, unit_fpu
, 20, 6, 0, 0, 6,
635 0, 10 /* XXX */, 6 /* XXX */,
637 else if (devinfo
->gen
>= 7)
638 return calculate_desc(info
, unit_fpu
, 24, 6, 0, 0, 6,
639 0, 12 /* XXX */, 8 /* XXX */,
644 case SHADER_OPCODE_MOV_INDIRECT
:
645 if (devinfo
->gen
>= 11)
646 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
647 0, 10 /* XXX */, 6 /* XXX */,
649 else if (devinfo
->gen
>= 8)
650 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
651 0, 8 /* XXX */, 4 /* XXX */,
653 else if (devinfo
->is_haswell
)
654 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
655 0, 10 /* XXX */, 6 /* XXX */,
658 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
659 0, 12 /* XXX */, 8 /* XXX */,
662 case SHADER_OPCODE_BROADCAST
:
663 if (devinfo
->gen
>= 11)
664 return calculate_desc(info
, unit_fpu
, 20 /* XXX */, 0, 0, 4, 0,
665 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
666 else if (devinfo
->gen
>= 8)
667 return calculate_desc(info
, unit_fpu
, 18, 0, 0, 4, 0,
668 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
669 else if (devinfo
->is_haswell
)
670 return calculate_desc(info
, unit_fpu
, 18, 0, 0, 4, 0,
671 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
672 else if (devinfo
->gen
>= 7)
673 return calculate_desc(info
, unit_fpu
, 20, 0, 0, 4, 0,
674 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
678 case SHADER_OPCODE_FIND_LIVE_CHANNEL
:
679 if (devinfo
->gen
>= 11)
680 return calculate_desc(info
, unit_fpu
, 2, 0, 0, 2, 0,
681 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
682 else if (devinfo
->gen
>= 8)
683 return calculate_desc(info
, unit_fpu
, 2, 0, 0, 2, 0,
684 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
685 else if (devinfo
->is_haswell
)
686 return calculate_desc(info
, unit_fpu
, 36, 0, 0, 6, 0,
687 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
688 else if (devinfo
->gen
>= 7)
689 return calculate_desc(info
, unit_fpu
, 40, 0, 0, 6, 0,
690 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
694 case SHADER_OPCODE_RND_MODE
:
695 case SHADER_OPCODE_FLOAT_CONTROL_MODE
:
696 if (devinfo
->gen
>= 11)
697 return calculate_desc(info
, unit_fpu
, 24 /* XXX */, 0, 0,
700 else if (devinfo
->gen
>= 8)
701 return calculate_desc(info
, unit_fpu
, 20 /* XXX */, 0, 0,
704 else if (devinfo
->is_haswell
)
705 return calculate_desc(info
, unit_fpu
, 24 /* XXX */, 0, 0,
708 else if (devinfo
->gen
>= 6)
709 return calculate_desc(info
, unit_fpu
, 28 /* XXX */, 0, 0,
715 case SHADER_OPCODE_SHUFFLE
:
716 if (devinfo
->gen
>= 11)
717 return calculate_desc(info
, unit_fpu
, 44 /* XXX */, 0, 0,
719 0, 10 /* XXX */, 6 /* XXX */,
721 else if (devinfo
->gen
>= 8)
722 return calculate_desc(info
, unit_fpu
, 42 /* XXX */, 0, 0,
724 0, 8 /* XXX */, 4 /* XXX */,
726 else if (devinfo
->is_haswell
)
727 return calculate_desc(info
, unit_fpu
, 0, 44 /* XXX */, 0,
729 0, 10 /* XXX */, 6 /* XXX */,
731 else if (devinfo
->gen
>= 6)
732 return calculate_desc(info
, unit_fpu
, 0, 46 /* XXX */, 0,
734 0, 12 /* XXX */, 8 /* XXX */,
739 case SHADER_OPCODE_SEL_EXEC
:
740 if (devinfo
->gen
>= 11)
741 return calculate_desc(info
, unit_fpu
, 10 /* XXX */, 4 /* XXX */, 0,
743 0, 10 /* XXX */, 6 /* XXX */,
745 else if (devinfo
->gen
>= 8)
746 return calculate_desc(info
, unit_fpu
, 8 /* XXX */, 4 /* XXX */, 0,
748 0, 8 /* XXX */, 4 /* XXX */,
750 else if (devinfo
->is_haswell
)
751 return calculate_desc(info
, unit_fpu
, 10 /* XXX */, 4 /* XXX */, 0,
753 0, 10 /* XXX */, 6 /* XXX */,
756 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 4 /* XXX */, 0,
758 0, 12 /* XXX */, 8 /* XXX */,
761 case SHADER_OPCODE_QUAD_SWIZZLE
:
762 if (devinfo
->gen
>= 11)
763 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
765 0, 10 /* XXX */, 6 /* XXX */,
767 else if (devinfo
->gen
>= 8)
768 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
770 0, 8 /* XXX */, 4 /* XXX */,
772 else if (devinfo
->is_haswell
)
773 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
775 0, 10 /* XXX */, 6 /* XXX */,
778 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
780 0, 12 /* XXX */, 8 /* XXX */,
783 case FS_OPCODE_DDY_FINE
:
784 if (devinfo
->gen
>= 11)
785 return calculate_desc(info
, unit_fpu
, 0, 14, 0, 0, 4,
786 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
787 else if (devinfo
->gen
>= 8)
788 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
789 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
790 else if (devinfo
->is_haswell
)
791 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
792 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
794 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
795 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
797 case FS_OPCODE_LOAD_LIVE_CHANNELS
:
798 if (devinfo
->gen
>= 11)
799 return calculate_desc(info
, unit_fpu
, 2 /* XXX */, 0, 0,
801 0, 0, 0, 10 /* XXX */, 0, 0);
802 else if (devinfo
->gen
>= 8)
803 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 0,
805 0, 0, 0, 8 /* XXX */, 0, 0);
809 case VEC4_OPCODE_PACK_BYTES
:
810 if (devinfo
->gen
>= 8)
811 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
813 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
815 else if (devinfo
->is_haswell
)
816 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
818 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
821 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
823 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
826 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9
:
827 if (devinfo
->gen
>= 8)
828 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 0, 0,
830 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
835 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2
:
836 case TCS_OPCODE_GET_INSTANCE_ID
:
837 case TCS_OPCODE_SET_INPUT_URB_OFFSETS
:
838 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS
:
839 case TES_OPCODE_CREATE_INPUT_READ_HEADER
:
840 if (devinfo
->gen
>= 8)
841 return calculate_desc(info
, unit_fpu
, 22 /* XXX */, 0, 0,
843 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
845 else if (devinfo
->is_haswell
)
846 return calculate_desc(info
, unit_fpu
, 26 /* XXX */, 0, 0,
848 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
851 return calculate_desc(info
, unit_fpu
, 30 /* XXX */, 0, 0,
853 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
856 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES
:
857 case TCS_OPCODE_CREATE_BARRIER_HEADER
:
858 if (devinfo
->gen
>= 8)
859 return calculate_desc(info
, unit_fpu
, 32 /* XXX */, 0, 0,
861 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
863 else if (devinfo
->is_haswell
)
864 return calculate_desc(info
, unit_fpu
, 38 /* XXX */, 0, 0,
866 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
868 else if (devinfo
->gen
>= 6)
869 return calculate_desc(info
, unit_fpu
, 44 /* XXX */, 0, 0,
871 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
876 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET
:
877 if (devinfo
->gen
>= 8)
878 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 0, 0,
880 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
882 else if (devinfo
->is_haswell
)
883 return calculate_desc(info
, unit_fpu
, 14 /* XXX */, 0, 0,
885 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
887 else if (devinfo
->gen
>= 7)
888 return calculate_desc(info
, unit_fpu
, 16 /* XXX */, 0, 0,
890 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
895 case SHADER_OPCODE_TEX
:
897 case SHADER_OPCODE_TXD
:
898 case SHADER_OPCODE_TXF
:
899 case SHADER_OPCODE_TXF_LZ
:
900 case SHADER_OPCODE_TXL
:
901 case SHADER_OPCODE_TXL_LZ
:
902 case SHADER_OPCODE_TXF_CMS
:
903 case SHADER_OPCODE_TXF_CMS_W
:
904 case SHADER_OPCODE_TXF_UMS
:
905 case SHADER_OPCODE_TXF_MCS
:
906 case SHADER_OPCODE_TXS
:
907 case SHADER_OPCODE_LOD
:
908 case SHADER_OPCODE_GET_BUFFER_SIZE
:
909 case SHADER_OPCODE_TG4
:
910 case SHADER_OPCODE_TG4_OFFSET
:
911 case SHADER_OPCODE_SAMPLEINFO
:
912 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4
:
913 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16 /* XXX */,
914 8 /* XXX */, 750 /* XXX */, 0, 0,
917 case SHADER_OPCODE_URB_READ_SIMD8
:
918 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT
:
919 case SHADER_OPCODE_URB_WRITE_SIMD8
:
920 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT
:
921 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED
:
922 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT
:
923 case VEC4_OPCODE_URB_READ
:
924 case VS_OPCODE_URB_WRITE
:
925 case GS_OPCODE_URB_WRITE
:
926 case GS_OPCODE_URB_WRITE_ALLOCATE
:
927 case GS_OPCODE_THREAD_END
:
928 case GS_OPCODE_FF_SYNC
:
929 case TCS_OPCODE_URB_WRITE
:
930 case TCS_OPCODE_RELEASE_INPUT
:
931 case TCS_OPCODE_THREAD_END
:
932 return calculate_desc(info
, unit_urb
, 2, 0, 0, 0, 6 /* XXX */,
933 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
935 case SHADER_OPCODE_MEMORY_FENCE
:
936 case SHADER_OPCODE_INTERLOCK
:
938 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
939 if (devinfo
->gen
>= 7)
940 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0, 30 /* XXX */, 0,
941 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
945 case GEN7_SFID_DATAPORT_DATA_CACHE
:
946 case HSW_SFID_DATAPORT_DATA_CACHE_1
:
947 if (devinfo
->gen
>= 7)
948 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0, 30 /* XXX */, 0,
949 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
957 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
958 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
959 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
960 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0, 0, 8 /* XXX */,
961 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
963 case VEC4_OPCODE_UNTYPED_ATOMIC
:
964 if (devinfo
->gen
>= 7)
965 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
966 30 /* XXX */, 400 /* XXX */,
967 10 /* XXX */, 100 /* XXX */, 0, 0,
972 case VEC4_OPCODE_UNTYPED_SURFACE_READ
:
973 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE
:
974 if (devinfo
->gen
>= 7)
975 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
977 10 /* XXX */, 100 /* XXX */, 0, 0,
982 case FS_OPCODE_FB_WRITE
:
983 case FS_OPCODE_FB_READ
:
984 case FS_OPCODE_REP_FB_WRITE
:
985 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0, 0, 450 /* XXX */,
986 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
988 case GS_OPCODE_SVB_WRITE
:
989 if (devinfo
->gen
>= 6)
990 return calculate_desc(info
, unit_dp_rc
, 2 /* XXX */, 0, 0,
992 10 /* XXX */, 300 /* XXX */, 0, 0,
997 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
:
998 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
999 return calculate_desc(info
, unit_dp_cc
, 2, 0, 0, 0, 16 /* XXX */,
1000 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
1002 case VS_OPCODE_PULL_CONSTANT_LOAD
:
1003 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7
:
1004 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16,
1005 8, 750, 0, 0, 2, 0);
1007 case FS_OPCODE_INTERPOLATE_AT_SAMPLE
:
1008 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET
:
1009 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET
:
1010 if (devinfo
->gen
>= 7)
1011 return calculate_desc(info
, unit_pi
, 2, 0, 0, 14 /* XXX */, 0,
1012 0, 90 /* XXX */, 0, 0, 0, 0);
1016 case SHADER_OPCODE_BARRIER
:
1017 if (devinfo
->gen
>= 7)
1018 return calculate_desc(info
, unit_gateway
, 90 /* XXX */, 0, 0,
1024 case CS_OPCODE_CS_TERMINATE
:
1025 if (devinfo
->gen
>= 7)
1026 return calculate_desc(info
, unit_spawner
, 2, 0, 0, 0 /* XXX */, 0,
1027 10 /* XXX */, 0, 0, 0, 0, 0);
1031 case SHADER_OPCODE_SEND
:
1032 switch (info
.sfid
) {
1033 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
1034 if (devinfo
->gen
>= 7) {
1035 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1036 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP
:
1037 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0,
1038 30 /* XXX */, 450 /* XXX */,
1039 10 /* XXX */, 100 /* XXX */,
1040 0, 0, 0, 400 /* XXX */);
1042 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0,
1044 10 /* XXX */, 300 /* XXX */, 0, 0,
1047 } else if (devinfo
->gen
>= 6) {
1048 return calculate_desc(info
, unit_dp_rc
, 2 /* XXX */, 0, 0,
1050 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1054 case BRW_SFID_SAMPLER
: {
1055 if (devinfo
->gen
>= 6)
1056 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16,
1057 8, 750, 0, 0, 2, 0);
1061 case GEN7_SFID_DATAPORT_DATA_CACHE
:
1062 case HSW_SFID_DATAPORT_DATA_CACHE_1
:
1063 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
1064 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1065 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
:
1066 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2
:
1067 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2
:
1068 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP
:
1069 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1070 30 /* XXX */, 400 /* XXX */,
1071 10 /* XXX */, 100 /* XXX */, 0, 0,
1075 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1077 10 /* XXX */, 100 /* XXX */, 0, 0,
1080 } else if (devinfo
->gen
>= 7) {
1081 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1082 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP
:
1083 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1084 30 /* XXX */, 400 /* XXX */,
1085 10 /* XXX */, 100 /* XXX */,
1086 0, 0, 0, 400 /* XXX */);
1088 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1090 10 /* XXX */, 100 /* XXX */, 0, 0,
1100 case SHADER_OPCODE_UNDEF
:
1101 case FS_OPCODE_PLACEHOLDER_HALT
:
1102 case FS_OPCODE_SCHEDULING_FENCE
:
1103 return calculate_desc(info
, unit_null
, 0, 0, 0, 0, 0,
1112 * Model the performance behavior of a stall on the specified dependency
1116 stall_on_dependency(state
&st
, dependency_id id
)
1118 if (id
< ARRAY_SIZE(st
.dep_ready
))
1119 st
.unit_ready
[unit_fe
] = MAX2(st
.unit_ready
[unit_fe
],
1124 * Model the performance behavior of the front-end and back-end while
1125 * executing an instruction with the specified timing information, assuming
1126 * all dependencies are already clear.
1129 execute_instruction(state
&st
, const perf_desc
&perf
)
1131 /* Compute the time at which the front-end will be ready to execute the
1134 st
.unit_ready
[unit_fe
] += perf
.df
;
1136 if (perf
.u
< num_units
) {
1137 /* Wait for the back-end to be ready to execute this instruction. */
1138 st
.unit_ready
[unit_fe
] = MAX2(st
.unit_ready
[unit_fe
],
1139 st
.unit_ready
[perf
.u
]);
1141 /* Compute the time at which the back-end will be ready to execute
1142 * the next instruction, and update the back-end utilization.
1144 st
.unit_ready
[perf
.u
] = st
.unit_ready
[unit_fe
] + perf
.db
;
1145 st
.unit_busy
[perf
.u
] += perf
.db
* st
.weight
;
1150 * Model the performance behavior of a read dependency provided by an
1154 mark_read_dependency(state
&st
, const perf_desc
&perf
, dependency_id id
)
1156 if (id
< ARRAY_SIZE(st
.dep_ready
))
1157 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.ls
;
1161 * Model the performance behavior of a write dependency provided by an
1165 mark_write_dependency(state
&st
, const perf_desc
&perf
, dependency_id id
)
1167 if (id
>= dependency_id_accum0
&& id
< dependency_id_flag0
)
1168 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.la
;
1169 else if (id
>= dependency_id_flag0
&& id
< dependency_id_sbid_wr0
)
1170 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.lf
;
1171 else if (id
< ARRAY_SIZE(st
.dep_ready
))
1172 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.ld
;
1176 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
1179 reg_dependency_id(const gen_device_info
*devinfo
, const backend_reg
&r
,
1182 if (r
.file
== VGRF
) {
1183 const unsigned i
= r
.nr
+ r
.offset
/ REG_SIZE
+ delta
;
1184 assert(i
< dependency_id_mrf0
- dependency_id_grf0
);
1185 return dependency_id(dependency_id_grf0
+ i
);
1187 } else if (r
.file
== FIXED_GRF
) {
1188 const unsigned i
= r
.nr
+ delta
;
1189 assert(i
< dependency_id_mrf0
- dependency_id_grf0
);
1190 return dependency_id(dependency_id_grf0
+ i
);
1192 } else if (r
.file
== MRF
&& devinfo
->gen
>= 7) {
1193 const unsigned i
= GEN7_MRF_HACK_START
+
1194 r
.nr
+ r
.offset
/ REG_SIZE
+ delta
;
1195 assert(i
< dependency_id_mrf0
- dependency_id_grf0
);
1196 return dependency_id(dependency_id_grf0
+ i
);
1198 } else if (r
.file
== MRF
&& devinfo
->gen
< 7) {
1199 const unsigned i
= (r
.nr
& ~BRW_MRF_COMPR4
) +
1200 r
.offset
/ REG_SIZE
+ delta
;
1201 assert(i
< dependency_id_addr0
- dependency_id_mrf0
);
1202 return dependency_id(dependency_id_mrf0
+ i
);
1204 } else if (r
.file
== ARF
&& r
.nr
>= BRW_ARF_ADDRESS
&&
1205 r
.nr
< BRW_ARF_ACCUMULATOR
) {
1207 return dependency_id_addr0
;
1209 } else if (r
.file
== ARF
&& r
.nr
>= BRW_ARF_ACCUMULATOR
&&
1210 r
.nr
< BRW_ARF_FLAG
) {
1211 const unsigned i
= r
.nr
- BRW_ARF_ACCUMULATOR
+ delta
;
1212 assert(i
< dependency_id_flag0
- dependency_id_accum0
);
1213 return dependency_id(dependency_id_accum0
+ i
);
1216 return num_dependency_ids
;
1221 * Return the dependency ID of flag register starting at offset \p i.
1224 flag_dependency_id(unsigned i
)
1226 assert(i
< dependency_id_sbid_wr0
- dependency_id_flag0
);
1227 return dependency_id(dependency_id_flag0
+ i
);
1231 * Return the dependency ID corresponding to the SBID read completion
1232 * condition of a Gen12+ SWSB.
1235 tgl_swsb_rd_dependency_id(tgl_swsb swsb
)
1238 assert(swsb
.sbid
< num_dependency_ids
- dependency_id_sbid_rd0
);
1239 return dependency_id(dependency_id_sbid_rd0
+ swsb
.sbid
);
1241 return num_dependency_ids
;
1246 * Return the dependency ID corresponding to the SBID write completion
1247 * condition of a Gen12+ SWSB.
1250 tgl_swsb_wr_dependency_id(tgl_swsb swsb
)
1253 assert(swsb
.sbid
< dependency_id_sbid_rd0
- dependency_id_sbid_wr0
);
1254 return dependency_id(dependency_id_sbid_wr0
+ swsb
.sbid
);
1256 return num_dependency_ids
;
1261 * Return the implicit accumulator register accessed by channel \p i of the
1265 accum_reg_of_channel(const gen_device_info
*devinfo
,
1266 const backend_instruction
*inst
,
1267 brw_reg_type tx
, unsigned i
)
1269 assert(inst
->reads_accumulator_implicitly() ||
1270 inst
->writes_accumulator_implicitly(devinfo
));
1271 const unsigned offset
= (inst
->group
+ i
) * type_sz(tx
) *
1272 (devinfo
->gen
< 7 || brw_reg_type_is_floating_point(tx
) ? 1 : 2);
1273 return offset
/ REG_SIZE
% 2;
1277 * Model the performance behavior of an FS back-end instruction.
1280 issue_fs_inst(state
&st
, const gen_device_info
*devinfo
,
1281 const backend_instruction
*be_inst
)
1283 const fs_inst
*inst
= static_cast<const fs_inst
*>(be_inst
);
1284 const instruction_info
info(devinfo
, inst
);
1285 const perf_desc perf
= instruction_desc(info
);
1287 /* Stall on any source dependencies. */
1288 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
1289 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
1290 stall_on_dependency(
1291 st
, reg_dependency_id(devinfo
, inst
->src
[i
], j
));
1294 if (inst
->reads_accumulator_implicitly()) {
1295 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1296 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1297 inst
->exec_size
- 1); j
++)
1298 stall_on_dependency(
1299 st
, reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1302 if (is_send(inst
) && inst
->base_mrf
!= -1) {
1303 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
1304 stall_on_dependency(
1305 st
, reg_dependency_id(
1306 devinfo
, brw_uvec_mrf(8, inst
->base_mrf
, 0), j
));
1309 if (const unsigned mask
= inst
->flags_read(devinfo
)) {
1310 for (unsigned i
= 0; i
< sizeof(mask
) * CHAR_BIT
; i
++) {
1311 if (mask
& (1 << i
))
1312 stall_on_dependency(st
, flag_dependency_id(i
));
1316 /* Stall on any write dependencies. */
1317 if (!inst
->no_dd_check
) {
1318 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
1319 for (unsigned j
= 0; j
< regs_written(inst
); j
++)
1320 stall_on_dependency(
1321 st
, reg_dependency_id(devinfo
, inst
->dst
, j
));
1324 if (inst
->writes_accumulator_implicitly(devinfo
)) {
1325 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1326 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1327 inst
->exec_size
- 1); j
++)
1328 stall_on_dependency(
1329 st
, reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1332 if (const unsigned mask
= inst
->flags_written()) {
1333 for (unsigned i
= 0; i
< sizeof(mask
) * CHAR_BIT
; i
++) {
1334 if (mask
& (1 << i
))
1335 stall_on_dependency(st
, flag_dependency_id(i
));
1340 /* Stall on any SBID dependencies. */
1341 if (inst
->sched
.mode
& (TGL_SBID_SET
| TGL_SBID_DST
))
1342 stall_on_dependency(st
, tgl_swsb_wr_dependency_id(inst
->sched
));
1343 else if (inst
->sched
.mode
& TGL_SBID_SRC
)
1344 stall_on_dependency(st
, tgl_swsb_rd_dependency_id(inst
->sched
));
1346 /* Execute the instruction. */
1347 execute_instruction(st
, perf
);
1349 /* Mark any source dependencies. */
1350 if (inst
->is_send_from_grf()) {
1351 for (unsigned i
= 0; i
< inst
->sources
; i
++) {
1352 if (inst
->is_payload(i
)) {
1353 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
1354 mark_read_dependency(
1355 st
, perf
, reg_dependency_id(devinfo
, inst
->src
[i
], j
));
1360 if (is_send(inst
) && inst
->base_mrf
!= -1) {
1361 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
1362 mark_read_dependency(st
, perf
,
1363 reg_dependency_id(devinfo
, brw_uvec_mrf(8, inst
->base_mrf
, 0), j
));
1366 /* Mark any destination dependencies. */
1367 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
1368 for (unsigned j
= 0; j
< regs_written(inst
); j
++) {
1369 mark_write_dependency(st
, perf
,
1370 reg_dependency_id(devinfo
, inst
->dst
, j
));
1374 if (inst
->writes_accumulator_implicitly(devinfo
)) {
1375 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1376 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1377 inst
->exec_size
- 1); j
++)
1378 mark_write_dependency(st
, perf
,
1379 reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1382 if (const unsigned mask
= inst
->flags_written()) {
1383 for (unsigned i
= 0; i
< sizeof(mask
) * CHAR_BIT
; i
++) {
1384 if (mask
& (1 << i
))
1385 mark_write_dependency(st
, perf
, flag_dependency_id(i
));
1389 /* Mark any SBID dependencies. */
1390 if (inst
->sched
.mode
& TGL_SBID_SET
) {
1391 mark_read_dependency(st
, perf
, tgl_swsb_rd_dependency_id(inst
->sched
));
1392 mark_write_dependency(st
, perf
, tgl_swsb_wr_dependency_id(inst
->sched
));
1397 * Model the performance behavior of a VEC4 back-end instruction.
1400 issue_vec4_instruction(state
&st
, const gen_device_info
*devinfo
,
1401 const backend_instruction
*be_inst
)
1403 const vec4_instruction
*inst
=
1404 static_cast<const vec4_instruction
*>(be_inst
);
1405 const instruction_info
info(devinfo
, inst
);
1406 const perf_desc perf
= instruction_desc(info
);
1408 /* Stall on any source dependencies. */
1409 for (unsigned i
= 0; i
< ARRAY_SIZE(inst
->src
); i
++) {
1410 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
1411 stall_on_dependency(
1412 st
, reg_dependency_id(devinfo
, inst
->src
[i
], j
));
1415 if (inst
->reads_accumulator_implicitly()) {
1416 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1417 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1418 inst
->exec_size
- 1); j
++)
1419 stall_on_dependency(
1420 st
, reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1423 if (inst
->base_mrf
!= -1) {
1424 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
1425 stall_on_dependency(
1426 st
, reg_dependency_id(
1427 devinfo
, brw_uvec_mrf(8, inst
->base_mrf
, 0), j
));
1430 if (inst
->reads_flag())
1431 stall_on_dependency(st
, dependency_id_flag0
);
1433 /* Stall on any write dependencies. */
1434 if (!inst
->no_dd_check
) {
1435 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
1436 for (unsigned j
= 0; j
< regs_written(inst
); j
++)
1437 stall_on_dependency(
1438 st
, reg_dependency_id(devinfo
, inst
->dst
, j
));
1441 if (inst
->writes_accumulator_implicitly(devinfo
)) {
1442 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1443 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1444 inst
->exec_size
- 1); j
++)
1445 stall_on_dependency(
1446 st
, reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1449 if (inst
->writes_flag())
1450 stall_on_dependency(st
, dependency_id_flag0
);
1453 /* Execute the instruction. */
1454 execute_instruction(st
, perf
);
1456 /* Mark any source dependencies. */
1457 if (inst
->is_send_from_grf()) {
1458 for (unsigned i
= 0; i
< ARRAY_SIZE(inst
->src
); i
++) {
1459 for (unsigned j
= 0; j
< regs_read(inst
, i
); j
++)
1460 mark_read_dependency(
1461 st
, perf
, reg_dependency_id(devinfo
, inst
->src
[i
], j
));
1465 if (inst
->base_mrf
!= -1) {
1466 for (unsigned j
= 0; j
< inst
->mlen
; j
++)
1467 mark_read_dependency(st
, perf
,
1468 reg_dependency_id(devinfo
, brw_uvec_mrf(8, inst
->base_mrf
, 0), j
));
1471 /* Mark any destination dependencies. */
1472 if (inst
->dst
.file
!= BAD_FILE
&& !inst
->dst
.is_null()) {
1473 for (unsigned j
= 0; j
< regs_written(inst
); j
++) {
1474 mark_write_dependency(st
, perf
,
1475 reg_dependency_id(devinfo
, inst
->dst
, j
));
1479 if (inst
->writes_accumulator_implicitly(devinfo
)) {
1480 for (unsigned j
= accum_reg_of_channel(devinfo
, inst
, info
.tx
, 0);
1481 j
<= accum_reg_of_channel(devinfo
, inst
, info
.tx
,
1482 inst
->exec_size
- 1); j
++)
1483 mark_write_dependency(st
, perf
,
1484 reg_dependency_id(devinfo
, brw_acc_reg(8), j
));
1487 if (inst
->writes_flag())
1488 mark_write_dependency(st
, perf
, dependency_id_flag0
);
1492 * Calculate the maximum possible throughput of the program compatible with
1493 * the cycle-count utilization estimated for each asynchronous unit, in
1494 * threads-per-cycle units.
1497 calculate_thread_throughput(const state
&st
, float busy
)
1499 for (unsigned i
= 0; i
< num_units
; i
++)
1500 busy
= MAX2(busy
, st
.unit_busy
[i
]);
1506 * Estimate the performance of the specified shader.
1509 calculate_performance(performance
&p
, const backend_shader
*s
,
1510 void (*issue_instruction
)(
1511 state
&, const gen_device_info
*,
1512 const backend_instruction
*),
1513 unsigned dispatch_width
)
1515 /* XXX - Plumbing the trip counts from NIR loop analysis would allow us
1516 * to do a better job regarding the loop weights. And some branch
1517 * divergence analysis would allow us to do a better job with
1518 * branching weights.
1520 * In the meantime use values that roughly match the control flow
1521 * weights used elsewhere in the compiler back-end -- Main
1522 * difference is the worst-case scenario branch_weight used for
1523 * SIMD32 which accounts for the possibility of a dynamically
1524 * uniform branch becoming divergent in SIMD32.
1526 * Note that we provide slightly more pessimistic weights on
1527 * Gen12+ for SIMD32, since the effective warp size on that
1528 * platform is 2x the SIMD width due to EU fusion, which increases
1529 * the likelihood of divergent control flow in comparison to
1530 * previous generations, giving narrower SIMD modes a performance
1531 * advantage in several test-cases with non-uniform discard jumps.
1533 const float branch_weight
= (dispatch_width
> 16 ? 1.0 : 0.5);
1534 const float discard_weight
= (dispatch_width
> 16 || s
->devinfo
->gen
< 12 ?
1536 const float loop_weight
= 10;
1537 unsigned discard_count
= 0;
1538 unsigned elapsed
= 0;
1541 foreach_block(block
, s
->cfg
) {
1542 const unsigned elapsed0
= elapsed
;
1544 foreach_inst_in_block(backend_instruction
, inst
, block
) {
1545 const unsigned clock0
= st
.unit_ready
[unit_fe
];
1547 issue_instruction(st
, s
->devinfo
, inst
);
1549 if (inst
->opcode
== BRW_OPCODE_ENDIF
)
1550 st
.weight
/= branch_weight
;
1551 else if (inst
->opcode
== FS_OPCODE_PLACEHOLDER_HALT
&& discard_count
)
1552 st
.weight
/= discard_weight
;
1554 elapsed
+= (st
.unit_ready
[unit_fe
] - clock0
) * st
.weight
;
1556 if (inst
->opcode
== BRW_OPCODE_IF
)
1557 st
.weight
*= branch_weight
;
1558 else if (inst
->opcode
== BRW_OPCODE_DO
)
1559 st
.weight
*= loop_weight
;
1560 else if (inst
->opcode
== BRW_OPCODE_WHILE
)
1561 st
.weight
/= loop_weight
;
1562 else if (inst
->opcode
== FS_OPCODE_DISCARD_JUMP
&& !discard_count
++)
1563 st
.weight
*= discard_weight
;
1566 p
.block_latency
[block
->num
] = elapsed
- elapsed0
;
1569 p
.latency
= elapsed
;
1570 p
.throughput
= dispatch_width
* calculate_thread_throughput(st
, elapsed
);
1574 brw::performance::performance(const fs_visitor
*v
) :
1575 block_latency(new unsigned[v
->cfg
->num_blocks
])
1577 calculate_performance(*this, v
, issue_fs_inst
, v
->dispatch_width
);
1580 brw::performance::performance(const vec4_visitor
*v
) :
1581 block_latency(new unsigned[v
->cfg
->num_blocks
])
1583 calculate_performance(*this, v
, issue_vec4_instruction
, 8);
1586 brw::performance::~performance()
1588 delete[] block_latency
;