2 * Copyright © 2020 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
33 * Enumeration representing the various asynchronous units that can run
34 * computations in parallel on behalf of a shader thread.
39 /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
41 /** Extended Math unit (AKA FPU1 on Gen8-11, part of the EU on Gen6+). */
43 /** Sampler shared function. */
45 /** Pixel Interpolator shared function. */
47 /** Unified Return Buffer shared function. */
49 /** Data Port Data Cache shared function. */
51 /** Data Port Render Cache shared function. */
53 /** Data Port Constant Cache shared function. */
55 /** Message Gateway shared function. */
57 /** Thread Spawner shared function. */
61 /** Number of asynchronous units currently tracked. */
63 /** Dummy unit for instructions that don't consume runtime from the above. */
68 * Enumeration representing a computation result another computation can
69 * potentially depend on.
72 /* Register part of the GRF. */
73 dependency_id_grf0
= 0,
74 /* Register part of the MRF. Only used on Gen4-6. */
75 dependency_id_mrf0
= dependency_id_grf0
+ BRW_MAX_GRF
,
76 /* Address register part of the ARF. */
77 dependency_id_addr0
= dependency_id_mrf0
+ 24,
78 /* Accumulator register part of the ARF. */
79 dependency_id_accum0
= dependency_id_addr0
+ 1,
80 /* Flag register part of the ARF. */
81 dependency_id_flag0
= dependency_id_accum0
+ 12,
82 /* SBID token write completion. Only used on Gen12+. */
83 dependency_id_sbid_wr0
= dependency_id_flag0
+ 8,
84 /* SBID token read completion. Only used on Gen12+. */
85 dependency_id_sbid_rd0
= dependency_id_sbid_wr0
+ 16,
86 /* Number of computation dependencies currently tracked. */
87 num_dependency_ids
= dependency_id_sbid_rd0
+ 16
91 * State of our modeling of the program execution.
/**
 * Construct an initial execution state: the empty mem-initializers
 * value-initialize (zero) the unit_ready, dep_ready and unit_busy
 * arrays, and the utilization weight starts at 1.0.
 */
94 state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}
96 * Time at which a given unit will be ready to execute the next
97 * computation, in clock units.
99 unsigned unit_ready
[num_units
];
101 * Time at which an instruction dependent on a given dependency ID will
102 * be ready to execute, in clock units.
104 unsigned dep_ready
[num_dependency_ids
];
106 * Aggregated utilization of a given unit excluding idle cycles,
109 float unit_busy
[num_units
];
111 * Factor of the overhead of a computation accounted for in the
112 * aggregated utilization calculation.
118 * Information derived from an IR instruction used to compute performance
119 * estimates. Allows the timing calculation to work on both FS and VEC4
122 struct instruction_info
{
123 instruction_info(const gen_device_info
*devinfo
, const fs_inst
*inst
) :
124 devinfo(devinfo
), op(inst
->opcode
),
125 td(inst
->dst
.type
), sd(DIV_ROUND_UP(inst
->size_written
, REG_SIZE
)),
126 tx(get_exec_type(inst
)), sx(0), ss(0),
127 sc(has_bank_conflict(devinfo
, inst
) ? sd
: 0),
128 desc(inst
->desc
), sfid(inst
->sfid
)
130 /* We typically want the maximum source size, except for split send
131 * messages which require the total size.
133 if (inst
->opcode
== SHADER_OPCODE_SEND
) {
134 ss
= DIV_ROUND_UP(inst
->size_read(2), REG_SIZE
) +
135 DIV_ROUND_UP(inst
->size_read(3), REG_SIZE
);
137 for (unsigned i
= 0; i
< inst
->sources
; i
++)
138 ss
= MAX2(ss
, DIV_ROUND_UP(inst
->size_read(i
), REG_SIZE
));
141 /* Convert the execution size to GRF units. */
142 sx
= DIV_ROUND_UP(inst
->exec_size
* type_sz(tx
), REG_SIZE
);
144 /* 32x32 integer multiplication has half the usual ALU throughput.
145 * Treat it as double-precision.
147 if ((inst
->opcode
== BRW_OPCODE_MUL
|| inst
->opcode
== BRW_OPCODE_MAD
) &&
148 !brw_reg_type_is_floating_point(tx
) && type_sz(tx
) == 4 &&
149 type_sz(inst
->src
[0].type
) == type_sz(inst
->src
[1].type
))
150 tx
= brw_int_type(8, tx
== BRW_REGISTER_TYPE_D
);
153 instruction_info(const gen_device_info
*devinfo
,
154 const vec4_instruction
*inst
) :
155 devinfo(devinfo
), op(inst
->opcode
),
156 td(inst
->dst
.type
), sd(DIV_ROUND_UP(inst
->size_written
, REG_SIZE
)),
157 tx(get_exec_type(inst
)), sx(0), ss(0), sc(0),
158 desc(inst
->desc
), sfid(inst
->sfid
)
160 /* Compute the maximum source size. */
161 for (unsigned i
= 0; i
< ARRAY_SIZE(inst
->src
); i
++)
162 ss
= MAX2(ss
, DIV_ROUND_UP(inst
->size_read(i
), REG_SIZE
));
164 /* Convert the execution size to GRF units. */
165 sx
= DIV_ROUND_UP(inst
->exec_size
* type_sz(tx
), REG_SIZE
);
167 /* 32x32 integer multiplication has half the usual ALU throughput.
168 * Treat it as double-precision.
170 if ((inst
->opcode
== BRW_OPCODE_MUL
|| inst
->opcode
== BRW_OPCODE_MAD
) &&
171 !brw_reg_type_is_floating_point(tx
) && type_sz(tx
) == 4 &&
172 type_sz(inst
->src
[0].type
) == type_sz(inst
->src
[1].type
))
173 tx
= brw_int_type(8, tx
== BRW_REGISTER_TYPE_D
);
176 /** Device information. */
177 const struct gen_device_info
*devinfo
;
178 /** Instruction opcode. */
180 /** Destination type. */
182 /** Destination size in GRF units. */
184 /** Execution type. */
186 /** Execution size in GRF units. */
190 /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
192 /** Send message descriptor. */
194 /** Send message shared function ID. */
199 * Timing information of an instruction used to estimate the performance of
/**
 * Initialize the timing description of an instruction.
 *
 * \param u  Back-end unit the runtime is accounted to (in addition to
 *           the EU front-end, which is always assumed to be involved).
 * \param df Front-end overhead cycles until the next instruction can issue.
 * \param db Back-end overhead cycles until the next instruction can issue.
 * \param ls Latency until the sources have been read from the register file.
 * \param ld Latency until the regular destination has been written.
 * \param la Latency until the accumulator destination has been written.
 * \param lf Latency until the flag destination has been written.
 */
203 perf_desc(unit u
, int df
, int db
, int ls
, int ld
, int la
, int lf
) :
204 u(u
), df(df
), db(db
), ls(ls
), ld(ld
), la(la
), lf(lf
) {}
207 * Back-end unit its runtime shall be accounted to, in addition to the
208 * EU front-end which is always assumed to be involved.
212 * Overhead cycles from the time that the EU front-end starts executing
213 * the instruction until it's ready to execute the next instruction.
217 * Overhead cycles from the time that the back-end starts executing the
218 * instruction until it's ready to execute the next instruction.
222 * Latency cycles from the time that the back-end starts executing the
223 * instruction until its sources have been read from the register file.
227 * Latency cycles from the time that the back-end starts executing the
228 * instruction until its regular destination has been written to the
233 * Latency cycles from the time that the back-end starts executing the
234 * instruction until its accumulator destination has been written to the
237 * Note that this is an approximation of the real behavior of
238 * accumulating instructions in the hardware: Instead of modeling a pair
239 * of back-to-back accumulating instructions as a first computation with
240 * latency equal to ld followed by another computation with a
241 * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
242 * model the stall as if it occurred at the top of the pipeline, with
243 * the latency of the accumulator computation offset accordingly.
247 * Latency cycles from the time that the back-end starts executing the
248 * instruction until its flag destination has been written to the ARF
255 * Compute the timing information of an instruction based on any relevant
256 * information from the IR and a number of parameters specifying a linear
257 * approximation: Parameter X_Y specifies the derivative of timing X
258 * relative to info field Y, while X_1 specifies the independent term of
259 * the approximation of timing X.
262 calculate_desc(const instruction_info
&info
, unit u
,
263 int df_1
, int df_sd
, int df_sc
,
265 int ls_1
, int ld_1
, int la_1
, int lf_1
,
268 return perf_desc(u
, df_1
+ df_sd
* int(info
.sd
) + df_sc
* int(info
.sc
),
269 db_1
+ db_sx
* int(info
.sx
),
270 ls_1
+ l_ss
* int(info
.ss
),
271 ld_1
+ l_ss
* int(info
.ss
) + l_sd
* int(info
.sd
),
276 * Compute the timing information of an instruction based on any relevant
277 * information from the IR and a number of linear approximation parameters
278 * hard-coded for each IR instruction.
280 * Most timing parameters are obtained from the multivariate linear
281 * regression of a sample of empirical timings measured using the tm0
282 * register (as can be done today by using the shader_time debugging
283 * option). The Gen4-5 math timings are obtained from BSpec Volume 5c.3
284 * "Shared Functions - Extended Math", Section 3.2 "Performance".
285 * Parameters marked XXX shall be considered low-quality, they're possibly
286 * high variance or completely guessed in cases where experimental data was
290 instruction_desc(const instruction_info
&info
)
292 const struct gen_device_info
*devinfo
= info
.devinfo
;
295 case BRW_OPCODE_SYNC
:
305 case BRW_OPCODE_CMPN
:
306 case BRW_OPCODE_F16TO32
:
307 case BRW_OPCODE_BFREV
:
308 case BRW_OPCODE_BFI1
:
311 case BRW_OPCODE_RNDU
:
312 case BRW_OPCODE_RNDD
:
313 case BRW_OPCODE_RNDE
:
314 case BRW_OPCODE_RNDZ
:
316 case BRW_OPCODE_MACH
:
320 case BRW_OPCODE_CBIT
:
321 case BRW_OPCODE_ADDC
:
324 case BRW_OPCODE_SUBB
:
325 case BRW_OPCODE_SAD2
:
326 case BRW_OPCODE_SADA2
:
327 case BRW_OPCODE_LINE
:
329 case SHADER_OPCODE_CLUSTER_BROADCAST
:
330 case FS_OPCODE_DDX_COARSE
:
331 case FS_OPCODE_DDX_FINE
:
332 case FS_OPCODE_DDY_COARSE
:
333 case FS_OPCODE_PIXEL_X
:
334 case FS_OPCODE_PIXEL_Y
:
335 case FS_OPCODE_SET_SAMPLE_ID
:
336 case VEC4_OPCODE_MOV_BYTES
:
337 case VEC4_OPCODE_UNPACK_UNIFORM
:
338 case VEC4_OPCODE_DOUBLE_TO_F32
:
339 case VEC4_OPCODE_DOUBLE_TO_D32
:
340 case VEC4_OPCODE_DOUBLE_TO_U32
:
341 case VEC4_OPCODE_TO_DOUBLE
:
342 case VEC4_OPCODE_PICK_LOW_32BIT
:
343 case VEC4_OPCODE_PICK_HIGH_32BIT
:
344 case VEC4_OPCODE_SET_LOW_32BIT
:
345 case VEC4_OPCODE_SET_HIGH_32BIT
:
346 case GS_OPCODE_SET_DWORD_2
:
347 case GS_OPCODE_SET_WRITE_OFFSET
:
348 case GS_OPCODE_SET_VERTEX_COUNT
:
349 case GS_OPCODE_PREPARE_CHANNEL_MASKS
:
350 case GS_OPCODE_SET_CHANNEL_MASKS
:
351 case GS_OPCODE_GET_INSTANCE_ID
:
352 case GS_OPCODE_SET_PRIMITIVE_ID
:
353 case GS_OPCODE_SVB_SET_DST_INDEX
:
354 case TCS_OPCODE_SRC0_010_IS_ZERO
:
355 case TCS_OPCODE_GET_PRIMITIVE_ID
:
356 case TES_OPCODE_GET_PRIMITIVE_ID
:
357 if (devinfo
->gen
>= 11) {
358 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
359 0, 10, 6 /* XXX */, 14, 0, 0);
360 } else if (devinfo
->gen
>= 8) {
361 if (type_sz(info
.tx
) > 4)
362 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
363 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
365 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
367 } else if (devinfo
->is_haswell
) {
368 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
369 0, 10, 6 /* XXX */, 16, 0, 0);
371 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
372 0, 12, 8 /* XXX */, 18, 0, 0);
379 if (devinfo
->gen
>= 11) {
380 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
382 } else if (devinfo
->gen
>= 8) {
383 if (type_sz(info
.tx
) > 4)
384 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
385 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
387 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
389 } else if (devinfo
->is_haswell
) {
390 if (info
.tx
== BRW_REGISTER_TYPE_F
)
391 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
392 0, 12, 8 /* XXX */, 18, 0, 0);
394 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
395 0, 10, 6 /* XXX */, 16, 0, 0);
396 } else if (devinfo
->gen
>= 7) {
397 if (info
.tx
== BRW_REGISTER_TYPE_F
)
398 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
399 0, 14, 10 /* XXX */, 20, 0, 0);
401 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
402 0, 12, 8 /* XXX */, 18, 0, 0);
404 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 0,
406 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
411 case BRW_OPCODE_BFI2
:
412 case BRW_OPCODE_CSEL
:
413 if (devinfo
->gen
>= 11)
414 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
415 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
416 else if (devinfo
->gen
>= 8)
417 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
418 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
419 else if (devinfo
->is_haswell
)
420 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
421 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
422 else if (devinfo
->gen
>= 7)
423 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
424 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
429 if (devinfo
->gen
>= 11) {
430 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
431 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
432 } else if (devinfo
->gen
>= 8) {
433 if (type_sz(info
.tx
) > 4)
434 return calculate_desc(info
, unit_fpu
, 0, 4, 1, 0, 4,
435 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
437 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
438 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
439 } else if (devinfo
->is_haswell
) {
440 if (info
.tx
== BRW_REGISTER_TYPE_F
)
441 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
442 0, 12, 8 /* XXX */, 18, 0, 0);
444 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
445 0, 10, 6 /* XXX */, 16, 0, 0);
446 } else if (devinfo
->gen
>= 7) {
447 if (info
.tx
== BRW_REGISTER_TYPE_F
)
448 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
449 0, 14, 10 /* XXX */, 20, 0, 0);
451 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
452 0, 12, 8 /* XXX */, 18, 0, 0);
453 } else if (devinfo
->gen
>= 6) {
454 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 1 /* XXX */,
456 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
462 case BRW_OPCODE_F32TO16
:
463 if (devinfo
->gen
>= 11)
464 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
465 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
466 else if (devinfo
->gen
>= 8)
467 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
468 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
469 else if (devinfo
->is_haswell
)
470 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
471 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
472 else if (devinfo
->gen
>= 7)
473 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
474 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
482 if (devinfo
->gen
>= 8)
483 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
484 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
485 else if (devinfo
->is_haswell
)
486 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
487 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
489 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
490 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
492 case SHADER_OPCODE_RCP
:
493 case SHADER_OPCODE_RSQ
:
494 case SHADER_OPCODE_SQRT
:
495 case SHADER_OPCODE_EXP2
:
496 case SHADER_OPCODE_LOG2
:
497 case SHADER_OPCODE_SIN
:
498 case SHADER_OPCODE_COS
:
499 case SHADER_OPCODE_POW
:
500 case SHADER_OPCODE_INT_QUOTIENT
:
501 case SHADER_OPCODE_INT_REMAINDER
:
502 if (devinfo
->gen
>= 6) {
504 case SHADER_OPCODE_RCP
:
505 case SHADER_OPCODE_RSQ
:
506 case SHADER_OPCODE_SQRT
:
507 case SHADER_OPCODE_EXP2
:
508 case SHADER_OPCODE_LOG2
:
509 case SHADER_OPCODE_SIN
:
510 case SHADER_OPCODE_COS
:
511 if (devinfo
->gen
>= 8)
512 return calculate_desc(info
, unit_em
, -2, 4, 0, 0, 4,
514 else if (devinfo
->is_haswell
)
515 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 2,
518 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 2,
521 case SHADER_OPCODE_POW
:
522 if (devinfo
->gen
>= 8)
523 return calculate_desc(info
, unit_em
, -2, 4, 0, 0, 8,
525 else if (devinfo
->is_haswell
)
526 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 4,
529 return calculate_desc(info
, unit_em
, 0, 2, 0, 0, 4,
532 case SHADER_OPCODE_INT_QUOTIENT
:
533 case SHADER_OPCODE_INT_REMAINDER
:
534 return calculate_desc(info
, unit_em
, 2, 0, 0, 26, 0,
535 0, 28 /* XXX */, 0, 0, 0, 0);
542 case SHADER_OPCODE_RCP
:
543 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 8,
546 case SHADER_OPCODE_RSQ
:
547 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 16,
550 case SHADER_OPCODE_INT_QUOTIENT
:
551 case SHADER_OPCODE_SQRT
:
552 case SHADER_OPCODE_LOG2
:
553 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 24,
556 case SHADER_OPCODE_INT_REMAINDER
:
557 case SHADER_OPCODE_EXP2
:
558 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 32,
561 case SHADER_OPCODE_SIN
:
562 case SHADER_OPCODE_COS
:
563 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 48,
566 case SHADER_OPCODE_POW
:
567 return calculate_desc(info
, unit_em
, 2, 0, 0, 0, 64,
576 if (devinfo
->gen
>= 6)
577 return calculate_desc(info
, unit_null
, 0, 0, 0, 0, 0,
580 return calculate_desc(info
, unit_null
, 2 /* XXX */, 0, 0, 0, 0,
584 case BRW_OPCODE_ELSE
:
585 case BRW_OPCODE_ENDIF
:
586 case BRW_OPCODE_WHILE
:
587 case BRW_OPCODE_BREAK
:
588 case BRW_OPCODE_CONTINUE
:
589 case FS_OPCODE_DISCARD_JUMP
:
590 if (devinfo
->gen
>= 8)
591 return calculate_desc(info
, unit_null
, 8, 0, 0, 0, 0,
593 else if (devinfo
->is_haswell
)
594 return calculate_desc(info
, unit_null
, 6, 0, 0, 0, 0,
597 return calculate_desc(info
, unit_null
, 2, 0, 0, 0, 0,
600 case FS_OPCODE_LINTERP
:
601 if (devinfo
->gen
>= 8)
602 return calculate_desc(info
, unit_fpu
, 0, 4, 0, 0, 4,
603 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
604 else if (devinfo
->is_haswell
)
605 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
606 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
608 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
609 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
612 if (devinfo
->gen
>= 8)
613 return calculate_desc(info
, unit_fpu
, 0, 4, 1, 0, 4,
614 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0);
615 else if (devinfo
->is_haswell
)
616 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
617 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
618 else if (devinfo
->gen
>= 6)
619 return calculate_desc(info
, unit_fpu
, 0, 2, 1, 0, 2,
620 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
624 case FS_OPCODE_PACK_HALF_2x16_SPLIT
:
625 if (devinfo
->gen
>= 11)
626 return calculate_desc(info
, unit_fpu
, 20, 6, 0, 0, 6,
627 0, 10 /* XXX */, 6 /* XXX */,
629 else if (devinfo
->gen
>= 8)
630 return calculate_desc(info
, unit_fpu
, 16, 6, 0, 0, 6,
631 0, 8 /* XXX */, 4 /* XXX */,
633 else if (devinfo
->is_haswell
)
634 return calculate_desc(info
, unit_fpu
, 20, 6, 0, 0, 6,
635 0, 10 /* XXX */, 6 /* XXX */,
637 else if (devinfo
->gen
>= 7)
638 return calculate_desc(info
, unit_fpu
, 24, 6, 0, 0, 6,
639 0, 12 /* XXX */, 8 /* XXX */,
644 case SHADER_OPCODE_MOV_INDIRECT
:
645 if (devinfo
->gen
>= 11)
646 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
647 0, 10 /* XXX */, 6 /* XXX */,
649 else if (devinfo
->gen
>= 8)
650 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
651 0, 8 /* XXX */, 4 /* XXX */,
653 else if (devinfo
->is_haswell
)
654 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
655 0, 10 /* XXX */, 6 /* XXX */,
658 return calculate_desc(info
, unit_fpu
, 34, 0, 0, 34, 0,
659 0, 12 /* XXX */, 8 /* XXX */,
662 case SHADER_OPCODE_BROADCAST
:
663 if (devinfo
->gen
>= 11)
664 return calculate_desc(info
, unit_fpu
, 20 /* XXX */, 0, 0, 4, 0,
665 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
666 else if (devinfo
->gen
>= 8)
667 return calculate_desc(info
, unit_fpu
, 18, 0, 0, 4, 0,
668 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
669 else if (devinfo
->is_haswell
)
670 return calculate_desc(info
, unit_fpu
, 18, 0, 0, 4, 0,
671 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
672 else if (devinfo
->gen
>= 7)
673 return calculate_desc(info
, unit_fpu
, 20, 0, 0, 4, 0,
674 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
678 case SHADER_OPCODE_FIND_LIVE_CHANNEL
:
679 if (devinfo
->gen
>= 11)
680 return calculate_desc(info
, unit_fpu
, 2, 0, 0, 2, 0,
681 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
682 else if (devinfo
->gen
>= 8)
683 return calculate_desc(info
, unit_fpu
, 2, 0, 0, 2, 0,
684 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
685 else if (devinfo
->is_haswell
)
686 return calculate_desc(info
, unit_fpu
, 36, 0, 0, 6, 0,
687 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0);
688 else if (devinfo
->gen
>= 7)
689 return calculate_desc(info
, unit_fpu
, 40, 0, 0, 6, 0,
690 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
694 case SHADER_OPCODE_RND_MODE
:
695 case SHADER_OPCODE_FLOAT_CONTROL_MODE
:
696 if (devinfo
->gen
>= 11)
697 return calculate_desc(info
, unit_fpu
, 24 /* XXX */, 0, 0,
700 else if (devinfo
->gen
>= 8)
701 return calculate_desc(info
, unit_fpu
, 20 /* XXX */, 0, 0,
704 else if (devinfo
->is_haswell
)
705 return calculate_desc(info
, unit_fpu
, 24 /* XXX */, 0, 0,
708 else if (devinfo
->gen
>= 6)
709 return calculate_desc(info
, unit_fpu
, 28 /* XXX */, 0, 0,
715 case SHADER_OPCODE_SHUFFLE
:
716 if (devinfo
->gen
>= 11)
717 return calculate_desc(info
, unit_fpu
, 44 /* XXX */, 0, 0,
719 0, 10 /* XXX */, 6 /* XXX */,
721 else if (devinfo
->gen
>= 8)
722 return calculate_desc(info
, unit_fpu
, 42 /* XXX */, 0, 0,
724 0, 8 /* XXX */, 4 /* XXX */,
726 else if (devinfo
->is_haswell
)
727 return calculate_desc(info
, unit_fpu
, 0, 44 /* XXX */, 0,
729 0, 10 /* XXX */, 6 /* XXX */,
731 else if (devinfo
->gen
>= 6)
732 return calculate_desc(info
, unit_fpu
, 0, 46 /* XXX */, 0,
734 0, 12 /* XXX */, 8 /* XXX */,
739 case SHADER_OPCODE_SEL_EXEC
:
740 if (devinfo
->gen
>= 11)
741 return calculate_desc(info
, unit_fpu
, 10 /* XXX */, 4 /* XXX */, 0,
743 0, 10 /* XXX */, 6 /* XXX */,
745 else if (devinfo
->gen
>= 8)
746 return calculate_desc(info
, unit_fpu
, 8 /* XXX */, 4 /* XXX */, 0,
748 0, 8 /* XXX */, 4 /* XXX */,
750 else if (devinfo
->is_haswell
)
751 return calculate_desc(info
, unit_fpu
, 10 /* XXX */, 4 /* XXX */, 0,
753 0, 10 /* XXX */, 6 /* XXX */,
756 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 4 /* XXX */, 0,
758 0, 12 /* XXX */, 8 /* XXX */,
761 case SHADER_OPCODE_QUAD_SWIZZLE
:
762 if (devinfo
->gen
>= 11)
763 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
765 0, 10 /* XXX */, 6 /* XXX */,
767 else if (devinfo
->gen
>= 8)
768 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
770 0, 8 /* XXX */, 4 /* XXX */,
772 else if (devinfo
->is_haswell
)
773 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
775 0, 10 /* XXX */, 6 /* XXX */,
778 return calculate_desc(info
, unit_fpu
, 0 /* XXX */, 8 /* XXX */, 0,
780 0, 12 /* XXX */, 8 /* XXX */,
783 case FS_OPCODE_DDY_FINE
:
784 if (devinfo
->gen
>= 11)
785 return calculate_desc(info
, unit_fpu
, 0, 14, 0, 0, 4,
786 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0);
787 else if (devinfo
->gen
>= 8)
788 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
789 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0);
790 else if (devinfo
->is_haswell
)
791 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
792 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0);
794 return calculate_desc(info
, unit_fpu
, 0, 2, 0, 0, 2,
795 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0);
797 case FS_OPCODE_LOAD_LIVE_CHANNELS
:
798 if (devinfo
->gen
>= 11)
799 return calculate_desc(info
, unit_fpu
, 2 /* XXX */, 0, 0,
801 0, 0, 0, 10 /* XXX */, 0, 0);
802 else if (devinfo
->gen
>= 8)
803 return calculate_desc(info
, unit_fpu
, 0, 2 /* XXX */, 0,
805 0, 0, 0, 8 /* XXX */, 0, 0);
809 case VEC4_OPCODE_PACK_BYTES
:
810 if (devinfo
->gen
>= 8)
811 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
813 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
815 else if (devinfo
->is_haswell
)
816 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
818 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
821 return calculate_desc(info
, unit_fpu
, 4 /* XXX */, 0, 0,
823 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
826 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9
:
827 if (devinfo
->gen
>= 8)
828 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 0, 0,
830 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
835 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2
:
836 case TCS_OPCODE_GET_INSTANCE_ID
:
837 case TCS_OPCODE_SET_INPUT_URB_OFFSETS
:
838 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS
:
839 case TES_OPCODE_CREATE_INPUT_READ_HEADER
:
840 if (devinfo
->gen
>= 8)
841 return calculate_desc(info
, unit_fpu
, 22 /* XXX */, 0, 0,
843 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
845 else if (devinfo
->is_haswell
)
846 return calculate_desc(info
, unit_fpu
, 26 /* XXX */, 0, 0,
848 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
851 return calculate_desc(info
, unit_fpu
, 30 /* XXX */, 0, 0,
853 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
856 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES
:
857 case TCS_OPCODE_CREATE_BARRIER_HEADER
:
858 if (devinfo
->gen
>= 8)
859 return calculate_desc(info
, unit_fpu
, 32 /* XXX */, 0, 0,
861 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
863 else if (devinfo
->is_haswell
)
864 return calculate_desc(info
, unit_fpu
, 38 /* XXX */, 0, 0,
866 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
868 else if (devinfo
->gen
>= 6)
869 return calculate_desc(info
, unit_fpu
, 44 /* XXX */, 0, 0,
871 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
876 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET
:
877 if (devinfo
->gen
>= 8)
878 return calculate_desc(info
, unit_fpu
, 12 /* XXX */, 0, 0,
880 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */,
882 else if (devinfo
->is_haswell
)
883 return calculate_desc(info
, unit_fpu
, 14 /* XXX */, 0, 0,
885 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */,
887 else if (devinfo
->gen
>= 7)
888 return calculate_desc(info
, unit_fpu
, 16 /* XXX */, 0, 0,
890 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */,
895 case SHADER_OPCODE_TEX
:
897 case SHADER_OPCODE_TXD
:
898 case SHADER_OPCODE_TXF
:
899 case SHADER_OPCODE_TXF_LZ
:
900 case SHADER_OPCODE_TXL
:
901 case SHADER_OPCODE_TXL_LZ
:
902 case SHADER_OPCODE_TXF_CMS
:
903 case SHADER_OPCODE_TXF_CMS_W
:
904 case SHADER_OPCODE_TXF_UMS
:
905 case SHADER_OPCODE_TXF_MCS
:
906 case SHADER_OPCODE_TXS
:
907 case SHADER_OPCODE_LOD
:
908 case SHADER_OPCODE_GET_BUFFER_SIZE
:
909 case SHADER_OPCODE_TG4
:
910 case SHADER_OPCODE_TG4_OFFSET
:
911 case SHADER_OPCODE_SAMPLEINFO
:
912 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4
:
913 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16 /* XXX */,
914 8 /* XXX */, 750 /* XXX */, 0, 0,
917 case SHADER_OPCODE_URB_READ_SIMD8
:
918 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT
:
919 case SHADER_OPCODE_URB_WRITE_SIMD8
:
920 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT
:
921 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED
:
922 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT
:
923 case VEC4_OPCODE_URB_READ
:
924 case VS_OPCODE_URB_WRITE
:
925 case GS_OPCODE_URB_WRITE
:
926 case GS_OPCODE_URB_WRITE_ALLOCATE
:
927 case GS_OPCODE_THREAD_END
:
928 case GS_OPCODE_FF_SYNC
:
929 case TCS_OPCODE_URB_WRITE
:
930 case TCS_OPCODE_RELEASE_INPUT
:
931 case TCS_OPCODE_THREAD_END
:
932 return calculate_desc(info
, unit_urb
, 2, 0, 0, 0, 6 /* XXX */,
933 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0);
935 case SHADER_OPCODE_MEMORY_FENCE
:
936 case SHADER_OPCODE_INTERLOCK
:
937 if (devinfo
->gen
>= 7)
938 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0, 30 /* XXX */, 0,
939 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
943 case SHADER_OPCODE_GEN4_SCRATCH_READ
:
944 case SHADER_OPCODE_GEN4_SCRATCH_WRITE
:
945 case SHADER_OPCODE_GEN7_SCRATCH_READ
:
946 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0, 0, 8 /* XXX */,
947 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
949 case VEC4_OPCODE_UNTYPED_ATOMIC
:
950 if (devinfo
->gen
>= 7)
951 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
952 30 /* XXX */, 400 /* XXX */,
953 10 /* XXX */, 100 /* XXX */, 0, 0,
958 case VEC4_OPCODE_UNTYPED_SURFACE_READ
:
959 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE
:
960 if (devinfo
->gen
>= 7)
961 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
963 10 /* XXX */, 100 /* XXX */, 0, 0,
968 case FS_OPCODE_FB_WRITE
:
969 case FS_OPCODE_FB_READ
:
970 case FS_OPCODE_REP_FB_WRITE
:
971 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0, 0, 450 /* XXX */,
972 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
974 case GS_OPCODE_SVB_WRITE
:
975 if (devinfo
->gen
>= 6)
976 return calculate_desc(info
, unit_dp_rc
, 2 /* XXX */, 0, 0,
978 10 /* XXX */, 300 /* XXX */, 0, 0,
983 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
:
984 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7
:
985 return calculate_desc(info
, unit_dp_cc
, 2, 0, 0, 0, 16 /* XXX */,
986 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0);
988 case VS_OPCODE_PULL_CONSTANT_LOAD
:
989 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7
:
990 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16,
993 case FS_OPCODE_INTERPOLATE_AT_SAMPLE
:
994 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET
:
995 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET
:
996 if (devinfo
->gen
>= 7)
997 return calculate_desc(info
, unit_pi
, 2, 0, 0, 14 /* XXX */, 0,
998 0, 90 /* XXX */, 0, 0, 0, 0);
1002 case SHADER_OPCODE_BARRIER
:
1003 if (devinfo
->gen
>= 7)
1004 return calculate_desc(info
, unit_gateway
, 90 /* XXX */, 0, 0,
1010 case CS_OPCODE_CS_TERMINATE
:
1011 if (devinfo
->gen
>= 7)
1012 return calculate_desc(info
, unit_spawner
, 2, 0, 0, 0 /* XXX */, 0,
1013 10 /* XXX */, 0, 0, 0, 0, 0);
1017 case SHADER_OPCODE_SEND
:
1018 switch (info
.sfid
) {
1019 case GEN6_SFID_DATAPORT_RENDER_CACHE
:
1020 if (devinfo
->gen
>= 7) {
1021 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1022 case GEN7_DATAPORT_RC_TYPED_ATOMIC_OP
:
1023 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0,
1024 30 /* XXX */, 450 /* XXX */,
1025 10 /* XXX */, 100 /* XXX */,
1026 0, 0, 0, 400 /* XXX */);
1028 return calculate_desc(info
, unit_dp_rc
, 2, 0, 0,
1030 10 /* XXX */, 300 /* XXX */, 0, 0,
1033 } else if (devinfo
->gen
>= 6) {
1034 return calculate_desc(info
, unit_dp_rc
, 2 /* XXX */, 0, 0,
1036 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0);
1040 case BRW_SFID_SAMPLER
: {
1041 if (devinfo
->gen
>= 6)
1042 return calculate_desc(info
, unit_sampler
, 2, 0, 0, 0, 16,
1043 8, 750, 0, 0, 2, 0);
1047 case GEN7_SFID_DATAPORT_DATA_CACHE
:
1048 case HSW_SFID_DATAPORT_DATA_CACHE_1
:
1049 if (devinfo
->gen
>= 8 || devinfo
->is_haswell
) {
1050 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1051 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
:
1052 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2
:
1053 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2
:
1054 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP
:
1055 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1056 30 /* XXX */, 400 /* XXX */,
1057 10 /* XXX */, 100 /* XXX */, 0, 0,
1061 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1063 10 /* XXX */, 100 /* XXX */, 0, 0,
1066 } else if (devinfo
->gen
>= 7) {
1067 switch (brw_dp_desc_msg_type(devinfo
, info
.desc
)) {
1068 case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP
:
1069 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1070 30 /* XXX */, 400 /* XXX */,
1071 10 /* XXX */, 100 /* XXX */,
1072 0, 0, 0, 400 /* XXX */);
1074 return calculate_desc(info
, unit_dp_dc
, 2, 0, 0,
1076 10 /* XXX */, 100 /* XXX */, 0, 0,
1086 case SHADER_OPCODE_UNDEF
:
1087 case FS_OPCODE_PLACEHOLDER_HALT
:
1088 case FS_OPCODE_SCHEDULING_FENCE
:
1089 return calculate_desc(info
, unit_null
, 0, 0, 0, 0, 0,
1098 * Model the performance behavior of a stall on the specified dependency
1102 stall_on_dependency(state
&st
, dependency_id id
)
1104 if (id
< ARRAY_SIZE(st
.dep_ready
))
1105 st
.unit_ready
[unit_fe
] = MAX2(st
.unit_ready
[unit_fe
],
1110 * Model the performance behavior of the front-end and back-end while
1111 * executing an instruction with the specified timing information, assuming
1112 * all dependencies are already clear.
1115 execute_instruction(state
&st
, const perf_desc
&perf
)
1117 /* Compute the time at which the front-end will be ready to execute the
1120 st
.unit_ready
[unit_fe
] += perf
.df
;
1122 if (perf
.u
< num_units
) {
1123 /* Wait for the back-end to be ready to execute this instruction. */
1124 st
.unit_ready
[unit_fe
] = MAX2(st
.unit_ready
[unit_fe
],
1125 st
.unit_ready
[perf
.u
]);
1127 /* Compute the time at which the back-end will be ready to execute
1128 * the next instruction, and update the back-end utilization.
1130 st
.unit_ready
[perf
.u
] = st
.unit_ready
[unit_fe
] + perf
.db
;
1131 st
.unit_busy
[perf
.u
] += perf
.db
* st
.weight
;
1136 * Model the performance behavior of a read dependency provided by an
1140 mark_read_dependency(state
&st
, const perf_desc
&perf
, dependency_id id
)
1142 if (id
< ARRAY_SIZE(st
.dep_ready
))
1143 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.ls
;
1147 * Model the performance behavior of a write dependency provided by an
1151 mark_write_dependency(state
&st
, const perf_desc
&perf
, dependency_id id
)
1153 if (id
>= dependency_id_accum0
&& id
< dependency_id_flag0
)
1154 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.la
;
1155 else if (id
>= dependency_id_flag0
&& id
< dependency_id_sbid_wr0
)
1156 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.lf
;
1157 else if (id
< ARRAY_SIZE(st
.dep_ready
))
1158 st
.dep_ready
[id
] = st
.unit_ready
[unit_fe
] + perf
.ld
;
/**
 * Return the dependency ID of a backend_reg, offset by \p delta GRFs.
 *
 * Maps each register file into a disjoint range of dependency IDs; any
 * register not covered below yields the num_dependency_ids sentinel, which
 * the stall/mark helpers ignore.
 */
dependency_id
reg_dependency_id(const gen_device_info *devinfo, const backend_reg &r,
                  const int delta)
{
   if (r.file == VGRF) {
      /* Virtual GRFs share the physical GRF ID range. */
      const unsigned i = r.nr + r.offset / REG_SIZE + delta;
      assert(i < dependency_id_mrf0 - dependency_id_grf0);
      return dependency_id(dependency_id_grf0 + i);

   } else if (r.file == FIXED_GRF) {
      const unsigned i = r.nr + delta;
      assert(i < dependency_id_mrf0 - dependency_id_grf0);
      return dependency_id(dependency_id_grf0 + i);

   } else if (r.file == MRF && devinfo->gen >= 7) {
      /* On Gen7+ MRFs are implemented as GRFs starting at
       * GEN7_MRF_HACK_START, so map them into the GRF ID range.
       */
      const unsigned i = GEN7_MRF_HACK_START +
                         r.nr + r.offset / REG_SIZE + delta;
      assert(i < dependency_id_mrf0 - dependency_id_grf0);
      return dependency_id(dependency_id_grf0 + i);

   } else if (r.file == MRF && devinfo->gen < 7) {
      /* Strip the COMPR4 modifier bit out of the register number. */
      const unsigned i = (r.nr & ~BRW_MRF_COMPR4) +
                         r.offset / REG_SIZE + delta;
      assert(i < dependency_id_addr0 - dependency_id_mrf0);
      return dependency_id(dependency_id_mrf0 + i);

   } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS &&
              r.nr < BRW_ARF_ACCUMULATOR) {
      /* Single address register -- no per-GRF sub-IDs. */
      return dependency_id_addr0;

   } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR &&
              r.nr < BRW_ARF_FLAG) {
      const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta;
      assert(i < dependency_id_flag0 - dependency_id_accum0);
      return dependency_id(dependency_id_accum0 + i);

   } else {
      /* Any other file isn't tracked by the model. */
      return num_dependency_ids;
   }
}
1207 * Return the dependency ID of flag register starting at offset \p i.
1210 flag_dependency_id(unsigned i
)
1212 assert(i
< dependency_id_sbid_wr0
- dependency_id_flag0
);
1213 return dependency_id(dependency_id_flag0
+ i
);
1217 * Return the dependency ID corresponding to the SBID read completion
1218 * condition of a Gen12+ SWSB.
1221 tgl_swsb_rd_dependency_id(tgl_swsb swsb
)
1224 assert(swsb
.sbid
< num_dependency_ids
- dependency_id_sbid_rd0
);
1225 return dependency_id(dependency_id_sbid_rd0
+ swsb
.sbid
);
1227 return num_dependency_ids
;
1232 * Return the dependency ID corresponding to the SBID write completion
1233 * condition of a Gen12+ SWSB.
1236 tgl_swsb_wr_dependency_id(tgl_swsb swsb
)
1239 assert(swsb
.sbid
< dependency_id_sbid_rd0
- dependency_id_sbid_wr0
);
1240 return dependency_id(dependency_id_sbid_wr0
+ swsb
.sbid
);
1242 return num_dependency_ids
;
/**
 * Return the implicit accumulator register accessed by channel \p i of the
 * instruction.
 */
unsigned
accum_reg_of_channel(const gen_device_info *devinfo,
                     const backend_instruction *inst,
                     brw_reg_type tx, unsigned i)
{
   /* Only meaningful for instructions that implicitly touch the
    * accumulator.
    */
   assert(inst->reads_accumulator_implicitly() ||
          inst->writes_accumulator_implicitly(devinfo));
   /* Byte offset of the channel into the accumulator file: each channel
    * covers type_sz(tx) bytes, doubled for non-floating-point types on
    * Gen7+.  The "% 2" folds the offset into one of two accumulator
    * registers.
    */
   const unsigned offset = (inst->group + i) * type_sz(tx) *
      (devinfo->gen < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2);
   return offset / REG_SIZE % 2;
}
/**
 * Model the performance behavior of an FS back-end instruction.
 *
 * Processing order matters: first stall the front-end on every dependency
 * the instruction consumes, then execute it, then mark the dependencies it
 * produces with their ready times.
 */
void
issue_fs_inst(state &st, const gen_device_info *devinfo,
              const backend_instruction *be_inst)
{
   const fs_inst *inst = static_cast<const fs_inst *>(be_inst);
   const instruction_info info(devinfo, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < inst->sources; i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   if (inst->reads_accumulator_implicitly()) {
      /* Iterate over the accumulator registers covered by the first and
       * last channel of the instruction.
       */
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   /* Header-based SEND payloads live in the MRF space. */
   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   if (const unsigned mask = inst->flags_read(devinfo)) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            stall_on_dependency(st, flag_dependency_id(i));
      }
   }

   /* Stall on any write dependencies. */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (const unsigned mask = inst->flags_written()) {
         for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
            if (mask & (1 << i))
               stall_on_dependency(st, flag_dependency_id(i));
         }
      }
   }

   /* Stall on any SBID dependencies (Gen12+ software scoreboard). */
   if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST))
      stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched));
   else if (inst->sched.mode & TGL_SBID_SRC)
      stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched));

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies: payload sources of a SEND remain busy
    * until the message has been read.
    */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->is_payload(i)) {
            for (unsigned j = 0; j < regs_read(inst, i); j++)
               mark_read_dependency(
                  st, perf, reg_dependency_id(devinfo, inst->src[i], j));
         }
      }
   }

   if (is_send(inst) && inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(
            st, perf,
            reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (const unsigned mask = inst->flags_written()) {
      for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) {
         if (mask & (1 << i))
            mark_write_dependency(st, perf, flag_dependency_id(i));
      }
   }

   /* Mark any SBID dependencies: a SET allocates both the read and write
    * completion conditions of the SBID.
    */
   if (inst->sched.mode & TGL_SBID_SET) {
      mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched));
      mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched));
   }
}
/**
 * Model the performance behavior of a VEC4 back-end instruction.
 *
 * Same stall -> execute -> mark pipeline as issue_fs_inst(), but with the
 * VEC4 IR's fixed-size source array, single flag dependency and no SBID
 * (pre-Gen12) handling.
 */
void
issue_vec4_instruction(state &st, const gen_device_info *devinfo,
                       const backend_instruction *be_inst)
{
   const vec4_instruction *inst =
      static_cast<const vec4_instruction *>(be_inst);
   const instruction_info info(devinfo, inst);
   const perf_desc perf = instruction_desc(info);

   /* Stall on any source dependencies. */
   for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
      for (unsigned j = 0; j < regs_read(inst, i); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, inst->src[i], j));
   }

   if (inst->reads_accumulator_implicitly()) {
      /* Iterate over the accumulator registers covered by the first and
       * last channel of the instruction.
       */
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         stall_on_dependency(
            st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         stall_on_dependency(
            st, reg_dependency_id(
               devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   if (inst->reads_flag())
      stall_on_dependency(st, dependency_id_flag0);

   /* Stall on any write dependencies. */
   if (!inst->no_dd_check) {
      if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
         for (unsigned j = 0; j < regs_written(inst); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, inst->dst, j));
      }

      if (inst->writes_accumulator_implicitly(devinfo)) {
         for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
              j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                        inst->exec_size - 1); j++)
            stall_on_dependency(
               st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
      }

      if (inst->writes_flag())
         stall_on_dependency(st, dependency_id_flag0);
   }

   /* Execute the instruction. */
   execute_instruction(st, perf);

   /* Mark any source dependencies. */
   if (inst->is_send_from_grf()) {
      for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
         for (unsigned j = 0; j < regs_read(inst, i); j++)
            mark_read_dependency(
               st, perf, reg_dependency_id(devinfo, inst->src[i], j));
      }
   }

   if (inst->base_mrf != -1) {
      for (unsigned j = 0; j < inst->mlen; j++)
         mark_read_dependency(
            st, perf,
            reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
   }

   /* Mark any destination dependencies. */
   if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
      for (unsigned j = 0; j < regs_written(inst); j++) {
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, inst->dst, j));
      }
   }

   if (inst->writes_accumulator_implicitly(devinfo)) {
      for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
           j <= accum_reg_of_channel(devinfo, inst, info.tx,
                                     inst->exec_size - 1); j++)
         mark_write_dependency(st, perf,
                               reg_dependency_id(devinfo, brw_acc_reg(8), j));
   }

   if (inst->writes_flag())
      mark_write_dependency(st, perf, dependency_id_flag0);
}
1478 * Calculate the maximum possible throughput of the program compatible with
1479 * the cycle-count utilization estimated for each asynchronous unit, in
1480 * threads-per-cycle units.
1483 calculate_thread_throughput(const state
&st
, float busy
)
1485 for (unsigned i
= 0; i
< num_units
; i
++)
1486 busy
= MAX2(busy
, st
.unit_busy
[i
]);
/**
 * Estimate the performance of the specified shader.
 *
 * Walks every instruction of every block through \p issue_instruction,
 * accumulating weighted front-end cycles into per-block and whole-program
 * latencies, and derives the thread throughput from the per-unit
 * utilization.
 */
void
calculate_performance(performance &p, const backend_shader *s,
                      void (*issue_instruction)(
                         state &, const gen_device_info *,
                         const backend_instruction *),
                      unsigned dispatch_width)
{
   /* XXX - Plumbing the trip counts from NIR loop analysis would allow us
    *       to do a better job regarding the loop weights.  And some branch
    *       divergence analysis would allow us to do a better job with
    *       branching weights.
    *
    *       In the meantime use values that roughly match the control flow
    *       weights used elsewhere in the compiler back-end -- Main
    *       difference is the worst-case scenario branch_weight used for
    *       SIMD32 which accounts for the possibility of a dynamically
    *       uniform branch becoming divergent in SIMD32.
    */
   const float branch_weight = (dispatch_width > 16 ? 1.0 : 0.5);
   const float loop_weight = 10;
   unsigned elapsed = 0;
   state st;

   foreach_block(block, s->cfg) {
      const unsigned elapsed0 = elapsed;

      foreach_inst_in_block(backend_instruction, inst, block) {
         /* Front-end clock before this instruction was issued. */
         const unsigned clock0 = st.unit_ready[unit_fe];

         issue_instruction(st, s->devinfo, inst);

         /* ENDIF closes the conditional region, so undo its weight before
          * accounting for this instruction's cycles.
          */
         if (inst->opcode == BRW_OPCODE_ENDIF)
            st.weight /= branch_weight;

         /* Weighted front-end cycles consumed by this instruction. */
         elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight;

         /* Scale subsequent instructions by the estimated execution
          * frequency of the region being entered (or left, for WHILE).
          */
         if (inst->opcode == BRW_OPCODE_IF)
            st.weight *= branch_weight;
         else if (inst->opcode == BRW_OPCODE_DO)
            st.weight *= loop_weight;
         else if (inst->opcode == BRW_OPCODE_WHILE)
            st.weight /= loop_weight;
      }

      p.block_latency[block->num] = elapsed - elapsed0;
   }

   p.latency = elapsed;
   p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed);
}
/** Estimate the performance of the given scalar (FS back-end) shader. */
brw::performance::performance(const fs_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
/**
 * Estimate the performance of the given VEC4 back-end shader.  The VEC4
 * back-end is modeled at a fixed dispatch width of 8.
 */
brw::performance::performance(const vec4_visitor *v) :
   block_latency(new unsigned[v->cfg->num_blocks])
{
   calculate_performance(*this, v, issue_vec4_instruction, 8);
}
/* Release the per-block latency array allocated by the constructors. */
brw::performance::~performance()
{
   delete[] block_latency;
}