2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_eu_compact.c
26 * Instruction compaction is a feature of gm45 and newer hardware that allows
27 * for a smaller instruction encoding.
29 * The instruction cache is on the order of 32KB, and many programs generate
30 * far more instructions than that. The instruction cache is built to barely
31 * keep up with instruction dispatch ability in cache hit cases -- L1
32 * instruction cache misses that still hit in the next level could limit
33 * throughput by around 50%.
35 * The idea of instruction compaction is that most instructions use a tiny
36 * subset of the GPU functionality, so we can encode what would be a 16 byte
37 * instruction in 8 bytes using some lookup tables for various fields.
40 #include "brw_context.h"
43 static const uint32_t gen6_control_index_table
[32] = {
78 static const uint32_t gen6_datatype_table
[32] = {
100 0b001011010110100101,
101 0b001011110110100101,
102 0b001111011110111101,
103 0b001111011110111100,
104 0b001111011110111101,
105 0b001111011110011101,
106 0b001111011110111110,
107 0b001000000000100001,
108 0b001000000000100010,
109 0b001001111111011101,
110 0b001000001110111110,
113 static const uint32_t gen6_subreg_table
[32] = {
148 static const uint32_t gen6_src_index_table
[32] = {
183 static const uint32_t gen7_control_index_table
[32] = {
184 0b0000000000000000010,
185 0b0000100000000000000,
186 0b0000100000000000001,
187 0b0000100000000000010,
188 0b0000100000000000011,
189 0b0000100000000000100,
190 0b0000100000000000101,
191 0b0000100000000000111,
192 0b0000100000000001000,
193 0b0000100000000001001,
194 0b0000100000000001101,
195 0b0000110000000000000,
196 0b0000110000000000001,
197 0b0000110000000000010,
198 0b0000110000000000011,
199 0b0000110000000000100,
200 0b0000110000000000101,
201 0b0000110000000000111,
202 0b0000110000000001001,
203 0b0000110000000001101,
204 0b0000110000000010000,
205 0b0000110000100000000,
206 0b0001000000000000000,
207 0b0001000000000000010,
208 0b0001000000000000100,
209 0b0001000000100000000,
210 0b0010110000000000000,
211 0b0010110000000010000,
212 0b0011000000000000000,
213 0b0011000000100000000,
214 0b0101000000000000000,
215 0b0101000000100000000
218 static const uint32_t gen7_datatype_table
[32] = {
219 0b001000000000000001,
220 0b001000000000100000,
221 0b001000000000100001,
222 0b001000000001100001,
223 0b001000000010111101,
224 0b001000001011111101,
225 0b001000001110100001,
226 0b001000001110100101,
227 0b001000001110111101,
228 0b001000010000100001,
229 0b001000110000100000,
230 0b001000110000100001,
231 0b001001010010100101,
232 0b001001110010100100,
233 0b001001110010100101,
234 0b001111001110111101,
235 0b001111011110011101,
236 0b001111011110111100,
237 0b001111011110111101,
238 0b001111111110111100,
239 0b000000001000001100,
240 0b001000000000111101,
241 0b001000000010100101,
242 0b001000010000100000,
243 0b001001010010100100,
244 0b001001110010000100,
245 0b001010010100001001,
246 0b001101111110111101,
247 0b001111111110111101,
248 0b001011110110101100,
249 0b001010010100101000,
253 static const uint32_t gen7_subreg_table
[32] = {
288 static const uint32_t gen7_src_index_table
[32] = {
323 static const uint32_t *control_index_table
;
324 static const uint32_t *datatype_table
;
325 static const uint32_t *subreg_table
;
326 static const uint32_t *src_index_table
;
329 set_control_index(struct intel_context
*intel
,
330 struct brw_compact_instruction
*dst
,
331 struct brw_instruction
*src
)
333 uint32_t *src_u32
= (uint32_t *)src
;
334 uint32_t uncompacted
= 0;
336 uncompacted
|= ((src_u32
[0] >> 8) & 0xffff) << 0;
337 uncompacted
|= ((src_u32
[0] >> 31) & 0x1) << 16;
338 /* On gen7, the flag register number gets integrated into the control
342 uncompacted
|= ((src_u32
[2] >> 25) & 0x3) << 17;
344 for (int i
= 0; i
< 32; i
++) {
345 if (control_index_table
[i
] == uncompacted
) {
346 dst
->dw0
.control_index
= i
;
355 set_datatype_index(struct brw_compact_instruction
*dst
,
356 struct brw_instruction
*src
)
358 uint32_t uncompacted
= 0;
360 uncompacted
|= src
->bits1
.ud
& 0x7fff;
361 uncompacted
|= (src
->bits1
.ud
>> 29) << 15;
363 for (int i
= 0; i
< 32; i
++) {
364 if (datatype_table
[i
] == uncompacted
) {
365 dst
->dw0
.data_type_index
= i
;
374 set_subreg_index(struct brw_compact_instruction
*dst
,
375 struct brw_instruction
*src
)
377 uint32_t uncompacted
= 0;
379 uncompacted
|= src
->bits1
.da1
.dest_subreg_nr
<< 0;
380 uncompacted
|= src
->bits2
.da1
.src0_subreg_nr
<< 5;
381 uncompacted
|= src
->bits3
.da1
.src1_subreg_nr
<< 10;
383 for (int i
= 0; i
< 32; i
++) {
384 if (subreg_table
[i
] == uncompacted
) {
385 dst
->dw0
.sub_reg_index
= i
;
394 get_src_index(uint32_t uncompacted
,
397 for (int i
= 0; i
< 32; i
++) {
398 if (src_index_table
[i
] == uncompacted
) {
408 set_src0_index(struct brw_compact_instruction
*dst
,
409 struct brw_instruction
*src
)
411 uint32_t compacted
, uncompacted
= 0;
413 uncompacted
|= (src
->bits2
.ud
>> 13) & 0xfff;
415 if (!get_src_index(uncompacted
, &compacted
))
418 dst
->dw0
.src0_index
= compacted
& 0x3;
419 dst
->dw1
.src0_index
= compacted
>> 2;
425 set_src1_index(struct brw_compact_instruction
*dst
,
426 struct brw_instruction
*src
)
428 uint32_t compacted
, uncompacted
= 0;
430 uncompacted
|= (src
->bits3
.ud
>> 13) & 0xfff;
432 if (!get_src_index(uncompacted
, &compacted
))
435 dst
->dw1
.src1_index
= compacted
;
441 * Tries to compact instruction src into dst.
443 * It doesn't modify dst unless src is compactable, which is relied on by
444 * brw_compact_instructions().
447 brw_try_compact_instruction(struct brw_compile
*p
,
448 struct brw_compact_instruction
*dst
,
449 struct brw_instruction
*src
)
451 struct brw_context
*brw
= p
->brw
;
452 struct intel_context
*intel
= &brw
->intel
;
453 struct brw_compact_instruction temp
;
455 if (src
->header
.opcode
== BRW_OPCODE_IF
||
456 src
->header
.opcode
== BRW_OPCODE_ELSE
||
457 src
->header
.opcode
== BRW_OPCODE_ENDIF
||
458 src
->header
.opcode
== BRW_OPCODE_HALT
||
459 src
->header
.opcode
== BRW_OPCODE_DO
||
460 src
->header
.opcode
== BRW_OPCODE_WHILE
) {
461 /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, need
462 * to be able to handle compacted flow control instructions.
467 /* FINISHME: immediates */
468 if (src
->bits1
.da1
.src0_reg_file
== BRW_IMMEDIATE_VALUE
||
469 src
->bits1
.da1
.src1_reg_file
== BRW_IMMEDIATE_VALUE
)
472 memset(&temp
, 0, sizeof(temp
));
474 temp
.dw0
.opcode
= src
->header
.opcode
;
475 temp
.dw0
.debug_control
= src
->header
.debug_control
;
476 if (!set_control_index(intel
, &temp
, src
))
478 if (!set_datatype_index(&temp
, src
))
480 if (!set_subreg_index(&temp
, src
))
482 temp
.dw0
.acc_wr_control
= src
->header
.acc_wr_control
;
483 temp
.dw0
.conditionalmod
= src
->header
.destreg__conditionalmod
;
485 temp
.dw0
.flag_reg_nr
= src
->bits2
.da1
.flag_reg_nr
;
486 temp
.dw0
.cmpt_ctrl
= 1;
487 if (!set_src0_index(&temp
, src
))
489 if (!set_src1_index(&temp
, src
))
491 temp
.dw1
.dst_reg_nr
= src
->bits1
.da1
.dest_reg_nr
;
492 temp
.dw1
.src0_reg_nr
= src
->bits2
.da1
.src0_reg_nr
;
493 temp
.dw1
.src1_reg_nr
= src
->bits3
.da1
.src1_reg_nr
;
501 set_uncompacted_control(struct intel_context
*intel
,
502 struct brw_instruction
*dst
,
503 struct brw_compact_instruction
*src
)
505 uint32_t *dst_u32
= (uint32_t *)dst
;
506 uint32_t uncompacted
= control_index_table
[src
->dw0
.control_index
];
508 dst_u32
[0] |= ((uncompacted
>> 0) & 0xffff) << 8;
509 dst_u32
[0] |= ((uncompacted
>> 16) & 0x1) << 31;
512 dst_u32
[2] |= ((uncompacted
>> 17) & 0x3) << 25;
516 set_uncompacted_datatype(struct brw_instruction
*dst
,
517 struct brw_compact_instruction
*src
)
519 uint32_t uncompacted
= datatype_table
[src
->dw0
.data_type_index
];
521 dst
->bits1
.ud
&= ~(0x7 << 29);
522 dst
->bits1
.ud
|= ((uncompacted
>> 15) & 0x7) << 29;
523 dst
->bits1
.ud
&= ~0x7fff;
524 dst
->bits1
.ud
|= uncompacted
& 0x7fff;
528 set_uncompacted_subreg(struct brw_instruction
*dst
,
529 struct brw_compact_instruction
*src
)
531 uint32_t uncompacted
= subreg_table
[src
->dw0
.sub_reg_index
];
533 dst
->bits1
.da1
.dest_subreg_nr
= (uncompacted
>> 0) & 0x1f;
534 dst
->bits2
.da1
.src0_subreg_nr
= (uncompacted
>> 5) & 0x1f;
535 dst
->bits3
.da1
.src1_subreg_nr
= (uncompacted
>> 10) & 0x1f;
539 set_uncompacted_src0(struct brw_instruction
*dst
,
540 struct brw_compact_instruction
*src
)
542 uint32_t compacted
= src
->dw0
.src0_index
| src
->dw1
.src0_index
<< 2;
543 uint32_t uncompacted
= src_index_table
[compacted
];
545 dst
->bits2
.ud
|= uncompacted
<< 13;
549 set_uncompacted_src1(struct brw_instruction
*dst
,
550 struct brw_compact_instruction
*src
)
552 uint32_t uncompacted
= src_index_table
[src
->dw1
.src1_index
];
554 dst
->bits3
.ud
|= uncompacted
<< 13;
558 brw_uncompact_instruction(struct intel_context
*intel
,
559 struct brw_instruction
*dst
,
560 struct brw_compact_instruction
*src
)
562 memset(dst
, 0, sizeof(*dst
));
564 dst
->header
.opcode
= src
->dw0
.opcode
;
565 dst
->header
.debug_control
= src
->dw0
.debug_control
;
567 set_uncompacted_control(intel
, dst
, src
);
568 set_uncompacted_datatype(dst
, src
);
569 set_uncompacted_subreg(dst
, src
);
570 dst
->header
.acc_wr_control
= src
->dw0
.acc_wr_control
;
571 dst
->header
.destreg__conditionalmod
= src
->dw0
.conditionalmod
;
573 dst
->bits2
.da1
.flag_reg_nr
= src
->dw0
.flag_reg_nr
;
574 set_uncompacted_src0(dst
, src
);
575 set_uncompacted_src1(dst
, src
);
576 dst
->bits1
.da1
.dest_reg_nr
= src
->dw1
.dst_reg_nr
;
577 dst
->bits2
.da1
.src0_reg_nr
= src
->dw1
.src0_reg_nr
;
578 dst
->bits3
.da1
.src1_reg_nr
= src
->dw1
.src1_reg_nr
;
581 void brw_debug_compact_uncompact(struct intel_context
*intel
,
582 struct brw_instruction
*orig
,
583 struct brw_instruction
*uncompacted
)
585 fprintf(stderr
, "Instruction compact/uncompact changed (gen%d):\n",
588 fprintf(stderr
, " before: ");
589 brw_disasm(stderr
, orig
, intel
->gen
);
591 fprintf(stderr
, " after: ");
592 brw_disasm(stderr
, uncompacted
, intel
->gen
);
594 uint32_t *before_bits
= (uint32_t *)orig
;
595 uint32_t *after_bits
= (uint32_t *)uncompacted
;
596 printf(" changed bits:\n");
597 for (int i
= 0; i
< 128; i
++) {
598 uint32_t before
= before_bits
[i
/ 32] & (1 << (i
& 31));
599 uint32_t after
= after_bits
[i
/ 32] & (1 << (i
& 31));
601 if (before
!= after
) {
602 printf(" bit %d, %s to %s\n", i
,
603 before
? "set" : "unset",
604 after
? "set" : "unset");
610 compacted_between(int old_ip
, int old_target_ip
, int *compacted_counts
)
612 int this_compacted_count
= compacted_counts
[old_ip
];
613 int target_compacted_count
= compacted_counts
[old_target_ip
];
614 return target_compacted_count
- this_compacted_count
;
618 update_uip_jip(struct brw_instruction
*insn
, int this_old_ip
,
619 int *compacted_counts
)
623 target_old_ip
= this_old_ip
+ insn
->bits3
.break_cont
.jip
;
624 insn
->bits3
.break_cont
.jip
-= compacted_between(this_old_ip
,
628 target_old_ip
= this_old_ip
+ insn
->bits3
.break_cont
.uip
;
629 insn
->bits3
.break_cont
.uip
-= compacted_between(this_old_ip
,
635 brw_init_compaction_tables(struct intel_context
*intel
)
637 assert(gen6_control_index_table
[ARRAY_SIZE(gen6_control_index_table
) - 1] != 0);
638 assert(gen6_datatype_table
[ARRAY_SIZE(gen6_datatype_table
) - 1] != 0);
639 assert(gen6_subreg_table
[ARRAY_SIZE(gen6_subreg_table
) - 1] != 0);
640 assert(gen6_src_index_table
[ARRAY_SIZE(gen6_src_index_table
) - 1] != 0);
641 assert(gen7_control_index_table
[ARRAY_SIZE(gen6_control_index_table
) - 1] != 0);
642 assert(gen7_datatype_table
[ARRAY_SIZE(gen6_datatype_table
) - 1] != 0);
643 assert(gen7_subreg_table
[ARRAY_SIZE(gen6_subreg_table
) - 1] != 0);
644 assert(gen7_src_index_table
[ARRAY_SIZE(gen6_src_index_table
) - 1] != 0);
646 switch (intel
->gen
) {
648 control_index_table
= gen7_control_index_table
;
649 datatype_table
= gen7_datatype_table
;
650 subreg_table
= gen7_subreg_table
;
651 src_index_table
= gen7_src_index_table
;
654 control_index_table
= gen6_control_index_table
;
655 datatype_table
= gen6_datatype_table
;
656 subreg_table
= gen6_subreg_table
;
657 src_index_table
= gen6_src_index_table
;
665 brw_compact_instructions(struct brw_compile
*p
)
667 struct brw_context
*brw
= p
->brw
;
668 struct intel_context
*intel
= &brw
->intel
;
669 void *store
= p
->store
;
670 /* For an instruction at byte offset 8*i before compaction, this is the number
671 * of compacted instructions that preceded it.
673 int compacted_counts
[p
->next_insn_offset
/ 8];
674 /* For an instruction at byte offset 8*i after compaction, this is the
675 * 8-byte offset it was at before compaction.
677 int old_ip
[p
->next_insn_offset
/ 8];
684 int compacted_count
= 0;
685 for (src_offset
= 0; src_offset
< p
->nr_insn
* 16;) {
686 struct brw_instruction
*src
= store
+ src_offset
;
687 void *dst
= store
+ offset
;
689 old_ip
[offset
/ 8] = src_offset
/ 8;
690 compacted_counts
[src_offset
/ 8] = compacted_count
;
692 struct brw_instruction saved
= *src
;
694 if (!src
->header
.cmpt_control
&&
695 brw_try_compact_instruction(p
, dst
, src
)) {
699 struct brw_instruction uncompacted
;
700 brw_uncompact_instruction(intel
, &uncompacted
, dst
);
701 if (memcmp(&saved
, &uncompacted
, sizeof(uncompacted
))) {
702 brw_debug_compact_uncompact(intel
, &saved
, &uncompacted
);
709 int size
= src
->header
.cmpt_control
? 8 : 16;
711 /* It appears that the end of thread SEND instruction needs to be
712 * aligned, or the GPU hangs.
714 if ((src
->header
.opcode
== BRW_OPCODE_SEND
||
715 src
->header
.opcode
== BRW_OPCODE_SENDC
) &&
716 src
->bits3
.generic
.end_of_thread
&&
718 struct brw_compact_instruction
*align
= store
+ offset
;
719 memset(align
, 0, sizeof(*align
));
720 align
->dw0
.opcode
= BRW_OPCODE_NOP
;
721 align
->dw0
.cmpt_ctrl
= 1;
723 old_ip
[offset
/ 8] = src_offset
/ 8;
724 dst
= store
+ offset
;
727 /* If we didn't compact this instruction, we need to move it down into
730 if (offset
!= src_offset
) {
731 memmove(dst
, src
, size
);
738 /* Fix up control flow offsets. */
739 p
->next_insn_offset
= offset
;
740 for (offset
= 0; offset
< p
->next_insn_offset
;) {
741 struct brw_instruction
*insn
= store
+ offset
;
742 int this_old_ip
= old_ip
[offset
/ 8];
743 int this_compacted_count
= compacted_counts
[this_old_ip
];
744 int target_old_ip
, target_compacted_count
;
746 switch (insn
->header
.opcode
) {
747 case BRW_OPCODE_BREAK
:
748 case BRW_OPCODE_CONTINUE
:
749 case BRW_OPCODE_HALT
:
750 update_uip_jip(insn
, this_old_ip
, compacted_counts
);
754 case BRW_OPCODE_ELSE
:
755 case BRW_OPCODE_ENDIF
:
756 case BRW_OPCODE_WHILE
:
757 if (intel
->gen
== 6) {
758 target_old_ip
= this_old_ip
+ insn
->bits1
.branch_gen6
.jump_count
;
759 target_compacted_count
= compacted_counts
[target_old_ip
];
760 insn
->bits1
.branch_gen6
.jump_count
-= (target_compacted_count
-
761 this_compacted_count
);
763 update_uip_jip(insn
, this_old_ip
, compacted_counts
);
768 if (insn
->header
.cmpt_control
) {
775 /* p->nr_insn is counting the number of uncompacted instructions still, so
776 * divide. We do want to be sure there's a valid instruction in any
777 * alignment padding, so that the next compression pass (for the FS 8/16
778 * compile passes) parses correctly.
780 if (p
->next_insn_offset
& 8) {
781 struct brw_compact_instruction
*align
= store
+ offset
;
782 memset(align
, 0, sizeof(*align
));
783 align
->dw0
.opcode
= BRW_OPCODE_NOP
;
784 align
->dw0
.cmpt_ctrl
= 1;
785 p
->next_insn_offset
+= 8;
787 p
->nr_insn
= p
->next_insn_offset
/ 16;
790 fprintf(stdout
, "dumping compacted program\n");
791 brw_dump_compile(p
, stdout
, 0, p
->next_insn_offset
);
794 for (offset
= 0; offset
< p
->next_insn_offset
;) {
795 struct brw_instruction
*insn
= store
+ offset
;
797 if (insn
->header
.cmpt_control
) {
804 fprintf(stderr
, "%db/%db saved (%d%%)\n", cmp
* 8, offset
+ cmp
* 8,
805 cmp
* 8 * 100 / (offset
+ cmp
* 8));