2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
/** @file brw_eu_compact.c
 *
 * Instruction compaction is a feature of gm45 and newer hardware that allows
 * for a smaller instruction encoding.
 *
 * The instruction cache is on the order of 32KB, and many programs generate
 * far more instructions than that.  The instruction cache is built to barely
 * keep up with instruction dispatch ability in cache hit cases -- L1
 * instruction cache misses that still hit in the next level could limit
 * throughput by around 50%.
 *
 * The idea of instruction compaction is that most instructions use a tiny
 * subset of the GPU functionality, so we can encode what would be a 16 byte
 * instruction in 8 bytes using some lookup tables for various fields.
 */
40 #include "brw_context.h"
43 static const uint32_t gen6_control_index_table
[32] = {
78 static const uint32_t gen6_datatype_table
[32] = {
100 0b001011010110100101,
101 0b001011110110100101,
102 0b001111011110111101,
103 0b001111011110111100,
104 0b001111011110111101,
105 0b001111011110011101,
106 0b001111011110111110,
107 0b001000000000100001,
108 0b001000000000100010,
109 0b001001111111011101,
110 0b001000001110111110,
113 static const uint32_t gen6_subreg_table
[32] = {
148 static const uint32_t gen6_src_index_table
[32] = {
183 static const uint32_t gen7_control_index_table
[32] = {
184 0b0000000000000000010,
185 0b0000100000000000000,
186 0b0000100000000000001,
187 0b0000100000000000010,
188 0b0000100000000000011,
189 0b0000100000000000100,
190 0b0000100000000000101,
191 0b0000100000000000111,
192 0b0000100000000001000,
193 0b0000100000000001001,
194 0b0000100000000001101,
195 0b0000110000000000000,
196 0b0000110000000000001,
197 0b0000110000000000010,
198 0b0000110000000000011,
199 0b0000110000000000100,
200 0b0000110000000000101,
201 0b0000110000000000111,
202 0b0000110000000001001,
203 0b0000110000000001101,
204 0b0000110000000010000,
205 0b0000110000100000000,
206 0b0001000000000000000,
207 0b0001000000000000010,
208 0b0001000000000000100,
209 0b0001000000100000000,
210 0b0010110000000000000,
211 0b0010110000000010000,
212 0b0011000000000000000,
213 0b0011000000100000000,
214 0b0101000000000000000,
215 0b0101000000100000000
218 static const uint32_t gen7_datatype_table
[32] = {
219 0b001000000000000001,
220 0b001000000000100000,
221 0b001000000000100001,
222 0b001000000001100001,
223 0b001000000010111101,
224 0b001000001011111101,
225 0b001000001110100001,
226 0b001000001110100101,
227 0b001000001110111101,
228 0b001000010000100001,
229 0b001000110000100000,
230 0b001000110000100001,
231 0b001001010010100101,
232 0b001001110010100100,
233 0b001001110010100101,
234 0b001111001110111101,
235 0b001111011110011101,
236 0b001111011110111100,
237 0b001111011110111101,
238 0b001111111110111100,
239 0b000000001000001100,
240 0b001000000000111101,
241 0b001000000010100101,
242 0b001000010000100000,
243 0b001001010010100100,
244 0b001001110010000100,
245 0b001010010100001001,
246 0b001101111110111101,
247 0b001111111110111101,
248 0b001011110110101100,
249 0b001010010100101000,
253 static const uint32_t gen7_subreg_table
[32] = {
288 static const uint32_t gen7_src_index_table
[32] = {
/* Lookup tables currently in effect.  brw_init_compaction_tables() points
 * these at either the gen6_* or the gen7_* tables defined above; the
 * set_*_index() helpers search them and the set_uncompacted_*() helpers
 * index into them.
 */
static const uint32_t *control_index_table;
static const uint32_t *datatype_table;
static const uint32_t *subreg_table;
static const uint32_t *src_index_table;
329 set_control_index(struct brw_context
*brw
,
330 struct brw_compact_instruction
*dst
,
331 struct brw_instruction
*src
)
333 uint32_t *src_u32
= (uint32_t *)src
;
334 uint32_t uncompacted
= 0;
336 uncompacted
|= ((src_u32
[0] >> 8) & 0xffff) << 0;
337 uncompacted
|= ((src_u32
[0] >> 31) & 0x1) << 16;
338 /* On gen7, the flag register number gets integrated into the control
342 uncompacted
|= ((src_u32
[2] >> 25) & 0x3) << 17;
344 for (int i
= 0; i
< 32; i
++) {
345 if (control_index_table
[i
] == uncompacted
) {
346 dst
->dw0
.control_index
= i
;
355 set_datatype_index(struct brw_compact_instruction
*dst
,
356 struct brw_instruction
*src
)
358 uint32_t uncompacted
= 0;
360 uncompacted
|= src
->bits1
.ud
& 0x7fff;
361 uncompacted
|= (src
->bits1
.ud
>> 29) << 15;
363 for (int i
= 0; i
< 32; i
++) {
364 if (datatype_table
[i
] == uncompacted
) {
365 dst
->dw0
.data_type_index
= i
;
374 set_subreg_index(struct brw_compact_instruction
*dst
,
375 struct brw_instruction
*src
)
377 uint32_t uncompacted
= 0;
379 uncompacted
|= src
->bits1
.da1
.dest_subreg_nr
<< 0;
380 uncompacted
|= src
->bits2
.da1
.src0_subreg_nr
<< 5;
381 uncompacted
|= src
->bits3
.da1
.src1_subreg_nr
<< 10;
383 for (int i
= 0; i
< 32; i
++) {
384 if (subreg_table
[i
] == uncompacted
) {
385 dst
->dw0
.sub_reg_index
= i
;
394 get_src_index(uint32_t uncompacted
,
397 for (int i
= 0; i
< 32; i
++) {
398 if (src_index_table
[i
] == uncompacted
) {
408 set_src0_index(struct brw_compact_instruction
*dst
,
409 struct brw_instruction
*src
)
411 uint32_t compacted
, uncompacted
= 0;
413 uncompacted
|= (src
->bits2
.ud
>> 13) & 0xfff;
415 if (!get_src_index(uncompacted
, &compacted
))
418 dst
->dw0
.src0_index
= compacted
& 0x3;
419 dst
->dw1
.src0_index
= compacted
>> 2;
425 set_src1_index(struct brw_compact_instruction
*dst
,
426 struct brw_instruction
*src
)
428 uint32_t compacted
, uncompacted
= 0;
430 uncompacted
|= (src
->bits3
.ud
>> 13) & 0xfff;
432 if (!get_src_index(uncompacted
, &compacted
))
435 dst
->dw1
.src1_index
= compacted
;
441 * Tries to compact instruction src into dst.
443 * It doesn't modify dst unless src is compactable, which is relied on by
444 * brw_compact_instructions().
447 brw_try_compact_instruction(struct brw_compile
*p
,
448 struct brw_compact_instruction
*dst
,
449 struct brw_instruction
*src
)
451 struct brw_context
*brw
= p
->brw
;
452 struct brw_compact_instruction temp
;
454 if (src
->header
.opcode
== BRW_OPCODE_IF
||
455 src
->header
.opcode
== BRW_OPCODE_ELSE
||
456 src
->header
.opcode
== BRW_OPCODE_ENDIF
||
457 src
->header
.opcode
== BRW_OPCODE_HALT
||
458 src
->header
.opcode
== BRW_OPCODE_DO
||
459 src
->header
.opcode
== BRW_OPCODE_WHILE
) {
      /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, need
       * to be able to handle compacted flow control instructions.
466 /* FINISHME: immediates */
467 if (src
->bits1
.da1
.src0_reg_file
== BRW_IMMEDIATE_VALUE
||
468 src
->bits1
.da1
.src1_reg_file
== BRW_IMMEDIATE_VALUE
)
471 memset(&temp
, 0, sizeof(temp
));
473 temp
.dw0
.opcode
= src
->header
.opcode
;
474 temp
.dw0
.debug_control
= src
->header
.debug_control
;
475 if (!set_control_index(brw
, &temp
, src
))
477 if (!set_datatype_index(&temp
, src
))
479 if (!set_subreg_index(&temp
, src
))
481 temp
.dw0
.acc_wr_control
= src
->header
.acc_wr_control
;
482 temp
.dw0
.conditionalmod
= src
->header
.destreg__conditionalmod
;
484 temp
.dw0
.flag_subreg_nr
= src
->bits2
.da1
.flag_subreg_nr
;
485 temp
.dw0
.cmpt_ctrl
= 1;
486 if (!set_src0_index(&temp
, src
))
488 if (!set_src1_index(&temp
, src
))
490 temp
.dw1
.dst_reg_nr
= src
->bits1
.da1
.dest_reg_nr
;
491 temp
.dw1
.src0_reg_nr
= src
->bits2
.da1
.src0_reg_nr
;
492 temp
.dw1
.src1_reg_nr
= src
->bits3
.da1
.src1_reg_nr
;
500 set_uncompacted_control(struct brw_context
*brw
,
501 struct brw_instruction
*dst
,
502 struct brw_compact_instruction
*src
)
504 uint32_t *dst_u32
= (uint32_t *)dst
;
505 uint32_t uncompacted
= control_index_table
[src
->dw0
.control_index
];
507 dst_u32
[0] |= ((uncompacted
>> 0) & 0xffff) << 8;
508 dst_u32
[0] |= ((uncompacted
>> 16) & 0x1) << 31;
511 dst_u32
[2] |= ((uncompacted
>> 17) & 0x3) << 25;
515 set_uncompacted_datatype(struct brw_instruction
*dst
,
516 struct brw_compact_instruction
*src
)
518 uint32_t uncompacted
= datatype_table
[src
->dw0
.data_type_index
];
520 dst
->bits1
.ud
&= ~(0x7 << 29);
521 dst
->bits1
.ud
|= ((uncompacted
>> 15) & 0x7) << 29;
522 dst
->bits1
.ud
&= ~0x7fff;
523 dst
->bits1
.ud
|= uncompacted
& 0x7fff;
527 set_uncompacted_subreg(struct brw_instruction
*dst
,
528 struct brw_compact_instruction
*src
)
530 uint32_t uncompacted
= subreg_table
[src
->dw0
.sub_reg_index
];
532 dst
->bits1
.da1
.dest_subreg_nr
= (uncompacted
>> 0) & 0x1f;
533 dst
->bits2
.da1
.src0_subreg_nr
= (uncompacted
>> 5) & 0x1f;
534 dst
->bits3
.da1
.src1_subreg_nr
= (uncompacted
>> 10) & 0x1f;
538 set_uncompacted_src0(struct brw_instruction
*dst
,
539 struct brw_compact_instruction
*src
)
541 uint32_t compacted
= src
->dw0
.src0_index
| src
->dw1
.src0_index
<< 2;
542 uint32_t uncompacted
= src_index_table
[compacted
];
544 dst
->bits2
.ud
|= uncompacted
<< 13;
548 set_uncompacted_src1(struct brw_instruction
*dst
,
549 struct brw_compact_instruction
*src
)
551 uint32_t uncompacted
= src_index_table
[src
->dw1
.src1_index
];
553 dst
->bits3
.ud
|= uncompacted
<< 13;
557 brw_uncompact_instruction(struct brw_context
*brw
,
558 struct brw_instruction
*dst
,
559 struct brw_compact_instruction
*src
)
561 memset(dst
, 0, sizeof(*dst
));
563 dst
->header
.opcode
= src
->dw0
.opcode
;
564 dst
->header
.debug_control
= src
->dw0
.debug_control
;
566 set_uncompacted_control(brw
, dst
, src
);
567 set_uncompacted_datatype(dst
, src
);
568 set_uncompacted_subreg(dst
, src
);
569 dst
->header
.acc_wr_control
= src
->dw0
.acc_wr_control
;
570 dst
->header
.destreg__conditionalmod
= src
->dw0
.conditionalmod
;
572 dst
->bits2
.da1
.flag_subreg_nr
= src
->dw0
.flag_subreg_nr
;
573 set_uncompacted_src0(dst
, src
);
574 set_uncompacted_src1(dst
, src
);
575 dst
->bits1
.da1
.dest_reg_nr
= src
->dw1
.dst_reg_nr
;
576 dst
->bits2
.da1
.src0_reg_nr
= src
->dw1
.src0_reg_nr
;
577 dst
->bits3
.da1
.src1_reg_nr
= src
->dw1
.src1_reg_nr
;
580 void brw_debug_compact_uncompact(struct brw_context
*brw
,
581 struct brw_instruction
*orig
,
582 struct brw_instruction
*uncompacted
)
584 fprintf(stderr
, "Instruction compact/uncompact changed (gen%d):\n",
587 fprintf(stderr
, " before: ");
588 brw_disasm(stderr
, orig
, brw
->gen
);
590 fprintf(stderr
, " after: ");
591 brw_disasm(stderr
, uncompacted
, brw
->gen
);
593 uint32_t *before_bits
= (uint32_t *)orig
;
594 uint32_t *after_bits
= (uint32_t *)uncompacted
;
595 fprintf(stderr
, " changed bits:\n");
596 for (int i
= 0; i
< 128; i
++) {
597 uint32_t before
= before_bits
[i
/ 32] & (1 << (i
& 31));
598 uint32_t after
= after_bits
[i
/ 32] & (1 << (i
& 31));
600 if (before
!= after
) {
601 fprintf(stderr
, " bit %d, %s to %s\n", i
,
602 before
? "set" : "unset",
603 after
? "set" : "unset");
/**
 * Returns how many more compacted instructions precede the jump target than
 * precede the jumping instruction.
 *
 * @param old_ip            pre-compaction index of the jumping instruction
 * @param old_target_ip     pre-compaction index of the jump target
 * @param compacted_counts  per-index count of compacted instructions that
 *                          preceded that index; only read, never written
 * @return compacted_counts[old_target_ip] - compacted_counts[old_ip]; the
 *         result is negative when the target has the smaller count.
 */
static int
compacted_between(int old_ip, int old_target_ip, const int *compacted_counts)
{
   return compacted_counts[old_target_ip] - compacted_counts[old_ip];
}
617 update_uip_jip(struct brw_instruction
*insn
, int this_old_ip
,
618 int *compacted_counts
)
622 target_old_ip
= this_old_ip
+ insn
->bits3
.break_cont
.jip
;
623 insn
->bits3
.break_cont
.jip
-= compacted_between(this_old_ip
,
627 target_old_ip
= this_old_ip
+ insn
->bits3
.break_cont
.uip
;
628 insn
->bits3
.break_cont
.uip
-= compacted_between(this_old_ip
,
634 brw_init_compaction_tables(struct brw_context
*brw
)
636 assert(gen6_control_index_table
[ARRAY_SIZE(gen6_control_index_table
) - 1] != 0);
637 assert(gen6_datatype_table
[ARRAY_SIZE(gen6_datatype_table
) - 1] != 0);
638 assert(gen6_subreg_table
[ARRAY_SIZE(gen6_subreg_table
) - 1] != 0);
639 assert(gen6_src_index_table
[ARRAY_SIZE(gen6_src_index_table
) - 1] != 0);
640 assert(gen7_control_index_table
[ARRAY_SIZE(gen6_control_index_table
) - 1] != 0);
641 assert(gen7_datatype_table
[ARRAY_SIZE(gen6_datatype_table
) - 1] != 0);
642 assert(gen7_subreg_table
[ARRAY_SIZE(gen6_subreg_table
) - 1] != 0);
643 assert(gen7_src_index_table
[ARRAY_SIZE(gen6_src_index_table
) - 1] != 0);
647 control_index_table
= gen7_control_index_table
;
648 datatype_table
= gen7_datatype_table
;
649 subreg_table
= gen7_subreg_table
;
650 src_index_table
= gen7_src_index_table
;
653 control_index_table
= gen6_control_index_table
;
654 datatype_table
= gen6_datatype_table
;
655 subreg_table
= gen6_subreg_table
;
656 src_index_table
= gen6_src_index_table
;
664 brw_compact_instructions(struct brw_compile
*p
)
666 struct brw_context
*brw
= p
->brw
;
667 void *store
= p
->store
;
668 /* For an instruction at byte offset 8*i before compaction, this is the number
669 * of compacted instructions that preceded it.
671 int compacted_counts
[p
->next_insn_offset
/ 8];
672 /* For an instruction at byte offset 8*i after compaction, this is the
673 * 8-byte offset it was at before compaction.
675 int old_ip
[p
->next_insn_offset
/ 8];
682 int compacted_count
= 0;
683 for (src_offset
= 0; src_offset
< p
->nr_insn
* 16;) {
684 struct brw_instruction
*src
= store
+ src_offset
;
685 void *dst
= store
+ offset
;
687 old_ip
[offset
/ 8] = src_offset
/ 8;
688 compacted_counts
[src_offset
/ 8] = compacted_count
;
690 struct brw_instruction saved
= *src
;
692 if (!src
->header
.cmpt_control
&&
693 brw_try_compact_instruction(p
, dst
, src
)) {
697 struct brw_instruction uncompacted
;
698 brw_uncompact_instruction(brw
, &uncompacted
, dst
);
699 if (memcmp(&saved
, &uncompacted
, sizeof(uncompacted
))) {
700 brw_debug_compact_uncompact(brw
, &saved
, &uncompacted
);
707 int size
= src
->header
.cmpt_control
? 8 : 16;
709 /* It appears that the end of thread SEND instruction needs to be
710 * aligned, or the GPU hangs.
712 if ((src
->header
.opcode
== BRW_OPCODE_SEND
||
713 src
->header
.opcode
== BRW_OPCODE_SENDC
) &&
714 src
->bits3
.generic
.end_of_thread
&&
716 struct brw_compact_instruction
*align
= store
+ offset
;
717 memset(align
, 0, sizeof(*align
));
718 align
->dw0
.opcode
= BRW_OPCODE_NOP
;
719 align
->dw0
.cmpt_ctrl
= 1;
721 old_ip
[offset
/ 8] = src_offset
/ 8;
722 dst
= store
+ offset
;
      /* If we didn't compact this instruction, we need to move it down into
728 if (offset
!= src_offset
) {
729 memmove(dst
, src
, size
);
736 /* Fix up control flow offsets. */
737 p
->next_insn_offset
= offset
;
738 for (offset
= 0; offset
< p
->next_insn_offset
;) {
739 struct brw_instruction
*insn
= store
+ offset
;
740 int this_old_ip
= old_ip
[offset
/ 8];
741 int this_compacted_count
= compacted_counts
[this_old_ip
];
742 int target_old_ip
, target_compacted_count
;
744 switch (insn
->header
.opcode
) {
745 case BRW_OPCODE_BREAK
:
746 case BRW_OPCODE_CONTINUE
:
747 case BRW_OPCODE_HALT
:
748 update_uip_jip(insn
, this_old_ip
, compacted_counts
);
752 case BRW_OPCODE_ELSE
:
753 case BRW_OPCODE_ENDIF
:
754 case BRW_OPCODE_WHILE
:
756 target_old_ip
= this_old_ip
+ insn
->bits1
.branch_gen6
.jump_count
;
757 target_compacted_count
= compacted_counts
[target_old_ip
];
758 insn
->bits1
.branch_gen6
.jump_count
-= (target_compacted_count
-
759 this_compacted_count
);
761 update_uip_jip(insn
, this_old_ip
, compacted_counts
);
766 if (insn
->header
.cmpt_control
) {
773 /* p->nr_insn is counting the number of uncompacted instructions still, so
774 * divide. We do want to be sure there's a valid instruction in any
775 * alignment padding, so that the next compression pass (for the FS 8/16
776 * compile passes) parses correctly.
778 if (p
->next_insn_offset
& 8) {
779 struct brw_compact_instruction
*align
= store
+ offset
;
780 memset(align
, 0, sizeof(*align
));
781 align
->dw0
.opcode
= BRW_OPCODE_NOP
;
782 align
->dw0
.cmpt_ctrl
= 1;
783 p
->next_insn_offset
+= 8;
785 p
->nr_insn
= p
->next_insn_offset
/ 16;
788 fprintf(stderr
, "dumping compacted program\n");
789 brw_dump_compile(p
, stderr
, 0, p
->next_insn_offset
);
792 for (offset
= 0; offset
< p
->next_insn_offset
;) {
793 struct brw_instruction
*insn
= store
+ offset
;
795 if (insn
->header
.cmpt_control
) {
802 fprintf(stderr
, "%db/%db saved (%d%%)\n", cmp
* 8, offset
+ cmp
* 8,
803 cmp
* 8 * 100 / (offset
+ cmp
* 8));