2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_eu_compact.c
26 * Instruction compaction is a feature of G45 and newer hardware that allows
27 * for a smaller instruction encoding.
29 * The instruction cache is on the order of 32KB, and many programs generate
30 * far more instructions than that. The instruction cache is built to barely
31 * keep up with instruction dispatch ability in cache hit cases -- L1
32 * instruction cache misses that still hit in the next level could limit
33 * throughput by around 50%.
35 * The idea of instruction compaction is that most instructions use a tiny
36 * subset of the GPU functionality, so we can encode what would be a 16 byte
37 * instruction in 8 bytes using some lookup tables for various fields.
40 * Instruction compaction capabilities vary subtly by generation.
42 * G45's support for instruction compaction is very limited. Jump counts on
43 * this generation are in units of 16-byte uncompacted instructions. As such,
44 * all jump targets must be 16-byte aligned. Also, all instructions must be
45 * naturally aligned, i.e. uncompacted instructions must be 16-byte aligned.
46 * A G45-only instruction, NENOP, must be used to provide padding to align
47 * uncompacted instructions.
49 * Gen5 removes these restrictions and changes jump counts to be in units of
50 * 8-byte compacted instructions, allowing jump targets to be only 8-byte
51 * aligned. Uncompacted instructions can also be placed on 8-byte boundaries.
53 * Gen6 adds the ability to compact instructions with a limited range of
54 * immediate values. Compactable immediates have 12 unrestricted bits, and a
55 * 13th bit that's replicated through the high 20 bits, to create the 32-bit
56 * value of DW3 in the uncompacted instruction word.
58 * On Gen7 we can compact some control flow instructions with a small positive
59 * immediate in the low bits of DW3, like ENDIF with the JIP field. Other
60 * control flow instructions with UIP cannot be compacted, because of the
61 * replicated 13th bit. No control flow instructions can be compacted on Gen6
62 * since the jump count field is not in DW3.
68 * else JIP (plus UIP on BDW+)
70 * while JIP (must be negative)
72 * Gen 8 adds support for compacting 3-src instructions.
75 #include "brw_context.h"
77 #include "intel_asm_annotation.h"
78 #include "util/u_atomic.h" /* for p_atomic_cmpxchg */
80 static const uint32_t g45_control_index_table
[32] = {
115 static const uint32_t g45_datatype_table
[32] = {
116 0b001000000000100001,
117 0b001011010110101101,
118 0b001000001000110001,
119 0b001111011110111101,
120 0b001011010110101100,
121 0b001000000110101101,
122 0b001000000000100000,
123 0b010100010110110001,
124 0b001100011000101101,
125 0b001000000000100010,
126 0b001000001000110110,
127 0b010000001000110001,
128 0b001000001000110010,
129 0b011000001000110010,
130 0b001111011110111100,
131 0b001000000100101000,
132 0b010100011000110001,
133 0b001010010100101001,
134 0b001000001000101001,
135 0b010000001000110110,
136 0b101000001000110001,
137 0b001011011000101101,
138 0b001000000100001001,
139 0b001011011000101100,
140 0b110100011000110001,
141 0b001000001110111101,
142 0b110000001000110001,
143 0b011000000100101010,
144 0b101000001000101001,
145 0b001011010110001100,
146 0b001000000110100001,
150 static const uint16_t g45_subreg_table
[32] = {
185 static const uint16_t g45_src_index_table
[32] = {
220 static const uint32_t gen6_control_index_table
[32] = {
255 static const uint32_t gen6_datatype_table
[32] = {
256 0b001001110000000000,
257 0b001000110000100000,
258 0b001001110000000001,
259 0b001000000001100000,
260 0b001010110100101001,
261 0b001000000110101101,
262 0b001100011000101100,
263 0b001011110110101101,
264 0b001000000111101100,
265 0b001000000001100001,
266 0b001000110010100101,
267 0b001000000001000001,
268 0b001000001000110001,
269 0b001000001000101001,
270 0b001000000000100000,
271 0b001000001000110010,
272 0b001010010100101001,
273 0b001011010010100101,
274 0b001000000110100101,
275 0b001100011000101001,
276 0b001011011000101100,
277 0b001011010110100101,
278 0b001011110110100101,
279 0b001111011110111101,
280 0b001111011110111100,
281 0b001111011110111101,
282 0b001111011110011101,
283 0b001111011110111110,
284 0b001000000000100001,
285 0b001000000000100010,
286 0b001001111111011101,
287 0b001000001110111110,
290 static const uint16_t gen6_subreg_table
[32] = {
325 static const uint16_t gen6_src_index_table
[32] = {
360 static const uint32_t gen7_control_index_table
[32] = {
361 0b0000000000000000010,
362 0b0000100000000000000,
363 0b0000100000000000001,
364 0b0000100000000000010,
365 0b0000100000000000011,
366 0b0000100000000000100,
367 0b0000100000000000101,
368 0b0000100000000000111,
369 0b0000100000000001000,
370 0b0000100000000001001,
371 0b0000100000000001101,
372 0b0000110000000000000,
373 0b0000110000000000001,
374 0b0000110000000000010,
375 0b0000110000000000011,
376 0b0000110000000000100,
377 0b0000110000000000101,
378 0b0000110000000000111,
379 0b0000110000000001001,
380 0b0000110000000001101,
381 0b0000110000000010000,
382 0b0000110000100000000,
383 0b0001000000000000000,
384 0b0001000000000000010,
385 0b0001000000000000100,
386 0b0001000000100000000,
387 0b0010110000000000000,
388 0b0010110000000010000,
389 0b0011000000000000000,
390 0b0011000000100000000,
391 0b0101000000000000000,
392 0b0101000000100000000
395 static const uint32_t gen7_datatype_table
[32] = {
396 0b001000000000000001,
397 0b001000000000100000,
398 0b001000000000100001,
399 0b001000000001100001,
400 0b001000000010111101,
401 0b001000001011111101,
402 0b001000001110100001,
403 0b001000001110100101,
404 0b001000001110111101,
405 0b001000010000100001,
406 0b001000110000100000,
407 0b001000110000100001,
408 0b001001010010100101,
409 0b001001110010100100,
410 0b001001110010100101,
411 0b001111001110111101,
412 0b001111011110011101,
413 0b001111011110111100,
414 0b001111011110111101,
415 0b001111111110111100,
416 0b000000001000001100,
417 0b001000000000111101,
418 0b001000000010100101,
419 0b001000010000100000,
420 0b001001010010100100,
421 0b001001110010000100,
422 0b001010010100001001,
423 0b001101111110111101,
424 0b001111111110111101,
425 0b001011110110101100,
426 0b001010010100101000,
430 static const uint16_t gen7_subreg_table
[32] = {
465 static const uint16_t gen7_src_index_table
[32] = {
500 static const uint32_t gen8_control_index_table
[32] = {
501 0b0000000000000000010,
502 0b0000100000000000000,
503 0b0000100000000000001,
504 0b0000100000000000010,
505 0b0000100000000000011,
506 0b0000100000000000100,
507 0b0000100000000000101,
508 0b0000100000000000111,
509 0b0000100000000001000,
510 0b0000100000000001001,
511 0b0000100000000001101,
512 0b0000110000000000000,
513 0b0000110000000000001,
514 0b0000110000000000010,
515 0b0000110000000000011,
516 0b0000110000000000100,
517 0b0000110000000000101,
518 0b0000110000000000111,
519 0b0000110000000001001,
520 0b0000110000000001101,
521 0b0000110000000010000,
522 0b0000110000100000000,
523 0b0001000000000000000,
524 0b0001000000000000010,
525 0b0001000000000000100,
526 0b0001000000100000000,
527 0b0010110000000000000,
528 0b0010110000000010000,
529 0b0011000000000000000,
530 0b0011000000100000000,
531 0b0101000000000000000,
532 0b0101000000100000000
535 static const uint32_t gen8_datatype_table
[32] = {
536 0b001000000000000000001,
537 0b001000000000001000000,
538 0b001000000000001000001,
539 0b001000000000011000001,
540 0b001000000000101011101,
541 0b001000000010111011101,
542 0b001000000011101000001,
543 0b001000000011101000101,
544 0b001000000011101011101,
545 0b001000001000001000001,
546 0b001000011000001000000,
547 0b001000011000001000001,
548 0b001000101000101000101,
549 0b001000111000101000100,
550 0b001000111000101000101,
551 0b001011100011101011101,
552 0b001011101011100011101,
553 0b001011101011101011100,
554 0b001011101011101011101,
555 0b001011111011101011100,
556 0b000000000010000001100,
557 0b001000000000001011101,
558 0b001000000000101000101,
559 0b001000001000001000000,
560 0b001000101000101000100,
561 0b001000111000100000100,
562 0b001001001001000001001,
563 0b001010111011101011101,
564 0b001011111011101011101,
565 0b001001111001101001100,
566 0b001001001001001001000,
567 0b001001011001001001000
570 static const uint16_t gen8_subreg_table
[32] = {
605 static const uint16_t gen8_src_index_table
[32] = {
640 /* This is actually the control index table for Cherryview (26 bits), but the
641 * only difference from Broadwell (24 bits) is that it has two extra 0-bits at
644 * The low 24 bits have the same mappings on both hardware.
646 static const uint32_t gen8_3src_control_index_table
[4] = {
647 0b00100000000110000000000001,
648 0b00000000000110000000000001,
649 0b00000000001000000000000001,
650 0b00000000001000000000100001
653 /* This is actually the source index table for Cherryview (49 bits), but the
654 * only difference from Broadwell (46 bits) is that it has three extra 0-bits
657 * The low 44 bits have the same mappings on both hardware, and since the high
658 * three bits on Broadwell are zero, we can reuse Cherryview's table.
660 static const uint64_t gen8_3src_source_index_table
[4] = {
661 0b0000001110010011100100111001000001111000000000000,
662 0b0000001110010011100100111001000001111000000000010,
663 0b0000001110010011100100111001000001111000000001000,
664 0b0000001110010011100100111001000001111000000100000
667 static const uint32_t *control_index_table
;
668 static const uint32_t *datatype_table
;
669 static const uint16_t *subreg_table
;
670 static const uint16_t *src_index_table
;
673 set_control_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
675 uint32_t uncompacted
= brw
->gen
>= 8 /* 17b/G45; 19b/IVB+ */
676 ? (brw_inst_bits(src
, 33, 31) << 16) | /* 3b */
677 (brw_inst_bits(src
, 23, 12) << 4) | /* 12b */
678 (brw_inst_bits(src
, 10, 9) << 2) | /* 2b */
679 (brw_inst_bits(src
, 34, 34) << 1) | /* 1b */
680 (brw_inst_bits(src
, 8, 8)) /* 1b */
681 : (brw_inst_bits(src
, 31, 31) << 16) | /* 1b */
682 (brw_inst_bits(src
, 23, 8)); /* 16b */
684 /* On gen7, the flag register and subregister numbers are integrated into
688 uncompacted
|= brw_inst_bits(src
, 90, 89) << 17; /* 2b */
690 for (int i
= 0; i
< 32; i
++) {
691 if (control_index_table
[i
] == uncompacted
) {
692 brw_compact_inst_set_control_index(dst
, i
);
701 set_datatype_index(struct brw_context
*brw
, brw_compact_inst
*dst
,
704 uint32_t uncompacted
= brw
->gen
>= 8 /* 18b/G45+; 21b/BDW+ */
705 ? (brw_inst_bits(src
, 63, 61) << 18) | /* 3b */
706 (brw_inst_bits(src
, 94, 89) << 12) | /* 6b */
707 (brw_inst_bits(src
, 46, 35)) /* 12b */
708 : (brw_inst_bits(src
, 63, 61) << 15) | /* 3b */
709 (brw_inst_bits(src
, 46, 32)); /* 15b */
711 for (int i
= 0; i
< 32; i
++) {
712 if (datatype_table
[i
] == uncompacted
) {
713 brw_compact_inst_set_datatype_index(dst
, i
);
722 set_subreg_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
,
725 uint16_t uncompacted
= /* 15b */
726 (brw_inst_bits(src
, 52, 48) << 0) | /* 5b */
727 (brw_inst_bits(src
, 68, 64) << 5); /* 5b */
730 uncompacted
|= brw_inst_bits(src
, 100, 96) << 10; /* 5b */
732 for (int i
= 0; i
< 32; i
++) {
733 if (subreg_table
[i
] == uncompacted
) {
734 brw_compact_inst_set_subreg_index(dst
, i
);
743 get_src_index(uint16_t uncompacted
,
746 for (int i
= 0; i
< 32; i
++) {
747 if (src_index_table
[i
] == uncompacted
) {
757 set_src0_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
760 uint16_t uncompacted
= brw_inst_bits(src
, 88, 77); /* 12b */
762 if (!get_src_index(uncompacted
, &compacted
))
765 brw_compact_inst_set_src0_index(dst
, compacted
);
771 set_src1_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
,
777 compacted
= (brw_inst_imm_ud(brw
, src
) >> 8) & 0x1f;
779 uint16_t uncompacted
= brw_inst_bits(src
, 120, 109); /* 12b */
781 if (!get_src_index(uncompacted
, &compacted
))
785 brw_compact_inst_set_src1_index(dst
, compacted
);
791 set_3src_control_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
793 assert(brw
->gen
>= 8);
795 uint32_t uncompacted
= /* 24b/BDW; 26b/CHV */
796 (brw_inst_bits(src
, 34, 32) << 21) | /* 3b */
797 (brw_inst_bits(src
, 28, 8)); /* 21b */
799 if (brw
->gen
>= 9 || brw
->is_cherryview
)
800 uncompacted
|= brw_inst_bits(src
, 36, 35) << 24; /* 2b */
802 for (int i
= 0; i
< ARRAY_SIZE(gen8_3src_control_index_table
); i
++) {
803 if (gen8_3src_control_index_table
[i
] == uncompacted
) {
804 brw_compact_inst_set_3src_control_index(dst
, i
);
813 set_3src_source_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
815 assert(brw
->gen
>= 8);
817 uint64_t uncompacted
= /* 46b/BDW; 49b/CHV */
818 (brw_inst_bits(src
, 83, 83) << 43) | /* 1b */
819 (brw_inst_bits(src
, 114, 107) << 35) | /* 8b */
820 (brw_inst_bits(src
, 93, 86) << 27) | /* 8b */
821 (brw_inst_bits(src
, 72, 65) << 19) | /* 8b */
822 (brw_inst_bits(src
, 55, 37)); /* 19b */
824 if (brw
->gen
>= 9 || brw
->is_cherryview
) {
826 (brw_inst_bits(src
, 126, 125) << 47) | /* 2b */
827 (brw_inst_bits(src
, 105, 104) << 45) | /* 2b */
828 (brw_inst_bits(src
, 84, 84) << 44); /* 1b */
831 (brw_inst_bits(src
, 125, 125) << 45) | /* 1b */
832 (brw_inst_bits(src
, 104, 104) << 44); /* 1b */
835 for (int i
= 0; i
< ARRAY_SIZE(gen8_3src_source_index_table
); i
++) {
836 if (gen8_3src_source_index_table
[i
] == uncompacted
) {
837 brw_compact_inst_set_3src_source_index(dst
, i
);
846 has_unmapped_bits(struct brw_context
*brw
, brw_inst
*src
)
848 /* Check for instruction bits that don't map to any of the fields of the
849 * compacted instruction. The instruction cannot be compacted if any of
850 * them are set. They overlap with:
851 * - NibCtrl (bit 47 on Gen7, bit 11 on Gen8)
852 * - Dst.AddrImm[9] (bit 47 on Gen8)
853 * - Src0.AddrImm[9] (bit 95 on Gen8)
854 * - Imm64[27:31] (bits 91-95 on Gen7, bit 95 on Gen8)
855 * - UIP[31] (bit 95 on Gen8)
858 assert(!brw_inst_bits(src
, 7, 7));
859 return brw_inst_bits(src
, 95, 95) ||
860 brw_inst_bits(src
, 47, 47) ||
861 brw_inst_bits(src
, 11, 11);
863 assert(!brw_inst_bits(src
, 7, 7) &&
864 !(brw
->gen
< 7 && brw_inst_bits(src
, 90, 90)));
865 return brw_inst_bits(src
, 95, 91) ||
866 brw_inst_bits(src
, 47, 47);
871 has_3src_unmapped_bits(struct brw_context
*brw
, brw_inst
*src
)
873 /* Check for three-source instruction bits that don't map to any of the
874 * fields of the compacted instruction. All of them seem to be reserved
877 if (brw
->gen
>= 9 || brw
->is_cherryview
) {
878 assert(!brw_inst_bits(src
, 127, 127) &&
879 !brw_inst_bits(src
, 7, 7));
881 assert(brw
->gen
>= 8);
882 assert(!brw_inst_bits(src
, 127, 126) &&
883 !brw_inst_bits(src
, 105, 105) &&
884 !brw_inst_bits(src
, 84, 84) &&
885 !brw_inst_bits(src
, 36, 35) &&
886 !brw_inst_bits(src
, 7, 7));
893 brw_try_compact_3src_instruction(struct brw_context
*brw
, brw_compact_inst
*dst
,
896 assert(brw
->gen
>= 8);
898 if (has_3src_unmapped_bits(brw
, src
))
901 #define compact(field) \
902 brw_compact_inst_set_3src_##field(dst, brw_inst_3src_##field(brw, src))
906 if (!set_3src_control_index(brw
, dst
, src
))
909 if (!set_3src_source_index(brw
, dst
, src
))
913 compact(src0_rep_ctrl
);
914 brw_compact_inst_set_3src_cmpt_control(dst
, true);
915 compact(debug_control
);
917 compact(src1_rep_ctrl
);
918 compact(src2_rep_ctrl
);
919 compact(src0_reg_nr
);
920 compact(src1_reg_nr
);
921 compact(src2_reg_nr
);
922 compact(src0_subreg_nr
);
923 compact(src1_subreg_nr
);
924 compact(src2_subreg_nr
);
/**
 * Returns whether a 32-bit immediate can be encoded in a compacted
 * instruction.
 *
 * Compacted instructions have 12 bits for immediate sources, and a 13th bit
 * that's replicated through the high 20 bits.
 *
 * Effectively this means we get 12-bit integers, 0.0f, and some limited uses
 * of packed vectors as compactable immediates.
 *
 * \param imm  the immediate value as it appears in the uncompacted instruction
 * \return true if the high 20 bits of \p imm are all zeros or all ones
 */
static bool
is_compactable_immediate(unsigned imm)
{
   /* We get the low 12 bits as-is, so they never prevent compaction. */
   const unsigned high20 = imm & ~0xfffu;

   /* We get one bit replicated through the top 20 bits, so those bits must
    * all agree.
    */
   return high20 == 0 || high20 == 0xfffff000u;
}
947 /* Returns whether an opcode takes three sources. */
951 return opcode_descs
[op
].nsrc
== 3;
955 * Tries to compact instruction src into dst.
957 * It doesn't modify dst unless src is compactable, which is relied on by
958 * brw_compact_instructions().
961 brw_try_compact_instruction(struct brw_context
*brw
, brw_compact_inst
*dst
,
964 brw_compact_inst temp
;
966 assert(brw_inst_cmpt_control(brw
, src
) == 0);
968 if (is_3src(brw_inst_opcode(brw
, src
))) {
970 memset(&temp
, 0, sizeof(temp
));
971 if (brw_try_compact_3src_instruction(brw
, &temp
, src
)) {
983 brw_inst_src0_reg_file(brw
, src
) == BRW_IMMEDIATE_VALUE
||
984 brw_inst_src1_reg_file(brw
, src
) == BRW_IMMEDIATE_VALUE
;
986 (brw
->gen
< 6 || !is_compactable_immediate(brw_inst_imm_ud(brw
, src
)))) {
990 if (has_unmapped_bits(brw
, src
))
993 memset(&temp
, 0, sizeof(temp
));
995 brw_compact_inst_set_opcode(&temp
, brw_inst_opcode(brw
, src
));
996 brw_compact_inst_set_debug_control(&temp
, brw_inst_debug_control(brw
, src
));
997 if (!set_control_index(brw
, &temp
, src
))
999 if (!set_datatype_index(brw
, &temp
, src
))
1001 if (!set_subreg_index(brw
, &temp
, src
, is_immediate
))
1003 brw_compact_inst_set_acc_wr_control(&temp
,
1004 brw_inst_acc_wr_control(brw
, src
));
1005 brw_compact_inst_set_cond_modifier(&temp
, brw_inst_cond_modifier(brw
, src
));
1007 brw_compact_inst_set_flag_subreg_nr(&temp
,
1008 brw_inst_flag_subreg_nr(brw
, src
));
1009 brw_compact_inst_set_cmpt_control(&temp
, true);
1010 if (!set_src0_index(brw
, &temp
, src
))
1012 if (!set_src1_index(brw
, &temp
, src
, is_immediate
))
1014 brw_compact_inst_set_dst_reg_nr(&temp
, brw_inst_dst_da_reg_nr(brw
, src
));
1015 brw_compact_inst_set_src0_reg_nr(&temp
, brw_inst_src0_da_reg_nr(brw
, src
));
1017 brw_compact_inst_set_src1_reg_nr(&temp
, brw_inst_imm_ud(brw
, src
) & 0xff);
1019 brw_compact_inst_set_src1_reg_nr(&temp
,
1020 brw_inst_src1_da_reg_nr(brw
, src
));
1029 set_uncompacted_control(struct brw_context
*brw
, brw_inst
*dst
,
1030 brw_compact_inst
*src
)
1032 uint32_t uncompacted
=
1033 control_index_table
[brw_compact_inst_control_index(src
)];
1035 if (brw
->gen
>= 8) {
1036 brw_inst_set_bits(dst
, 33, 31, (uncompacted
>> 16));
1037 brw_inst_set_bits(dst
, 23, 12, (uncompacted
>> 4) & 0xfff);
1038 brw_inst_set_bits(dst
, 10, 9, (uncompacted
>> 2) & 0x3);
1039 brw_inst_set_bits(dst
, 34, 34, (uncompacted
>> 1) & 0x1);
1040 brw_inst_set_bits(dst
, 8, 8, (uncompacted
>> 0) & 0x1);
1042 brw_inst_set_bits(dst
, 31, 31, (uncompacted
>> 16) & 0x1);
1043 brw_inst_set_bits(dst
, 23, 8, (uncompacted
& 0xffff));
1046 brw_inst_set_bits(dst
, 90, 89, uncompacted
>> 17);
1051 set_uncompacted_datatype(struct brw_context
*brw
, brw_inst
*dst
,
1052 brw_compact_inst
*src
)
1054 uint32_t uncompacted
= datatype_table
[brw_compact_inst_datatype_index(src
)];
1056 if (brw
->gen
>= 8) {
1057 brw_inst_set_bits(dst
, 63, 61, (uncompacted
>> 18));
1058 brw_inst_set_bits(dst
, 94, 89, (uncompacted
>> 12) & 0x3f);
1059 brw_inst_set_bits(dst
, 46, 35, (uncompacted
>> 0) & 0xfff);
1061 brw_inst_set_bits(dst
, 63, 61, (uncompacted
>> 15));
1062 brw_inst_set_bits(dst
, 46, 32, (uncompacted
& 0x7fff));
1067 set_uncompacted_subreg(struct brw_context
*brw
, brw_inst
*dst
,
1068 brw_compact_inst
*src
)
1070 uint16_t uncompacted
= subreg_table
[brw_compact_inst_subreg_index(src
)];
1072 brw_inst_set_bits(dst
, 100, 96, (uncompacted
>> 10));
1073 brw_inst_set_bits(dst
, 68, 64, (uncompacted
>> 5) & 0x1f);
1074 brw_inst_set_bits(dst
, 52, 48, (uncompacted
>> 0) & 0x1f);
1078 set_uncompacted_src0(struct brw_context
*brw
, brw_inst
*dst
,
1079 brw_compact_inst
*src
)
1081 uint32_t compacted
= brw_compact_inst_src0_index(src
);
1082 uint16_t uncompacted
= src_index_table
[compacted
];
1084 brw_inst_set_bits(dst
, 88, 77, uncompacted
);
1088 set_uncompacted_src1(struct brw_context
*brw
, brw_inst
*dst
,
1089 brw_compact_inst
*src
, bool is_immediate
)
1092 signed high5
= brw_compact_inst_src1_index(src
);
1093 /* Replicate top bit of src1_index into high 20 bits of the immediate. */
1094 brw_inst_set_imm_ud(brw
, dst
, (high5
<< 27) >> 19);
1096 uint16_t uncompacted
= src_index_table
[brw_compact_inst_src1_index(src
)];
1098 brw_inst_set_bits(dst
, 120, 109, uncompacted
);
1103 set_uncompacted_3src_control_index(struct brw_context
*brw
, brw_inst
*dst
,
1104 brw_compact_inst
*src
)
1106 assert(brw
->gen
>= 8);
1108 uint32_t compacted
= brw_compact_inst_3src_control_index(src
);
1109 uint32_t uncompacted
= gen8_3src_control_index_table
[compacted
];
1111 brw_inst_set_bits(dst
, 34, 32, (uncompacted
>> 21) & 0x7);
1112 brw_inst_set_bits(dst
, 28, 8, (uncompacted
>> 0) & 0x1fffff);
1114 if (brw
->gen
>= 9 || brw
->is_cherryview
)
1115 brw_inst_set_bits(dst
, 36, 35, (uncompacted
>> 24) & 0x3);
1119 set_uncompacted_3src_source_index(struct brw_context
*brw
, brw_inst
*dst
,
1120 brw_compact_inst
*src
)
1122 assert(brw
->gen
>= 8);
1124 uint32_t compacted
= brw_compact_inst_3src_source_index(src
);
1125 uint64_t uncompacted
= gen8_3src_source_index_table
[compacted
];
1127 brw_inst_set_bits(dst
, 83, 83, (uncompacted
>> 43) & 0x1);
1128 brw_inst_set_bits(dst
, 114, 107, (uncompacted
>> 35) & 0xff);
1129 brw_inst_set_bits(dst
, 93, 86, (uncompacted
>> 27) & 0xff);
1130 brw_inst_set_bits(dst
, 72, 65, (uncompacted
>> 19) & 0xff);
1131 brw_inst_set_bits(dst
, 55, 37, (uncompacted
>> 0) & 0x7ffff);
1133 if (brw
->gen
>= 9 || brw
->is_cherryview
) {
1134 brw_inst_set_bits(dst
, 126, 125, (uncompacted
>> 47) & 0x3);
1135 brw_inst_set_bits(dst
, 105, 104, (uncompacted
>> 45) & 0x3);
1136 brw_inst_set_bits(dst
, 84, 84, (uncompacted
>> 44) & 0x1);
1138 brw_inst_set_bits(dst
, 125, 125, (uncompacted
>> 45) & 0x1);
1139 brw_inst_set_bits(dst
, 104, 104, (uncompacted
>> 44) & 0x1);
1144 brw_uncompact_3src_instruction(struct brw_context
*brw
, brw_inst
*dst
,
1145 brw_compact_inst
*src
)
1147 assert(brw
->gen
>= 8);
1149 #define uncompact(field) \
1150 brw_inst_set_3src_##field(brw, dst, brw_compact_inst_3src_##field(src))
1154 set_uncompacted_3src_control_index(brw
, dst
, src
);
1155 set_uncompacted_3src_source_index(brw
, dst
, src
);
1157 uncompact(dst_reg_nr
);
1158 uncompact(src0_rep_ctrl
);
1159 brw_inst_set_3src_cmpt_control(brw
, dst
, false);
1160 uncompact(debug_control
);
1161 uncompact(saturate
);
1162 uncompact(src1_rep_ctrl
);
1163 uncompact(src2_rep_ctrl
);
1164 uncompact(src0_reg_nr
);
1165 uncompact(src1_reg_nr
);
1166 uncompact(src2_reg_nr
);
1167 uncompact(src0_subreg_nr
);
1168 uncompact(src1_subreg_nr
);
1169 uncompact(src2_subreg_nr
);
1175 brw_uncompact_instruction(struct brw_context
*brw
, brw_inst
*dst
,
1176 brw_compact_inst
*src
)
1178 memset(dst
, 0, sizeof(*dst
));
1180 if (brw
->gen
>= 8 && is_3src(brw_compact_inst_3src_opcode(src
))) {
1181 brw_uncompact_3src_instruction(brw
, dst
, src
);
1185 brw_inst_set_opcode(brw
, dst
, brw_compact_inst_opcode(src
));
1186 brw_inst_set_debug_control(brw
, dst
, brw_compact_inst_debug_control(src
));
1188 set_uncompacted_control(brw
, dst
, src
);
1189 set_uncompacted_datatype(brw
, dst
, src
);
1191 /* src0/1 register file fields are in the datatype table. */
1192 bool is_immediate
= brw_inst_src0_reg_file(brw
, dst
) == BRW_IMMEDIATE_VALUE
||
1193 brw_inst_src1_reg_file(brw
, dst
) == BRW_IMMEDIATE_VALUE
;
1195 set_uncompacted_subreg(brw
, dst
, src
);
1196 brw_inst_set_acc_wr_control(brw
, dst
, brw_compact_inst_acc_wr_control(src
));
1197 brw_inst_set_cond_modifier(brw
, dst
, brw_compact_inst_cond_modifier(src
));
1199 brw_inst_set_flag_subreg_nr(brw
, dst
,
1200 brw_compact_inst_flag_subreg_nr(src
));
1201 set_uncompacted_src0(brw
, dst
, src
);
1202 set_uncompacted_src1(brw
, dst
, src
, is_immediate
);
1203 brw_inst_set_dst_da_reg_nr(brw
, dst
, brw_compact_inst_dst_reg_nr(src
));
1204 brw_inst_set_src0_da_reg_nr(brw
, dst
, brw_compact_inst_src0_reg_nr(src
));
1206 brw_inst_set_imm_ud(brw
, dst
,
1207 brw_inst_imm_ud(brw
, dst
) |
1208 brw_compact_inst_src1_reg_nr(src
));
1210 brw_inst_set_src1_da_reg_nr(brw
, dst
, brw_compact_inst_src1_reg_nr(src
));
1214 void brw_debug_compact_uncompact(struct brw_context
*brw
,
1216 brw_inst
*uncompacted
)
1218 fprintf(stderr
, "Instruction compact/uncompact changed (gen%d):\n",
1221 fprintf(stderr
, " before: ");
1222 brw_disassemble_inst(stderr
, brw
, orig
, true);
1224 fprintf(stderr
, " after: ");
1225 brw_disassemble_inst(stderr
, brw
, uncompacted
, false);
1227 uint32_t *before_bits
= (uint32_t *)orig
;
1228 uint32_t *after_bits
= (uint32_t *)uncompacted
;
1229 fprintf(stderr
, " changed bits:\n");
1230 for (int i
= 0; i
< 128; i
++) {
1231 uint32_t before
= before_bits
[i
/ 32] & (1 << (i
& 31));
1232 uint32_t after
= after_bits
[i
/ 32] & (1 << (i
& 31));
1234 if (before
!= after
) {
1235 fprintf(stderr
, " bit %d, %s to %s\n", i
,
1236 before
? "set" : "unset",
1237 after
? "set" : "unset");
/**
 * Returns the difference between the compaction counts recorded for two
 * pre-compaction instruction positions.
 *
 * \param old_ip            index into \p compacted_counts for the current
 *                          instruction
 * \param old_target_ip     index into \p compacted_counts for the jump target
 * \param compacted_counts  per-instruction counts; only read, never written
 * \return compacted_counts[old_target_ip] - compacted_counts[old_ip], which
 *         is negative when the target's count is the smaller one
 */
static int
compacted_between(int old_ip, int old_target_ip, const int *compacted_counts)
{
   return compacted_counts[old_target_ip] - compacted_counts[old_ip];
}
1251 update_uip_jip(struct brw_context
*brw
, brw_inst
*insn
,
1252 int this_old_ip
, int *compacted_counts
)
1254 /* JIP and UIP are in units of:
1255 * - bytes on Gen8+; and
1256 * - compacted instructions on Gen6 and Gen7.
1258 int shift
= brw
->gen
>= 8 ? 3 : 0;
1260 int32_t jip_compacted
= brw_inst_jip(brw
, insn
) >> shift
;
1261 jip_compacted
-= compacted_between(this_old_ip
,
1262 this_old_ip
+ (jip_compacted
/ 2),
1264 brw_inst_set_jip(brw
, insn
, jip_compacted
<< shift
);
1266 if (brw_inst_opcode(brw
, insn
) == BRW_OPCODE_ENDIF
||
1267 brw_inst_opcode(brw
, insn
) == BRW_OPCODE_WHILE
||
1268 (brw_inst_opcode(brw
, insn
) == BRW_OPCODE_ELSE
&& brw
->gen
<= 7))
1271 int32_t uip_compacted
= brw_inst_uip(brw
, insn
) >> shift
;
1272 uip_compacted
-= compacted_between(this_old_ip
,
1273 this_old_ip
+ (uip_compacted
/ 2),
1275 brw_inst_set_uip(brw
, insn
, uip_compacted
<< shift
);
1279 update_gen4_jump_count(struct brw_context
*brw
, brw_inst
*insn
,
1280 int this_old_ip
, int *compacted_counts
)
1282 assert(brw
->gen
== 5 || brw
->is_g4x
);
1284 /* Jump Count is in units of:
1285 * - uncompacted instructions on G45; and
1286 * - compacted instructions on Gen5.
1288 int shift
= brw
->is_g4x
? 1 : 0;
1290 int jump_count_compacted
= brw_inst_gen4_jump_count(brw
, insn
) << shift
;
1292 int target_old_ip
= this_old_ip
+ (jump_count_compacted
/ 2);
1294 int this_compacted_count
= compacted_counts
[this_old_ip
];
1295 int target_compacted_count
= compacted_counts
[target_old_ip
];
1297 jump_count_compacted
-= (target_compacted_count
- this_compacted_count
);
1298 brw_inst_set_gen4_jump_count(brw
, insn
, jump_count_compacted
>> shift
);
1302 brw_init_compaction_tables(struct brw_context
*brw
)
1304 static bool initialized
;
1305 if (initialized
|| p_atomic_cmpxchg(&initialized
, false, true) != false)
1308 assert(g45_control_index_table
[ARRAY_SIZE(g45_control_index_table
) - 1] != 0);
1309 assert(g45_datatype_table
[ARRAY_SIZE(g45_datatype_table
) - 1] != 0);
1310 assert(g45_subreg_table
[ARRAY_SIZE(g45_subreg_table
) - 1] != 0);
1311 assert(g45_src_index_table
[ARRAY_SIZE(g45_src_index_table
) - 1] != 0);
1312 assert(gen6_control_index_table
[ARRAY_SIZE(gen6_control_index_table
) - 1] != 0);
1313 assert(gen6_datatype_table
[ARRAY_SIZE(gen6_datatype_table
) - 1] != 0);
1314 assert(gen6_subreg_table
[ARRAY_SIZE(gen6_subreg_table
) - 1] != 0);
1315 assert(gen6_src_index_table
[ARRAY_SIZE(gen6_src_index_table
) - 1] != 0);
1316 assert(gen7_control_index_table
[ARRAY_SIZE(gen7_control_index_table
) - 1] != 0);
1317 assert(gen7_datatype_table
[ARRAY_SIZE(gen7_datatype_table
) - 1] != 0);
1318 assert(gen7_subreg_table
[ARRAY_SIZE(gen7_subreg_table
) - 1] != 0);
1319 assert(gen7_src_index_table
[ARRAY_SIZE(gen7_src_index_table
) - 1] != 0);
1320 assert(gen8_control_index_table
[ARRAY_SIZE(gen8_control_index_table
) - 1] != 0);
1321 assert(gen8_datatype_table
[ARRAY_SIZE(gen8_datatype_table
) - 1] != 0);
1322 assert(gen8_subreg_table
[ARRAY_SIZE(gen8_subreg_table
) - 1] != 0);
1323 assert(gen8_src_index_table
[ARRAY_SIZE(gen8_src_index_table
) - 1] != 0);
1328 control_index_table
= gen8_control_index_table
;
1329 datatype_table
= gen8_datatype_table
;
1330 subreg_table
= gen8_subreg_table
;
1331 src_index_table
= gen8_src_index_table
;
1334 control_index_table
= gen7_control_index_table
;
1335 datatype_table
= gen7_datatype_table
;
1336 subreg_table
= gen7_subreg_table
;
1337 src_index_table
= gen7_src_index_table
;
1340 control_index_table
= gen6_control_index_table
;
1341 datatype_table
= gen6_datatype_table
;
1342 subreg_table
= gen6_subreg_table
;
1343 src_index_table
= gen6_src_index_table
;
1347 control_index_table
= g45_control_index_table
;
1348 datatype_table
= g45_datatype_table
;
1349 subreg_table
= g45_subreg_table
;
1350 src_index_table
= g45_src_index_table
;
1353 unreachable("unknown generation");
1358 brw_compact_instructions(struct brw_compile
*p
, int start_offset
,
1359 int num_annotations
, struct annotation
*annotation
)
1361 struct brw_context
*brw
= p
->brw
;
1362 void *store
= p
->store
+ start_offset
/ 16;
1363 /* For an instruction at byte offset 16*i before compaction, this is the
1364 * number of compacted instructions minus the number of padding NOP/NENOPs
1367 int compacted_counts
[(p
->next_insn_offset
- start_offset
) / sizeof(brw_inst
)];
1368 /* For an instruction at byte offset 8*i after compaction, this was its IP
1369 * (in 16-byte units) before compaction.
1371 int old_ip
[(p
->next_insn_offset
- start_offset
) / sizeof(brw_compact_inst
)];
1373 if (brw
->gen
== 4 && !brw
->is_g4x
)
1377 int compacted_count
= 0;
1378 for (int src_offset
= 0; src_offset
< p
->next_insn_offset
- start_offset
;
1379 src_offset
+= sizeof(brw_inst
)) {
1380 brw_inst
*src
= store
+ src_offset
;
1381 void *dst
= store
+ offset
;
1383 old_ip
[offset
/ sizeof(brw_compact_inst
)] = src_offset
/ sizeof(brw_inst
);
1384 compacted_counts
[src_offset
/ sizeof(brw_inst
)] = compacted_count
;
1386 brw_inst saved
= *src
;
1388 if (brw_try_compact_instruction(brw
, dst
, src
)) {
1392 brw_inst uncompacted
;
1393 brw_uncompact_instruction(brw
, &uncompacted
, dst
);
1394 if (memcmp(&saved
, &uncompacted
, sizeof(uncompacted
))) {
1395 brw_debug_compact_uncompact(brw
, &saved
, &uncompacted
);
1399 offset
+= sizeof(brw_compact_inst
);
1401 /* It appears that the end of thread SEND instruction needs to be
1402 * aligned, or the GPU hangs. All uncompacted instructions need to be
1405 if ((offset
& sizeof(brw_compact_inst
)) != 0 &&
1406 (((brw_inst_opcode(brw
, src
) == BRW_OPCODE_SEND
||
1407 brw_inst_opcode(brw
, src
) == BRW_OPCODE_SENDC
) &&
1408 brw_inst_eot(brw
, src
)) ||
1410 brw_compact_inst
*align
= store
+ offset
;
1411 memset(align
, 0, sizeof(*align
));
1412 brw_compact_inst_set_opcode(align
, brw
->is_g4x
? BRW_OPCODE_NENOP
:
1414 brw_compact_inst_set_cmpt_control(align
, true);
1415 offset
+= sizeof(brw_compact_inst
);
1417 compacted_counts
[src_offset
/ sizeof(brw_inst
)] = compacted_count
;
1418 old_ip
[offset
/ sizeof(brw_compact_inst
)] = src_offset
/ sizeof(brw_inst
);
1420 dst
= store
+ offset
;
1423 /* If we didn't compact this instruction, we need to move it down into
1426 if (offset
!= src_offset
) {
1427 memmove(dst
, src
, sizeof(brw_inst
));
1429 offset
+= sizeof(brw_inst
);
1433 /* Fix up control flow offsets. */
1434 p
->next_insn_offset
= start_offset
+ offset
;
1435 for (offset
= 0; offset
< p
->next_insn_offset
- start_offset
;
1436 offset
= next_offset(brw
, store
, offset
)) {
1437 brw_inst
*insn
= store
+ offset
;
1438 int this_old_ip
= old_ip
[offset
/ sizeof(brw_compact_inst
)];
1439 int this_compacted_count
= compacted_counts
[this_old_ip
];
1441 switch (brw_inst_opcode(brw
, insn
)) {
1442 case BRW_OPCODE_BREAK
:
1443 case BRW_OPCODE_CONTINUE
:
1444 case BRW_OPCODE_HALT
:
1445 if (brw
->gen
>= 6) {
1446 update_uip_jip(brw
, insn
, this_old_ip
, compacted_counts
);
1448 update_gen4_jump_count(brw
, insn
, this_old_ip
, compacted_counts
);
1453 case BRW_OPCODE_IFF
:
1454 case BRW_OPCODE_ELSE
:
1455 case BRW_OPCODE_ENDIF
:
1456 case BRW_OPCODE_WHILE
:
1457 if (brw
->gen
>= 7) {
1458 if (brw_inst_cmpt_control(brw
, insn
)) {
1459 brw_inst uncompacted
;
1460 brw_uncompact_instruction(brw
, &uncompacted
,
1461 (brw_compact_inst
*)insn
);
1463 update_uip_jip(brw
, &uncompacted
, this_old_ip
, compacted_counts
);
1465 bool ret
= brw_try_compact_instruction(brw
,
1466 (brw_compact_inst
*)insn
,
1468 assert(ret
); (void)ret
;
1470 update_uip_jip(brw
, insn
, this_old_ip
, compacted_counts
);
1472 } else if (brw
->gen
== 6) {
1473 assert(!brw_inst_cmpt_control(brw
, insn
));
1475 /* Jump Count is in units of compacted instructions on Gen6. */
1476 int jump_count_compacted
= brw_inst_gen6_jump_count(brw
, insn
);
1478 int target_old_ip
= this_old_ip
+ (jump_count_compacted
/ 2);
1479 int target_compacted_count
= compacted_counts
[target_old_ip
];
1480 jump_count_compacted
-= (target_compacted_count
- this_compacted_count
);
1481 brw_inst_set_gen6_jump_count(brw
, insn
, jump_count_compacted
);
1483 update_gen4_jump_count(brw
, insn
, this_old_ip
, compacted_counts
);
1487 case BRW_OPCODE_ADD
:
1488 /* Add instructions modifying the IP register use an immediate src1,
1489 * and Gens that use this cannot compact instructions with immediate
1492 if (brw_inst_cmpt_control(brw
, insn
))
1495 if (brw_inst_dst_reg_file(brw
, insn
) == BRW_ARCHITECTURE_REGISTER_FILE
&&
1496 brw_inst_dst_da_reg_nr(brw
, insn
) == BRW_ARF_IP
) {
1497 assert(brw_inst_src1_reg_file(brw
, insn
) == BRW_IMMEDIATE_VALUE
);
1500 int jump_compacted
= brw_inst_imm_d(brw
, insn
) >> shift
;
1502 int target_old_ip
= this_old_ip
+ (jump_compacted
/ 2);
1503 int target_compacted_count
= compacted_counts
[target_old_ip
];
1504 jump_compacted
-= (target_compacted_count
- this_compacted_count
);
1505 brw_inst_set_imm_ud(brw
, insn
, jump_compacted
<< shift
);
1511 /* p->nr_insn is counting the number of uncompacted instructions still, so
1512 * divide. We do want to be sure there's a valid instruction in any
1513 * alignment padding, so that the next compression pass (for the FS 8/16
1514 * compile passes) parses correctly.
1516 if (p
->next_insn_offset
& sizeof(brw_compact_inst
)) {
1517 brw_compact_inst
*align
= store
+ offset
;
1518 memset(align
, 0, sizeof(*align
));
1519 brw_compact_inst_set_opcode(align
, BRW_OPCODE_NOP
);
1520 brw_compact_inst_set_cmpt_control(align
, true);
1521 p
->next_insn_offset
+= sizeof(brw_compact_inst
);
1523 p
->nr_insn
= p
->next_insn_offset
/ sizeof(brw_inst
);
1525 /* Update the instruction offsets for each annotation. */
1527 for (int offset
= 0, i
= 0; i
< num_annotations
; i
++) {
1528 while (start_offset
+ old_ip
[offset
/ sizeof(brw_compact_inst
)] *
1529 sizeof(brw_inst
) != annotation
[i
].offset
) {
1530 assert(start_offset
+ old_ip
[offset
/ sizeof(brw_compact_inst
)] *
1531 sizeof(brw_inst
) < annotation
[i
].offset
);
1532 offset
= next_offset(brw
, store
, offset
);
1535 annotation
[i
].offset
= start_offset
+ offset
;
1537 offset
= next_offset(brw
, store
, offset
);
1540 annotation
[num_annotations
].offset
= p
->next_insn_offset
;