2 * Copyright © 2012 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 /** @file brw_eu_compact.c
26 * Instruction compaction is a feature of gm45 and newer hardware that allows
27 * for a smaller instruction encoding.
29 * The instruction cache is on the order of 32KB, and many programs generate
30 * far more instructions than that. The instruction cache is built to barely
31 * keep up with instruction dispatch ability in cache hit cases -- L1
32 * instruction cache misses that still hit in the next level could limit
33 * throughput by around 50%.
35 * The idea of instruction compaction is that most instructions use a tiny
36 * subset of the GPU functionality, so we can encode what would be a 16 byte
37 * instruction in 8 bytes using some lookup tables for various fields.
40 #include "brw_context.h"
42 #include "intel_asm_annotation.h"
44 static const uint32_t gen6_control_index_table
[32] = {
79 static const uint32_t gen6_datatype_table
[32] = {
100 0b001011011000101100,
101 0b001011010110100101,
102 0b001011110110100101,
103 0b001111011110111101,
104 0b001111011110111100,
105 0b001111011110111101,
106 0b001111011110011101,
107 0b001111011110111110,
108 0b001000000000100001,
109 0b001000000000100010,
110 0b001001111111011101,
111 0b001000001110111110,
114 static const uint16_t gen6_subreg_table
[32] = {
149 static const uint16_t gen6_src_index_table
[32] = {
184 static const uint32_t gen7_control_index_table
[32] = {
185 0b0000000000000000010,
186 0b0000100000000000000,
187 0b0000100000000000001,
188 0b0000100000000000010,
189 0b0000100000000000011,
190 0b0000100000000000100,
191 0b0000100000000000101,
192 0b0000100000000000111,
193 0b0000100000000001000,
194 0b0000100000000001001,
195 0b0000100000000001101,
196 0b0000110000000000000,
197 0b0000110000000000001,
198 0b0000110000000000010,
199 0b0000110000000000011,
200 0b0000110000000000100,
201 0b0000110000000000101,
202 0b0000110000000000111,
203 0b0000110000000001001,
204 0b0000110000000001101,
205 0b0000110000000010000,
206 0b0000110000100000000,
207 0b0001000000000000000,
208 0b0001000000000000010,
209 0b0001000000000000100,
210 0b0001000000100000000,
211 0b0010110000000000000,
212 0b0010110000000010000,
213 0b0011000000000000000,
214 0b0011000000100000000,
215 0b0101000000000000000,
216 0b0101000000100000000
219 static const uint32_t gen7_datatype_table
[32] = {
220 0b001000000000000001,
221 0b001000000000100000,
222 0b001000000000100001,
223 0b001000000001100001,
224 0b001000000010111101,
225 0b001000001011111101,
226 0b001000001110100001,
227 0b001000001110100101,
228 0b001000001110111101,
229 0b001000010000100001,
230 0b001000110000100000,
231 0b001000110000100001,
232 0b001001010010100101,
233 0b001001110010100100,
234 0b001001110010100101,
235 0b001111001110111101,
236 0b001111011110011101,
237 0b001111011110111100,
238 0b001111011110111101,
239 0b001111111110111100,
240 0b000000001000001100,
241 0b001000000000111101,
242 0b001000000010100101,
243 0b001000010000100000,
244 0b001001010010100100,
245 0b001001110010000100,
246 0b001010010100001001,
247 0b001101111110111101,
248 0b001111111110111101,
249 0b001011110110101100,
250 0b001010010100101000,
254 static const uint16_t gen7_subreg_table
[32] = {
289 static const uint16_t gen7_src_index_table
[32] = {
324 static const uint32_t gen8_control_index_table
[32] = {
325 0b0000000000000000010,
326 0b0000100000000000000,
327 0b0000100000000000001,
328 0b0000100000000000010,
329 0b0000100000000000011,
330 0b0000100000000000100,
331 0b0000100000000000101,
332 0b0000100000000000111,
333 0b0000100000000001000,
334 0b0000100000000001001,
335 0b0000100000000001101,
336 0b0000110000000000000,
337 0b0000110000000000001,
338 0b0000110000000000010,
339 0b0000110000000000011,
340 0b0000110000000000100,
341 0b0000110000000000101,
342 0b0000110000000000111,
343 0b0000110000000001001,
344 0b0000110000000001101,
345 0b0000110000000010000,
346 0b0000110000100000000,
347 0b0001000000000000000,
348 0b0001000000000000010,
349 0b0001000000000000100,
350 0b0001000000100000000,
351 0b0010110000000000000,
352 0b0010110000000010000,
353 0b0011000000000000000,
354 0b0011000000100000000,
355 0b0101000000000000000,
356 0b0101000000100000000
359 static const uint32_t gen8_datatype_table
[32] = {
360 0b001000000000000000001,
361 0b001000000000001000000,
362 0b001000000000001000001,
363 0b001000000000011000001,
364 0b001000000000101011101,
365 0b001000000010111011101,
366 0b001000000011101000001,
367 0b001000000011101000101,
368 0b001000000011101011101,
369 0b001000001000001000001,
370 0b001000011000001000000,
371 0b001000011000001000001,
372 0b001000101000101000101,
373 0b001000111000101000100,
374 0b001000111000101000101,
375 0b001011100011101011101,
376 0b001011101011100011101,
377 0b001011101011101011100,
378 0b001011101011101011101,
379 0b001011111011101011100,
380 0b000000000010000001100,
381 0b001000000000001011101,
382 0b001000000000101000101,
383 0b001000001000001000000,
384 0b001000101000101000100,
385 0b001000111000100000100,
386 0b001001001001000001001,
387 0b001010111011101011101,
388 0b001011111011101011101,
389 0b001001111001101001100,
390 0b001001001001001001000,
391 0b001001011001001001000
394 static const uint16_t gen8_subreg_table
[32] = {
429 static const uint16_t gen8_src_index_table
[32] = {
/* Compaction lookup table for the control bits of 3-source instructions.
 *
 * This is actually the control index table for Cherryview (26 bits), but the
 * only difference from Broadwell (24 bits) is that it has two extra 0-bits at
 * the top (every entry below has its two highest bits clear).
 *
 * The low 24 bits have the same mappings on both hardware.
 */
static const uint32_t gen8_3src_control_index_table[4] = {
   0b00100000000110000000000001,
   0b00000000000110000000000001,
   0b00000000001000000000000001,
   0b00000000001000000000100001
};
/* Compaction lookup table for the source bits of 3-source instructions.
 *
 * This is actually the source index table for Cherryview (49 bits), but the
 * only difference from Broadwell (46 bits) is that it has three extra 0-bits
 * at the top (every entry below has its three highest bits clear).
 *
 * The low 44 bits have the same mappings on both hardware, and since the high
 * three bits on Broadwell are zero, we can reuse Cherryview's table.
 */
static const uint64_t gen8_3src_source_index_table[4] = {
   0b0000001110010011100100111001000001111000000000000,
   0b0000001110010011100100111001000001111000000000010,
   0b0000001110010011100100111001000001111000000001000,
   0b0000001110010011100100111001000001111000000100000
};
491 static const uint32_t *control_index_table
;
492 static const uint32_t *datatype_table
;
493 static const uint16_t *subreg_table
;
494 static const uint16_t *src_index_table
;
497 set_control_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
499 uint32_t uncompacted
= brw
->gen
>= 8 /* 17b/SNB; 19b/IVB+ */
500 ? (brw_inst_bits(src
, 33, 31) << 16) | /* 3b */
501 (brw_inst_bits(src
, 23, 12) << 4) | /* 12b */
502 (brw_inst_bits(src
, 10, 9) << 2) | /* 2b */
503 (brw_inst_bits(src
, 34, 34) << 1) | /* 1b */
504 (brw_inst_bits(src
, 8, 8)) /* 1b */
505 : (brw_inst_bits(src
, 31, 31) << 16) | /* 1b */
506 (brw_inst_bits(src
, 23, 8)); /* 16b */
508 /* On gen7, the flag register and subregister numbers are integrated into
512 uncompacted
|= brw_inst_bits(src
, 90, 89) << 17; /* 2b */
514 for (int i
= 0; i
< 32; i
++) {
515 if (control_index_table
[i
] == uncompacted
) {
516 brw_compact_inst_set_control_index(dst
, i
);
525 set_datatype_index(struct brw_context
*brw
, brw_compact_inst
*dst
,
528 uint32_t uncompacted
= brw
->gen
>= 8 /* 18b/SNB+; 21b/BDW+ */
529 ? (brw_inst_bits(src
, 63, 61) << 18) | /* 3b */
530 (brw_inst_bits(src
, 94, 89) << 12) | /* 6b */
531 (brw_inst_bits(src
, 46, 35)) /* 12b */
532 : (brw_inst_bits(src
, 63, 61) << 15) | /* 3b */
533 (brw_inst_bits(src
, 46, 32)); /* 15b */
535 for (int i
= 0; i
< 32; i
++) {
536 if (datatype_table
[i
] == uncompacted
) {
537 brw_compact_inst_set_datatype_index(dst
, i
);
546 set_subreg_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
,
549 uint16_t uncompacted
= /* 15b */
550 (brw_inst_bits(src
, 52, 48) << 0) | /* 5b */
551 (brw_inst_bits(src
, 68, 64) << 5); /* 5b */
554 uncompacted
|= brw_inst_bits(src
, 100, 96) << 10; /* 5b */
556 for (int i
= 0; i
< 32; i
++) {
557 if (subreg_table
[i
] == uncompacted
) {
558 brw_compact_inst_set_subreg_index(dst
, i
);
567 get_src_index(uint16_t uncompacted
,
570 for (int i
= 0; i
< 32; i
++) {
571 if (src_index_table
[i
] == uncompacted
) {
581 set_src0_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
584 uint16_t uncompacted
= brw_inst_bits(src
, 88, 77); /* 12b */
586 if (!get_src_index(uncompacted
, &compacted
))
589 brw_compact_inst_set_src0_index(dst
, compacted
);
595 set_src1_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
,
601 compacted
= (brw_inst_imm_ud(brw
, src
) >> 8) & 0x1f;
603 uint16_t uncompacted
= brw_inst_bits(src
, 120, 109); /* 12b */
605 if (!get_src_index(uncompacted
, &compacted
))
609 brw_compact_inst_set_src1_index(dst
, compacted
);
615 set_3src_control_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
617 assert(brw
->gen
>= 8);
619 uint32_t uncompacted
= /* 24b/BDW; 26b/CHV */
620 (brw_inst_bits(src
, 34, 32) << 21) | /* 3b */
621 (brw_inst_bits(src
, 28, 8)); /* 21b */
623 if (brw
->is_cherryview
)
624 uncompacted
|= brw_inst_bits(src
, 36, 35) << 24; /* 2b */
626 for (int i
= 0; i
< ARRAY_SIZE(gen8_3src_control_index_table
); i
++) {
627 if (gen8_3src_control_index_table
[i
] == uncompacted
) {
628 brw_compact_inst_set_3src_control_index(dst
, i
);
637 set_3src_source_index(struct brw_context
*brw
, brw_compact_inst
*dst
, brw_inst
*src
)
639 assert(brw
->gen
>= 8);
641 uint64_t uncompacted
= /* 46b/BDW; 49b/CHV */
642 (brw_inst_bits(src
, 83, 83) << 43) | /* 1b */
643 (brw_inst_bits(src
, 114, 107) << 35) | /* 8b */
644 (brw_inst_bits(src
, 93, 86) << 27) | /* 8b */
645 (brw_inst_bits(src
, 72, 65) << 19) | /* 8b */
646 (brw_inst_bits(src
, 55, 37)); /* 19b */
648 if (brw
->is_cherryview
) {
650 (brw_inst_bits(src
, 126, 125) << 47) | /* 2b */
651 (brw_inst_bits(src
, 105, 104) << 45) | /* 2b */
652 (brw_inst_bits(src
, 84, 84) << 44); /* 1b */
655 (brw_inst_bits(src
, 125, 125) << 45) | /* 1b */
656 (brw_inst_bits(src
, 104, 104) << 44); /* 1b */
659 for (int i
= 0; i
< ARRAY_SIZE(gen8_3src_source_index_table
); i
++) {
660 if (gen8_3src_source_index_table
[i
] == uncompacted
) {
661 brw_compact_inst_set_3src_source_index(dst
, i
);
670 brw_try_compact_3src_instruction(struct brw_context
*brw
, brw_compact_inst
*dst
,
673 assert(brw
->gen
>= 8);
675 #define compact(field) \
676 brw_compact_inst_set_3src_##field(dst, brw_inst_3src_##field(brw, src))
680 if (!set_3src_control_index(brw
, dst
, src
))
683 if (!set_3src_source_index(brw
, dst
, src
))
687 compact(src0_rep_ctrl
);
688 brw_compact_inst_set_3src_cmpt_control(dst
, true);
689 compact(debug_control
);
691 compact(src1_rep_ctrl
);
692 compact(src2_rep_ctrl
);
693 compact(src0_reg_nr
);
694 compact(src1_reg_nr
);
695 compact(src2_reg_nr
);
696 compact(src0_subreg_nr
);
697 compact(src1_subreg_nr
);
698 compact(src2_subreg_nr
);
/**
 * Returns whether a 32-bit immediate can be carried by a compacted
 * instruction.
 *
 * Compacted instructions have 12-bits for immediate sources, and a 13th bit
 * that's replicated through the high 20 bits.
 *
 * Effectively this means we get 12-bit integers, 0.0f, and some limited uses
 * of packed vectors as compactable immediates.
 */
static bool
is_compactable_immediate(unsigned imm)
{
   /* We get the low 12 bits as-is, so they never affect the answer. */
   imm &= ~0xfffu;

   /* We get one bit replicated through the top 20 bits, so those bits must
    * be either all clear or all set.
    */
   return imm == 0 || imm == 0xfffff000;
}
721 /* Returns whether an opcode takes three sources. */
725 return opcode_descs
[op
].nsrc
== 3;
729 * Tries to compact instruction src into dst.
731 * It doesn't modify dst unless src is compactable, which is relied on by
732 * brw_compact_instructions().
735 brw_try_compact_instruction(struct brw_context
*brw
, brw_compact_inst
*dst
,
738 brw_compact_inst temp
;
740 if (brw_inst_opcode(brw
, src
) == BRW_OPCODE_IF
||
741 brw_inst_opcode(brw
, src
) == BRW_OPCODE_ELSE
||
742 brw_inst_opcode(brw
, src
) == BRW_OPCODE_ENDIF
||
743 brw_inst_opcode(brw
, src
) == BRW_OPCODE_HALT
||
744 brw_inst_opcode(brw
, src
) == BRW_OPCODE_DO
||
745 brw_inst_opcode(brw
, src
) == BRW_OPCODE_WHILE
) {
746 /* FINISHME: The fixup code below, and brw_set_uip_jip and friends, need
747 * to be able to handle compacted flow control instructions.
752 if (brw
->gen
>= 8 && is_3src(brw_inst_opcode(brw
, src
))) {
753 memset(&temp
, 0, sizeof(temp
));
754 if (brw_try_compact_3src_instruction(brw
, &temp
, src
)) {
763 brw_inst_src0_reg_file(brw
, src
) == BRW_IMMEDIATE_VALUE
||
764 brw_inst_src1_reg_file(brw
, src
) == BRW_IMMEDIATE_VALUE
;
765 if (is_immediate
&& !is_compactable_immediate(brw_inst_imm_ud(brw
, src
))) {
769 memset(&temp
, 0, sizeof(temp
));
771 brw_compact_inst_set_opcode(&temp
, brw_inst_opcode(brw
, src
));
772 brw_compact_inst_set_debug_control(&temp
, brw_inst_debug_control(brw
, src
));
773 if (!set_control_index(brw
, &temp
, src
))
775 if (!set_datatype_index(brw
, &temp
, src
))
777 if (!set_subreg_index(brw
, &temp
, src
, is_immediate
))
779 brw_compact_inst_set_acc_wr_control(&temp
,
780 brw_inst_acc_wr_control(brw
, src
));
781 brw_compact_inst_set_cond_modifier(&temp
, brw_inst_cond_modifier(brw
, src
));
783 brw_compact_inst_set_flag_subreg_nr(&temp
,
784 brw_inst_flag_subreg_nr(brw
, src
));
785 brw_compact_inst_set_cmpt_control(&temp
, true);
786 if (!set_src0_index(brw
, &temp
, src
))
788 if (!set_src1_index(brw
, &temp
, src
, is_immediate
))
790 brw_compact_inst_set_dst_reg_nr(&temp
, brw_inst_dst_da_reg_nr(brw
, src
));
791 brw_compact_inst_set_src0_reg_nr(&temp
, brw_inst_src0_da_reg_nr(brw
, src
));
793 brw_compact_inst_set_src1_reg_nr(&temp
, brw_inst_imm_ud(brw
, src
) & 0xff);
795 brw_compact_inst_set_src1_reg_nr(&temp
,
796 brw_inst_src1_da_reg_nr(brw
, src
));
805 set_uncompacted_control(struct brw_context
*brw
, brw_inst
*dst
,
806 brw_compact_inst
*src
)
808 uint32_t uncompacted
=
809 control_index_table
[brw_compact_inst_control_index(src
)];
812 brw_inst_set_bits(dst
, 33, 31, (uncompacted
>> 16));
813 brw_inst_set_bits(dst
, 23, 12, (uncompacted
>> 4) & 0xfff);
814 brw_inst_set_bits(dst
, 10, 9, (uncompacted
>> 2) & 0x3);
815 brw_inst_set_bits(dst
, 34, 34, (uncompacted
>> 1) & 0x1);
816 brw_inst_set_bits(dst
, 8, 8, (uncompacted
>> 0) & 0x1);
818 brw_inst_set_bits(dst
, 31, 31, (uncompacted
>> 16) & 0x1);
819 brw_inst_set_bits(dst
, 23, 8, (uncompacted
& 0xffff));
822 brw_inst_set_bits(dst
, 90, 89, uncompacted
>> 17);
827 set_uncompacted_datatype(struct brw_context
*brw
, brw_inst
*dst
,
828 brw_compact_inst
*src
)
830 uint32_t uncompacted
= datatype_table
[brw_compact_inst_datatype_index(src
)];
833 brw_inst_set_bits(dst
, 63, 61, (uncompacted
>> 18));
834 brw_inst_set_bits(dst
, 94, 89, (uncompacted
>> 12) & 0x3f);
835 brw_inst_set_bits(dst
, 46, 35, (uncompacted
>> 0) & 0xfff);
837 brw_inst_set_bits(dst
, 63, 61, (uncompacted
>> 15));
838 brw_inst_set_bits(dst
, 46, 32, (uncompacted
& 0x7fff));
843 set_uncompacted_subreg(struct brw_context
*brw
, brw_inst
*dst
,
844 brw_compact_inst
*src
)
846 uint16_t uncompacted
= subreg_table
[brw_compact_inst_subreg_index(src
)];
848 brw_inst_set_bits(dst
, 100, 96, (uncompacted
>> 10));
849 brw_inst_set_bits(dst
, 68, 64, (uncompacted
>> 5) & 0x1f);
850 brw_inst_set_bits(dst
, 52, 48, (uncompacted
>> 0) & 0x1f);
854 set_uncompacted_src0(struct brw_context
*brw
, brw_inst
*dst
,
855 brw_compact_inst
*src
)
857 uint32_t compacted
= brw_compact_inst_src0_index(src
);
858 uint16_t uncompacted
= src_index_table
[compacted
];
860 brw_inst_set_bits(dst
, 88, 77, uncompacted
);
864 set_uncompacted_src1(struct brw_context
*brw
, brw_inst
*dst
,
865 brw_compact_inst
*src
, bool is_immediate
)
868 signed high5
= brw_compact_inst_src1_index(src
);
869 /* Replicate top bit of src1_index into high 20 bits of the immediate. */
870 brw_inst_set_imm_ud(brw
, dst
, (high5
<< 27) >> 19);
872 uint16_t uncompacted
= src_index_table
[brw_compact_inst_src1_index(src
)];
874 brw_inst_set_bits(dst
, 120, 109, uncompacted
);
879 set_uncompacted_3src_control_index(struct brw_context
*brw
, brw_inst
*dst
,
880 brw_compact_inst
*src
)
882 assert(brw
->gen
>= 8);
884 uint32_t compacted
= brw_compact_inst_3src_control_index(src
);
885 uint32_t uncompacted
= gen8_3src_control_index_table
[compacted
];
887 brw_inst_set_bits(dst
, 34, 32, (uncompacted
>> 21) & 0x7);
888 brw_inst_set_bits(dst
, 28, 8, (uncompacted
>> 0) & 0x1fffff);
890 if (brw
->is_cherryview
)
891 brw_inst_set_bits(dst
, 36, 35, (uncompacted
>> 24) & 0x3);
895 set_uncompacted_3src_source_index(struct brw_context
*brw
, brw_inst
*dst
,
896 brw_compact_inst
*src
)
898 assert(brw
->gen
>= 8);
900 uint32_t compacted
= brw_compact_inst_3src_source_index(src
);
901 uint64_t uncompacted
= gen8_3src_source_index_table
[compacted
];
903 brw_inst_set_bits(dst
, 83, 83, (uncompacted
>> 43) & 0x1);
904 brw_inst_set_bits(dst
, 114, 107, (uncompacted
>> 35) & 0xff);
905 brw_inst_set_bits(dst
, 93, 86, (uncompacted
>> 27) & 0xff);
906 brw_inst_set_bits(dst
, 72, 65, (uncompacted
>> 19) & 0xff);
907 brw_inst_set_bits(dst
, 55, 37, (uncompacted
>> 0) & 0x7ffff);
909 if (brw
->is_cherryview
) {
910 brw_inst_set_bits(dst
, 126, 125, (uncompacted
>> 47) & 0x3);
911 brw_inst_set_bits(dst
, 105, 104, (uncompacted
>> 45) & 0x3);
912 brw_inst_set_bits(dst
, 84, 84, (uncompacted
>> 44) & 0x1);
914 brw_inst_set_bits(dst
, 125, 125, (uncompacted
>> 45) & 0x1);
915 brw_inst_set_bits(dst
, 104, 104, (uncompacted
>> 44) & 0x1);
920 brw_uncompact_3src_instruction(struct brw_context
*brw
, brw_inst
*dst
,
921 brw_compact_inst
*src
)
923 assert(brw
->gen
>= 8);
925 #define uncompact(field) \
926 brw_inst_set_3src_##field(brw, dst, brw_compact_inst_3src_##field(src))
930 set_uncompacted_3src_control_index(brw
, dst
, src
);
931 set_uncompacted_3src_source_index(brw
, dst
, src
);
933 uncompact(dst_reg_nr
);
934 uncompact(src0_rep_ctrl
);
935 brw_inst_set_3src_cmpt_control(brw
, dst
, false);
936 uncompact(debug_control
);
938 uncompact(src1_rep_ctrl
);
939 uncompact(src2_rep_ctrl
);
940 uncompact(src0_reg_nr
);
941 uncompact(src1_reg_nr
);
942 uncompact(src2_reg_nr
);
943 uncompact(src0_subreg_nr
);
944 uncompact(src1_subreg_nr
);
945 uncompact(src2_subreg_nr
);
951 brw_uncompact_instruction(struct brw_context
*brw
, brw_inst
*dst
,
952 brw_compact_inst
*src
)
954 memset(dst
, 0, sizeof(*dst
));
956 if (brw
->gen
>= 8 && is_3src(brw_compact_inst_3src_opcode(src
))) {
957 brw_uncompact_3src_instruction(brw
, dst
, src
);
961 brw_inst_set_opcode(brw
, dst
, brw_compact_inst_opcode(src
));
962 brw_inst_set_debug_control(brw
, dst
, brw_compact_inst_debug_control(src
));
964 set_uncompacted_control(brw
, dst
, src
);
965 set_uncompacted_datatype(brw
, dst
, src
);
967 /* src0/1 register file fields are in the datatype table. */
968 bool is_immediate
= brw_inst_src0_reg_file(brw
, dst
) == BRW_IMMEDIATE_VALUE
||
969 brw_inst_src1_reg_file(brw
, dst
) == BRW_IMMEDIATE_VALUE
;
971 set_uncompacted_subreg(brw
, dst
, src
);
972 brw_inst_set_acc_wr_control(brw
, dst
, brw_compact_inst_acc_wr_control(src
));
973 brw_inst_set_cond_modifier(brw
, dst
, brw_compact_inst_cond_modifier(src
));
975 brw_inst_set_flag_subreg_nr(brw
, dst
,
976 brw_compact_inst_flag_subreg_nr(src
));
977 set_uncompacted_src0(brw
, dst
, src
);
978 set_uncompacted_src1(brw
, dst
, src
, is_immediate
);
979 brw_inst_set_dst_da_reg_nr(brw
, dst
, brw_compact_inst_dst_reg_nr(src
));
980 brw_inst_set_src0_da_reg_nr(brw
, dst
, brw_compact_inst_src0_reg_nr(src
));
982 brw_inst_set_imm_ud(brw
, dst
,
983 brw_inst_imm_ud(brw
, dst
) |
984 brw_compact_inst_src1_reg_nr(src
));
986 brw_inst_set_src1_da_reg_nr(brw
, dst
, brw_compact_inst_src1_reg_nr(src
));
990 void brw_debug_compact_uncompact(struct brw_context
*brw
,
992 brw_inst
*uncompacted
)
994 fprintf(stderr
, "Instruction compact/uncompact changed (gen%d):\n",
997 fprintf(stderr
, " before: ");
998 brw_disassemble_inst(stderr
, brw
, orig
, true);
1000 fprintf(stderr
, " after: ");
1001 brw_disassemble_inst(stderr
, brw
, uncompacted
, false);
1003 uint32_t *before_bits
= (uint32_t *)orig
;
1004 uint32_t *after_bits
= (uint32_t *)uncompacted
;
1005 fprintf(stderr
, " changed bits:\n");
1006 for (int i
= 0; i
< 128; i
++) {
1007 uint32_t before
= before_bits
[i
/ 32] & (1 << (i
& 31));
1008 uint32_t after
= after_bits
[i
/ 32] & (1 << (i
& 31));
1010 if (before
!= after
) {
1011 fprintf(stderr
, " bit %d, %s to %s\n", i
,
1012 before
? "set" : "unset",
1013 after
? "set" : "unset");
/**
 * Returns how many instructions were compacted between two pre-compaction
 * instruction slots.
 *
 * \param old_ip            index of the starting instruction into
 *                          \p compacted_counts
 * \param old_target_ip     index of the target instruction into
 *                          \p compacted_counts
 * \param compacted_counts  per-slot count of compacted instructions that
 *                          preceded that slot; only read, never written
 *
 * \return compacted_counts[old_target_ip] - compacted_counts[old_ip]; this is
 *         negative when the target's count is the smaller of the two.
 */
static int
compacted_between(int old_ip, int old_target_ip, const int *compacted_counts)
{
   return compacted_counts[old_target_ip] - compacted_counts[old_ip];
}
1027 update_uip_jip(struct brw_context
*brw
, brw_inst
*insn
,
1028 int this_old_ip
, int *compacted_counts
)
1030 int scale
= brw
->gen
>= 8 ? sizeof(brw_compact_inst
) : 1;
1032 int32_t jip
= brw_inst_jip(brw
, insn
) / scale
;
1033 jip
-= compacted_between(this_old_ip
, this_old_ip
+ jip
, compacted_counts
);
1034 brw_inst_set_jip(brw
, insn
, jip
* scale
);
1036 if (brw_inst_opcode(brw
, insn
) == BRW_OPCODE_ENDIF
||
1037 brw_inst_opcode(brw
, insn
) == BRW_OPCODE_WHILE
)
1040 int32_t uip
= brw_inst_uip(brw
, insn
) / scale
;
1041 uip
-= compacted_between(this_old_ip
, this_old_ip
+ uip
, compacted_counts
);
1042 brw_inst_set_uip(brw
, insn
, uip
* scale
);
1046 brw_init_compaction_tables(struct brw_context
*brw
)
1048 assert(gen6_control_index_table
[ARRAY_SIZE(gen6_control_index_table
) - 1] != 0);
1049 assert(gen6_datatype_table
[ARRAY_SIZE(gen6_datatype_table
) - 1] != 0);
1050 assert(gen6_subreg_table
[ARRAY_SIZE(gen6_subreg_table
) - 1] != 0);
1051 assert(gen6_src_index_table
[ARRAY_SIZE(gen6_src_index_table
) - 1] != 0);
1052 assert(gen7_control_index_table
[ARRAY_SIZE(gen7_control_index_table
) - 1] != 0);
1053 assert(gen7_datatype_table
[ARRAY_SIZE(gen7_datatype_table
) - 1] != 0);
1054 assert(gen7_subreg_table
[ARRAY_SIZE(gen7_subreg_table
) - 1] != 0);
1055 assert(gen7_src_index_table
[ARRAY_SIZE(gen7_src_index_table
) - 1] != 0);
1056 assert(gen8_control_index_table
[ARRAY_SIZE(gen8_control_index_table
) - 1] != 0);
1057 assert(gen8_datatype_table
[ARRAY_SIZE(gen8_datatype_table
) - 1] != 0);
1058 assert(gen8_subreg_table
[ARRAY_SIZE(gen8_subreg_table
) - 1] != 0);
1059 assert(gen8_src_index_table
[ARRAY_SIZE(gen8_src_index_table
) - 1] != 0);
1063 control_index_table
= gen8_control_index_table
;
1064 datatype_table
= gen8_datatype_table
;
1065 subreg_table
= gen8_subreg_table
;
1066 src_index_table
= gen8_src_index_table
;
1069 control_index_table
= gen7_control_index_table
;
1070 datatype_table
= gen7_datatype_table
;
1071 subreg_table
= gen7_subreg_table
;
1072 src_index_table
= gen7_src_index_table
;
1075 control_index_table
= gen6_control_index_table
;
1076 datatype_table
= gen6_datatype_table
;
1077 subreg_table
= gen6_subreg_table
;
1078 src_index_table
= gen6_src_index_table
;
1086 brw_compact_instructions(struct brw_compile
*p
, int start_offset
,
1087 int num_annotations
, struct annotation
*annotation
)
1089 struct brw_context
*brw
= p
->brw
;
1090 void *store
= p
->store
+ start_offset
/ 16;
1091 /* For an instruction at byte offset 8*i before compaction, this is the number
1092 * of compacted instructions that preceded it.
1094 int compacted_counts
[(p
->next_insn_offset
- start_offset
) / 8];
1095 /* For an instruction at byte offset 8*i after compaction, this is the
1096 * 8-byte offset it was at before compaction.
1098 int old_ip
[(p
->next_insn_offset
- start_offset
) / 8];
1105 int compacted_count
= 0;
1106 for (src_offset
= 0; src_offset
< p
->next_insn_offset
- start_offset
;) {
1107 brw_inst
*src
= store
+ src_offset
;
1108 void *dst
= store
+ offset
;
1110 old_ip
[offset
/ 8] = src_offset
/ 8;
1111 compacted_counts
[src_offset
/ 8] = compacted_count
;
1113 brw_inst saved
= *src
;
1115 if (!brw_inst_cmpt_control(brw
, src
) &&
1116 brw_try_compact_instruction(brw
, dst
, src
)) {
1120 brw_inst uncompacted
;
1121 brw_uncompact_instruction(brw
, &uncompacted
, dst
);
1122 if (memcmp(&saved
, &uncompacted
, sizeof(uncompacted
))) {
1123 brw_debug_compact_uncompact(brw
, &saved
, &uncompacted
);
1130 int size
= brw_inst_cmpt_control(brw
, src
) ? 8 : 16;
1132 /* It appears that the end of thread SEND instruction needs to be
1133 * aligned, or the GPU hangs.
1135 if ((brw_inst_opcode(brw
, src
) == BRW_OPCODE_SEND
||
1136 brw_inst_opcode(brw
, src
) == BRW_OPCODE_SENDC
) &&
1137 brw_inst_eot(brw
, src
) &&
1138 (offset
& 8) != 0) {
1139 brw_compact_inst
*align
= store
+ offset
;
1140 memset(align
, 0, sizeof(*align
));
1141 brw_compact_inst_set_opcode(align
, BRW_OPCODE_NOP
);
1142 brw_compact_inst_set_cmpt_control(align
, true);
1144 old_ip
[offset
/ 8] = src_offset
/ 8;
1145 dst
= store
+ offset
;
1148 /* If we didn't compact this instruction, we need to move it down into
1151 if (offset
!= src_offset
) {
1152 memmove(dst
, src
, size
);
1159 /* Fix up control flow offsets. */
1160 p
->next_insn_offset
= start_offset
+ offset
;
1161 for (offset
= 0; offset
< p
->next_insn_offset
- start_offset
;) {
1162 brw_inst
*insn
= store
+ offset
;
1163 int this_old_ip
= old_ip
[offset
/ 8];
1164 int this_compacted_count
= compacted_counts
[this_old_ip
];
1165 int target_old_ip
, target_compacted_count
;
1167 switch (brw_inst_opcode(brw
, insn
)) {
1168 case BRW_OPCODE_BREAK
:
1169 case BRW_OPCODE_CONTINUE
:
1170 case BRW_OPCODE_HALT
:
1171 update_uip_jip(brw
, insn
, this_old_ip
, compacted_counts
);
1175 case BRW_OPCODE_ELSE
:
1176 case BRW_OPCODE_ENDIF
:
1177 case BRW_OPCODE_WHILE
:
1178 if (brw
->gen
>= 7) {
1179 update_uip_jip(brw
, insn
, this_old_ip
, compacted_counts
);
1180 } else if (brw
->gen
== 6) {
1181 int gen6_jump_count
= brw_inst_gen6_jump_count(brw
, insn
);
1182 target_old_ip
= this_old_ip
+ gen6_jump_count
;
1183 target_compacted_count
= compacted_counts
[target_old_ip
];
1184 gen6_jump_count
-= (target_compacted_count
- this_compacted_count
);
1185 brw_inst_set_gen6_jump_count(brw
, insn
, gen6_jump_count
);
1190 offset
= next_offset(brw
, store
, offset
);
1193 /* p->nr_insn is counting the number of uncompacted instructions still, so
1194 * divide. We do want to be sure there's a valid instruction in any
1195 * alignment padding, so that the next compression pass (for the FS 8/16
1196 * compile passes) parses correctly.
1198 if (p
->next_insn_offset
& 8) {
1199 brw_compact_inst
*align
= store
+ offset
;
1200 memset(align
, 0, sizeof(*align
));
1201 brw_compact_inst_set_opcode(align
, BRW_OPCODE_NOP
);
1202 brw_compact_inst_set_cmpt_control(align
, true);
1203 p
->next_insn_offset
+= 8;
1205 p
->nr_insn
= p
->next_insn_offset
/ 16;
1207 /* Update the instruction offsets for each annotation. */
1209 for (int offset
= 0, i
= 0; i
< num_annotations
; i
++) {
1210 while (start_offset
+ old_ip
[offset
/ 8] * 8 != annotation
[i
].offset
) {
1211 assert(start_offset
+ old_ip
[offset
/ 8] * 8 <
1212 annotation
[i
].offset
);
1213 offset
= next_offset(brw
, store
, offset
);
1216 annotation
[i
].offset
= start_offset
+ offset
;
1218 offset
= next_offset(brw
, store
, offset
);
1221 annotation
[num_annotations
].offset
= p
->next_insn_offset
;