2 * Copyright (C) 2009 Nicolai Haehnle.
3 * Copyright 2011 Tom Stellard <tstellar@gmail.com>
7 * Permission is hereby granted, free of charge, to any person obtaining
8 * a copy of this software and associated documentation files (the
9 * "Software"), to deal in the Software without restriction, including
10 * without limitation the rights to use, copy, modify, merge, publish,
11 * distribute, sublicense, and/or sell copies of the Software, and to
12 * permit persons to whom the Software is furnished to do so, subject to
13 * the following conditions:
15 * The above copyright notice and this permission notice (including the
16 * next paragraph) shall be included in all copies or substantial
17 * portions of the Software.
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
23 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 #include "radeon_program_pair.h"
33 #include "main/glheader.h"
34 #include "program/register_allocate.h"
37 #include "r300_fragprog_swizzle.h"
38 #include "radeon_compiler.h"
39 #include "radeon_compiler_util.h"
40 #include "radeon_dataflow.h"
41 #include "radeon_list.h"
42 #include "radeon_variable.h"
46 #define DBG(...) do { if (VERBOSE) fprintf(stderr, __VA_ARGS__); } while(0)
50 struct register_info
{
51 struct live_intervals Live
[4];
54 unsigned int Allocated
:1;
56 unsigned int Index
:RC_REGISTER_INDEX_BITS
;
57 unsigned int Writemask
;
60 struct regalloc_state
{
61 struct radeon_compiler
* C
;
63 struct register_info
* Input
;
64 unsigned int NumInputs
;
66 struct register_info
* Temporary
;
67 unsigned int NumTemporaries
;
78 RC_REG_CLASS_SINGLE_PLUS_ALPHA
,
79 RC_REG_CLASS_DOUBLE_PLUS_ALPHA
,
80 RC_REG_CLASS_TRIPLE_PLUS_ALPHA
,
97 enum rc_reg_class Class
;
99 unsigned int WritemaskCount
;
101 /** This is 1 if this class is being used by the register allocator
105 /** This is the ID number assigned to this class by ra. */
108 /** List of writemasks that belong to this class */
109 unsigned int Writemasks
[3];
114 static void print_live_intervals(struct live_intervals
* src
)
116 if (!src
|| !src
->Used
) {
121 DBG("(%i,%i)", src
->Start
, src
->End
);
124 static int overlap_live_intervals(struct live_intervals
* a
, struct live_intervals
* b
)
127 DBG("overlap_live_intervals: ");
128 print_live_intervals(a
);
130 print_live_intervals(b
);
134 if (!a
->Used
|| !b
->Used
) {
135 DBG(" unused interval\n");
139 if (a
->Start
> b
->Start
) {
140 if (a
->Start
< b
->End
) {
144 } else if (b
->Start
> a
->Start
) {
145 if (b
->Start
< a
->End
) {
149 } else { /* a->Start == b->Start */
150 if (a
->Start
!= a
->End
&& b
->Start
!= b
->End
) {
156 DBG(" no overlap\n");
161 static void scan_read_callback(void * data
, struct rc_instruction
* inst
,
162 rc_register_file file
, unsigned int index
, unsigned int mask
)
164 struct regalloc_state
* s
= data
;
165 struct register_info
* reg
;
168 if (file
!= RC_FILE_INPUT
)
171 s
->Input
[index
].Used
= 1;
172 reg
= &s
->Input
[index
];
174 for (i
= 0; i
< 4; i
++) {
175 if (!((mask
>> i
) & 0x1)) {
178 reg
->Live
[i
].Used
= 1;
179 reg
->Live
[i
].Start
= 0;
181 s
->LoopEnd
> inst
->IP
? s
->LoopEnd
: inst
->IP
;
185 static void remap_register(void * data
, struct rc_instruction
* inst
,
186 rc_register_file
* file
, unsigned int * index
)
188 struct regalloc_state
* s
= data
;
189 const struct register_info
* reg
;
191 if (*file
== RC_FILE_TEMPORARY
&& s
->Simple
)
192 reg
= &s
->Temporary
[*index
];
193 else if (*file
== RC_FILE_INPUT
)
194 reg
= &s
->Input
[*index
];
198 if (reg
->Allocated
) {
203 static void alloc_input_simple(void * data
, unsigned int input
,
206 struct regalloc_state
* s
= data
;
208 if (input
>= s
->NumInputs
)
211 s
->Input
[input
].Allocated
= 1;
212 s
->Input
[input
].File
= RC_FILE_TEMPORARY
;
213 s
->Input
[input
].Index
= hwreg
;
216 /* This functions offsets the temporary register indices by the number
217 * of input registers, because input registers are actually temporaries and
218 * should not occupy the same space.
220 * This pass is supposed to be used to maintain correct allocation of inputs
221 * if the standard register allocation is disabled. */
222 static void do_regalloc_inputs_only(struct regalloc_state
* s
)
224 for (unsigned i
= 0; i
< s
->NumTemporaries
; i
++) {
225 s
->Temporary
[i
].Allocated
= 1;
226 s
->Temporary
[i
].File
= RC_FILE_TEMPORARY
;
227 s
->Temporary
[i
].Index
= i
+ s
->NumInputs
;
231 static unsigned int is_derivative(rc_opcode op
)
233 return (op
== RC_OPCODE_DDX
|| op
== RC_OPCODE_DDY
);
236 static int find_class(
237 struct rc_class
* classes
,
238 unsigned int writemask
,
239 unsigned int max_writemask_count
)
242 for (i
= 0; i
< RC_REG_CLASS_COUNT
; i
++) {
244 if (classes
[i
].WritemaskCount
> max_writemask_count
) {
247 for (j
= 0; j
< 3; j
++) {
248 if (classes
[i
].Writemasks
[j
] == writemask
) {
256 static enum rc_reg_class
variable_get_class(
257 struct rc_variable
* variable
,
258 struct rc_class
* classes
)
261 unsigned int can_change_writemask
= 1;
262 unsigned int writemask
= rc_variable_writemask_sum(variable
);
263 struct rc_list
* readers
= rc_variable_readers_union(variable
);
266 if (!variable
->C
->is_r500
) {
268 /* The assumption here is that if an instruction has type
269 * RC_INSTRUCTION_NORMAL then it is a TEX instruction.
270 * r300 and r400 can't swizzle the result of a TEX lookup. */
271 if (variable
->Inst
->Type
== RC_INSTRUCTION_NORMAL
) {
272 writemask
= RC_MASK_XYZW
;
275 /* Check if it is possible to do swizzle packing for r300/r400
276 * without creating non-native swizzles. */
277 class_index
= find_class(classes
, writemask
, 3);
278 if (class_index
< 0) {
281 c
= classes
[class_index
];
282 for (i
= 0; i
< c
.WritemaskCount
; i
++) {
284 unsigned int conversion_swizzle
=
285 rc_make_conversion_swizzle(
286 writemask
, c
.Writemasks
[i
]);
287 for (j
= 0; j
< variable
->ReaderCount
; j
++) {
288 unsigned int old_swizzle
;
289 unsigned int new_swizzle
;
290 struct rc_reader r
= variable
->Readers
[j
];
291 if (r
.Inst
->Type
== RC_INSTRUCTION_PAIR
) {
292 old_swizzle
= r
.U
.P
.Arg
->Swizzle
;
294 old_swizzle
= r
.U
.I
.Src
->Swizzle
;
296 new_swizzle
= rc_adjust_channels(
297 old_swizzle
, conversion_swizzle
);
298 if (!r300_swizzle_is_native_basic(new_swizzle
)) {
299 can_change_writemask
= 0;
303 if (!can_change_writemask
) {
309 if (variable
->Inst
->Type
== RC_INSTRUCTION_PAIR
) {
310 /* DDX/DDY seem to always fail when their writemasks are
312 if (is_derivative(variable
->Inst
->U
.P
.RGB
.Opcode
)
313 || is_derivative(variable
->Inst
->U
.P
.Alpha
.Opcode
)) {
314 can_change_writemask
= 0;
317 for ( ; readers
; readers
= readers
->Next
) {
318 struct rc_reader
* r
= readers
->Item
;
319 if (r
->Inst
->Type
== RC_INSTRUCTION_PAIR
) {
320 if (r
->U
.P
.Arg
->Source
== RC_PAIR_PRESUB_SRC
) {
321 can_change_writemask
= 0;
324 /* DDX/DDY also fail when their swizzles are changed. */
325 if (is_derivative(r
->Inst
->U
.P
.RGB
.Opcode
)
326 || is_derivative(r
->Inst
->U
.P
.Alpha
.Opcode
)) {
327 can_change_writemask
= 0;
333 class_index
= find_class(classes
, writemask
,
334 can_change_writemask
? 3 : 1);
335 if (class_index
> -1) {
336 return classes
[class_index
].Class
;
339 rc_error(variable
->C
,
340 "Could not find class for index=%u mask=%u\n",
341 variable
->Dst
.Index
, writemask
);
346 static unsigned int overlap_live_intervals_array(
347 struct live_intervals
* a
,
348 struct live_intervals
* b
)
350 unsigned int a_chan
, b_chan
;
351 for (a_chan
= 0; a_chan
< 4; a_chan
++) {
352 for (b_chan
= 0; b_chan
< 4; b_chan
++) {
353 if (overlap_live_intervals(&a
[a_chan
], &b
[b_chan
])) {
361 static unsigned int reg_get_index(int reg
)
363 return reg
/ RC_MASK_XYZW
;
366 static unsigned int reg_get_writemask(int reg
)
368 return (reg
% RC_MASK_XYZW
) + 1;
371 static int get_reg_id(unsigned int index
, unsigned int writemask
)
374 if (writemask
== 0) {
377 return (index
* RC_MASK_XYZW
) + (writemask
- 1);
381 static void print_reg(int reg
)
383 unsigned int index
= reg_get_index(reg
);
384 unsigned int mask
= reg_get_writemask(reg
);
385 fprintf(stderr
, "Temp[%u].%c%c%c%c", index
,
386 mask
& RC_MASK_X
? 'x' : '_',
387 mask
& RC_MASK_Y
? 'y' : '_',
388 mask
& RC_MASK_Z
? 'z' : '_',
389 mask
& RC_MASK_W
? 'w' : '_');
393 static void add_register_conflicts(
394 struct ra_regs
* regs
,
395 unsigned int max_temp_regs
)
397 unsigned int index
, a_mask
, b_mask
;
398 for (index
= 0; index
< max_temp_regs
; index
++) {
399 for(a_mask
= 1; a_mask
<= RC_MASK_XYZW
; a_mask
++) {
400 for (b_mask
= a_mask
+ 1; b_mask
<= RC_MASK_XYZW
;
402 if (a_mask
& b_mask
) {
403 ra_add_reg_conflict(regs
,
404 get_reg_id(index
, a_mask
),
405 get_reg_id(index
, b_mask
));
412 static void do_advanced_regalloc(struct regalloc_state
* s
)
414 struct rc_class rc_class_list
[] = {
415 {RC_REG_CLASS_SINGLE
, 3, 0, 0,
419 {RC_REG_CLASS_DOUBLE
, 3, 0, 0,
420 {RC_MASK_X
| RC_MASK_Y
,
421 RC_MASK_X
| RC_MASK_Z
,
422 RC_MASK_Y
| RC_MASK_Z
}},
423 {RC_REG_CLASS_TRIPLE
, 1, 0, 0,
424 {RC_MASK_X
| RC_MASK_Y
| RC_MASK_Z
,
427 {RC_REG_CLASS_ALPHA
, 1, 0, 0,
431 {RC_REG_CLASS_SINGLE_PLUS_ALPHA
, 3, 0, 0,
432 {RC_MASK_X
| RC_MASK_W
,
433 RC_MASK_Y
| RC_MASK_W
,
434 RC_MASK_Z
| RC_MASK_W
}},
435 {RC_REG_CLASS_DOUBLE_PLUS_ALPHA
, 3, 0, 0,
436 {RC_MASK_X
| RC_MASK_Y
| RC_MASK_W
,
437 RC_MASK_X
| RC_MASK_Z
| RC_MASK_W
,
438 RC_MASK_Y
| RC_MASK_Z
| RC_MASK_W
}},
439 {RC_REG_CLASS_TRIPLE_PLUS_ALPHA
, 1, 0, 0,
440 {RC_MASK_X
| RC_MASK_Y
| RC_MASK_Z
| RC_MASK_W
,
443 {RC_REG_CLASS_X
, 1, 0, 0,
447 {RC_REG_CLASS_Y
, 1, 0, 0,
451 {RC_REG_CLASS_Z
, 1, 0, 0,
455 {RC_REG_CLASS_XY
, 1, 0, 0,
456 {RC_MASK_X
| RC_MASK_Y
,
459 {RC_REG_CLASS_YZ
, 1, 0, 0,
460 {RC_MASK_Y
| RC_MASK_Z
,
463 {RC_REG_CLASS_XZ
, 1, 0, 0,
464 {RC_MASK_X
| RC_MASK_Z
,
467 {RC_REG_CLASS_XW
, 1, 0, 0,
468 {RC_MASK_X
| RC_MASK_W
,
471 {RC_REG_CLASS_YW
, 1, 0, 0,
472 {RC_MASK_Y
| RC_MASK_W
,
475 {RC_REG_CLASS_ZW
, 1, 0, 0,
476 {RC_MASK_Z
| RC_MASK_W
,
479 {RC_REG_CLASS_XYW
, 1, 0, 0,
480 {RC_MASK_X
| RC_MASK_Y
| RC_MASK_W
,
483 {RC_REG_CLASS_YZW
, 1, 0, 0,
484 {RC_MASK_Y
| RC_MASK_Z
| RC_MASK_W
,
487 {RC_REG_CLASS_XZW
, 1, 0, 0,
488 {RC_MASK_X
| RC_MASK_Z
| RC_MASK_W
,
493 unsigned int i
, j
, index
, input_node
, node_count
, node_index
;
494 unsigned int * node_classes
;
495 unsigned int * input_classes
;
496 struct rc_instruction
* inst
;
497 struct rc_list
* var_ptr
;
498 struct rc_list
* variables
;
499 struct ra_regs
* regs
;
500 struct ra_graph
* graph
;
502 /* Allocate the main ra data structure */
503 regs
= ra_alloc_reg_set(s
->C
->max_temp_regs
* RC_MASK_XYZW
);
505 /* Get list of program variables */
506 variables
= rc_get_variables(s
->C
);
507 node_count
= rc_list_count(variables
);
508 node_classes
= memory_pool_malloc(&s
->C
->Pool
,
509 node_count
* sizeof(unsigned int));
510 input_classes
= memory_pool_malloc(&s
->C
->Pool
,
511 s
->NumInputs
* sizeof(unsigned int));
513 for (var_ptr
= variables
, node_index
= 0; var_ptr
;
514 var_ptr
= var_ptr
->Next
, node_index
++) {
515 unsigned int class_index
;
516 /* Compute the live intervals */
517 rc_variable_compute_live_intervals(var_ptr
->Item
);
519 class_index
= variable_get_class(var_ptr
->Item
, rc_class_list
);
521 /* If we haven't used this register class yet, mark it
522 * as used and allocate space for it. */
523 if (!rc_class_list
[class_index
].Used
) {
524 rc_class_list
[class_index
].Used
= 1;
525 rc_class_list
[class_index
].Id
= ra_alloc_reg_class(regs
);
528 node_classes
[node_index
] = rc_class_list
[class_index
].Id
;
532 /* Assign registers to the classes */
533 for (i
= 0; i
< RC_REG_CLASS_COUNT
; i
++) {
534 struct rc_class
class = rc_class_list
[i
];
539 for (index
= 0; index
< s
->C
->max_temp_regs
; index
++) {
540 for (j
= 0; j
< class.WritemaskCount
; j
++) {
541 int reg_id
= get_reg_id(index
,
542 class.Writemasks
[j
]);
543 ra_class_add_reg(regs
, class.Id
, reg_id
);
548 /* Add register conflicts */
549 add_register_conflicts(regs
, s
->C
->max_temp_regs
);
551 /* Calculate live intervals for input registers */
552 for (inst
= s
->C
->Program
.Instructions
.Next
;
553 inst
!= &s
->C
->Program
.Instructions
;
555 rc_opcode op
= rc_get_flow_control_inst(inst
);
556 if (op
== RC_OPCODE_BGNLOOP
) {
557 struct rc_instruction
* endloop
=
558 rc_match_bgnloop(inst
);
559 if (endloop
->IP
> s
->LoopEnd
) {
560 s
->LoopEnd
= endloop
->IP
;
563 rc_for_all_reads_mask(inst
, scan_read_callback
, s
);
566 /* Create classes for input registers */
567 for (i
= 0; i
< s
->NumInputs
; i
++) {
568 unsigned int chan
, class_id
, writemask
= 0;
569 for (chan
= 0; chan
< 4; chan
++) {
570 if (s
->Input
[i
].Live
[chan
].Used
) {
571 writemask
|= (1 << chan
);
574 s
->Input
[i
].Writemask
= writemask
;
579 class_id
= ra_alloc_reg_class(regs
);
580 input_classes
[i
] = class_id
;
581 ra_class_add_reg(regs
, class_id
,
582 get_reg_id(s
->Input
[i
].Index
, writemask
));
585 ra_set_finalize(regs
);
587 graph
= ra_alloc_interference_graph(regs
, node_count
+ s
->NumInputs
);
589 /* Build the interference graph */
590 for (var_ptr
= variables
, node_index
= 0; var_ptr
;
591 var_ptr
= var_ptr
->Next
,node_index
++) {
592 struct rc_list
* a
, * b
;
593 unsigned int b_index
;
595 ra_set_node_class(graph
, node_index
, node_classes
[node_index
]);
597 for (a
= var_ptr
, b
= var_ptr
->Next
, b_index
= node_index
+ 1;
598 b
; b
= b
->Next
, b_index
++) {
599 struct rc_variable
* var_a
= a
->Item
;
601 struct rc_variable
* var_b
= b
->Item
;
603 if (overlap_live_intervals_array(var_a
->Live
, var_b
->Live
)) {
604 ra_add_node_interference(graph
,
605 node_index
, b_index
);
607 var_b
= var_b
->Friend
;
609 var_a
= var_a
->Friend
;
614 /* Add input registers to the interference graph */
615 for (i
= 0, input_node
= 0; i
< s
->NumInputs
; i
++) {
616 if (!s
->Input
[i
].Writemask
) {
619 ra_set_node_class(graph
, node_count
+ input_node
,
621 for (var_ptr
= variables
, node_index
= 0;
622 var_ptr
; var_ptr
= var_ptr
->Next
, node_index
++) {
623 struct rc_variable
* var
= var_ptr
->Item
;
624 if (overlap_live_intervals_array(s
->Input
[i
].Live
,
626 ra_add_node_interference(graph
, node_index
,
627 node_count
+ input_node
);
630 /* Manually allocate a register for this input */
631 ra_set_node_reg(graph
, node_count
+ input_node
, get_reg_id(
632 s
->Input
[i
].Index
, s
->Input
[i
].Writemask
));
636 if (!ra_allocate_no_spills(graph
)) {
637 rc_error(s
->C
, "Ran out of hardware temporaries\n");
641 /* Rewrite the registers */
642 for (var_ptr
= variables
, node_index
= 0; var_ptr
;
643 var_ptr
= var_ptr
->Next
, node_index
++) {
644 int reg
= ra_get_node_reg(graph
, node_index
);
645 unsigned int writemask
= reg_get_writemask(reg
);
646 unsigned int index
= reg_get_index(reg
);
647 struct rc_variable
* var
= var_ptr
->Item
;
649 if (!s
->C
->is_r500
&& var
->Inst
->Type
== RC_INSTRUCTION_NORMAL
) {
650 writemask
= rc_variable_writemask_sum(var
);
653 if (var
->Dst
.File
== RC_FILE_INPUT
) {
656 rc_variable_change_dst(var
, index
, writemask
);
664 * @param user This parameter should be a pointer to an integer value. If this
665 * integer value is zero, then a simple register allocator will be used that
666 * only allocates space for input registers (\sa do_regalloc_inputs_only). If
667 * user is non-zero, then the regular register allocator will be used
670 void rc_pair_regalloc(struct radeon_compiler
*cc
, void *user
)
672 struct r300_fragment_program_compiler
*c
=
673 (struct r300_fragment_program_compiler
*)cc
;
674 struct regalloc_state s
;
675 int * do_full_regalloc
= (int*)user
;
677 memset(&s
, 0, sizeof(s
));
679 s
.NumInputs
= rc_get_max_index(cc
, RC_FILE_INPUT
) + 1;
680 s
.Input
= memory_pool_malloc(&cc
->Pool
,
681 s
.NumInputs
* sizeof(struct register_info
));
682 memset(s
.Input
, 0, s
.NumInputs
* sizeof(struct register_info
));
684 s
.NumTemporaries
= rc_get_max_index(cc
, RC_FILE_TEMPORARY
) + 1;
685 s
.Temporary
= memory_pool_malloc(&cc
->Pool
,
686 s
.NumTemporaries
* sizeof(struct register_info
));
687 memset(s
.Temporary
, 0, s
.NumTemporaries
* sizeof(struct register_info
));
689 rc_recompute_ips(s
.C
);
691 c
->AllocateHwInputs(c
, &alloc_input_simple
, &s
);
692 if (*do_full_regalloc
) {
693 do_advanced_regalloc(&s
);
696 do_regalloc_inputs_only(&s
);
699 /* Rewrite inputs and if we are doing the simple allocation, rewrite
700 * temporaries too. */
701 for (struct rc_instruction
*inst
= s
.C
->Program
.Instructions
.Next
;
702 inst
!= &s
.C
->Program
.Instructions
;
704 rc_remap_registers(inst
, &remap_register
, &s
);