/*
 * Copyright 2014-2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ac_binary.h"
#include "ac_gpu_info.h"
#include "util/u_dynarray.h"
#include "util/u_math.h"

#include <gelf.h>
#include <libelf.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* ELF e_machine value for AMDGPU code objects.
 * Old distributions may not have this enum constant in their elf.h,
 * so define our own copy unconditionally. */
#define MY_EM_AMDGPU 224

#ifndef STT_AMDGPU_LDS
#define STT_AMDGPU_LDS 13 // this is deprecated -- remove
#endif

#ifndef SHN_AMDGPU_LDS
#define SHN_AMDGPU_LDS 0xff00
#endif

/* AMDGPU relocation types, as used in SHT_REL sections below.
 * Guarded so newer system headers that already define them win. */
#ifndef R_AMDGPU_NONE
#define R_AMDGPU_NONE 0
#define R_AMDGPU_ABS32_LO 1
#define R_AMDGPU_ABS32_HI 2
#define R_AMDGPU_ABS64 3
#define R_AMDGPU_REL32 4
#define R_AMDGPU_REL64 5
#define R_AMDGPU_ABS32 6
#define R_AMDGPU_GOTPCREL 7
#define R_AMDGPU_GOTPCREL32_LO 8
#define R_AMDGPU_GOTPCREL32_HI 9
#define R_AMDGPU_REL32_LO 10
#define R_AMDGPU_REL32_HI 11
#define R_AMDGPU_RELATIVE64 13
#endif

/* For the UMR disassembler. */
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS 5
69 struct ac_rtld_section
{
71 bool is_pasted_text
: 1;
78 struct ac_rtld_section
*sections
;
79 unsigned num_sections
;
/* Format an error message with a va_list and print it to stderr with an
 * "ac_rtld error:" prefix. Best-effort: if vasprintf fails, a fixed
 * placeholder message is printed instead. */
static void report_erroraf(const char *fmt, va_list va)
{
   char *msg;
   int ret = vasprintf(&msg, fmt, va);
   if (ret < 0)
      msg = "(vasprintf failed)";

   fprintf(stderr, "ac_rtld error: %s\n", msg);

   /* Only free when vasprintf actually allocated; otherwise msg points at
    * the string literal above. */
   if (ret >= 0)
      free(msg);
}
95 static void report_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
97 static void report_errorf(const char *fmt
, ...)
101 report_erroraf(fmt
, va
);
105 static void report_elf_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
107 static void report_elf_errorf(const char *fmt
, ...)
111 report_erroraf(fmt
, va
);
114 fprintf(stderr
, "ELF error: %s\n", elf_errmsg(elf_errno()));
118 * Find a symbol in a dynarray of struct ac_rtld_symbol by \p name and shader
121 static const struct ac_rtld_symbol
*find_symbol(const struct util_dynarray
*symbols
,
122 const char *name
, unsigned part_idx
)
124 util_dynarray_foreach (symbols
, struct ac_rtld_symbol
, symbol
) {
125 if ((symbol
->part_idx
== ~0u || symbol
->part_idx
== part_idx
) && !strcmp(name
, symbol
->name
))
131 static int compare_symbol_by_align(const void *lhsp
, const void *rhsp
)
133 const struct ac_rtld_symbol
*lhs
= lhsp
;
134 const struct ac_rtld_symbol
*rhs
= rhsp
;
135 if (rhs
->align
> lhs
->align
)
137 if (rhs
->align
< lhs
->align
)
143 * Sort the given symbol list by decreasing alignment and assign offsets.
145 static bool layout_symbols(struct ac_rtld_symbol
*symbols
, unsigned num_symbols
,
146 uint64_t *ptotal_size
)
148 qsort(symbols
, num_symbols
, sizeof(*symbols
), compare_symbol_by_align
);
150 uint64_t total_size
= *ptotal_size
;
152 for (unsigned i
= 0; i
< num_symbols
; ++i
) {
153 struct ac_rtld_symbol
*s
= &symbols
[i
];
154 assert(util_is_power_of_two_nonzero(s
->align
));
156 total_size
= align64(total_size
, s
->align
);
157 s
->offset
= total_size
;
159 if (total_size
+ s
->size
< total_size
) {
160 report_errorf("%s: size overflow", __FUNCTION__
);
164 total_size
+= s
->size
;
167 *ptotal_size
= total_size
;
172 * Read LDS symbols from the given \p section of the ELF of \p part and append
173 * them to the LDS symbols list.
175 * Shared LDS symbols are filtered out.
177 static bool read_private_lds_symbols(struct ac_rtld_binary
*binary
, unsigned part_idx
,
178 Elf_Scn
*section
, uint32_t *lds_end_align
)
180 #define report_if(cond) \
183 report_errorf(#cond); \
187 #define report_elf_if(cond) \
190 report_elf_errorf(#cond); \
195 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
196 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
197 uint32_t strtabidx
= shdr
->sh_link
;
198 Elf_Data
*symbols_data
= elf_getdata(section
, NULL
);
199 report_elf_if(!symbols_data
);
201 const Elf64_Sym
*symbol
= symbols_data
->d_buf
;
202 size_t num_symbols
= symbols_data
->d_size
/ sizeof(Elf64_Sym
);
204 for (size_t j
= 0; j
< num_symbols
; ++j
, ++symbol
) {
205 struct ac_rtld_symbol s
= {};
207 if (ELF64_ST_TYPE(symbol
->st_info
) == STT_AMDGPU_LDS
) {
208 /* old-style LDS symbols from initial prototype -- remove eventually */
209 s
.align
= MIN2(1u << (symbol
->st_other
>> 3), 1u << 16);
210 } else if (symbol
->st_shndx
== SHN_AMDGPU_LDS
) {
211 s
.align
= MIN2(symbol
->st_value
, 1u << 16);
212 report_if(!util_is_power_of_two_nonzero(s
.align
));
216 report_if(symbol
->st_size
> 1u << 29);
218 s
.name
= elf_strptr(part
->elf
, strtabidx
, symbol
->st_name
);
219 s
.size
= symbol
->st_size
;
220 s
.part_idx
= part_idx
;
222 if (!strcmp(s
.name
, "__lds_end")) {
223 report_elf_if(s
.size
!= 0);
224 *lds_end_align
= MAX2(*lds_end_align
, s
.align
);
228 const struct ac_rtld_symbol
*shared
= find_symbol(&binary
->lds_symbols
, s
.name
, part_idx
);
230 report_elf_if(s
.align
> shared
->align
);
231 report_elf_if(s
.size
> shared
->size
);
235 util_dynarray_append(&binary
->lds_symbols
, struct ac_rtld_symbol
, s
);
245 * Open a binary consisting of one or more shader parts.
247 * \param binary the uninitialized struct
248 * \param i binary opening parameters
250 bool ac_rtld_open(struct ac_rtld_binary
*binary
, struct ac_rtld_open_info i
)
252 /* One of the libelf implementations
253 * (http://www.mr511.de/software/english.htm) requires calling
254 * elf_version() before elf_memory().
256 elf_version(EV_CURRENT
);
258 memset(binary
, 0, sizeof(*binary
));
259 memcpy(&binary
->options
, &i
.options
, sizeof(binary
->options
));
260 binary
->wave_size
= i
.wave_size
;
261 binary
->num_parts
= i
.num_parts
;
262 binary
->parts
= calloc(sizeof(*binary
->parts
), i
.num_parts
);
266 uint64_t pasted_text_size
= 0;
267 uint64_t rx_align
= 1;
268 uint64_t rx_size
= 0;
269 uint64_t exec_size
= 0;
271 #define report_if(cond) \
274 report_errorf(#cond); \
278 #define report_elf_if(cond) \
281 report_elf_errorf(#cond); \
286 /* Copy and layout shared LDS symbols. */
287 if (i
.num_shared_lds_symbols
) {
288 if (!util_dynarray_resize(&binary
->lds_symbols
, struct ac_rtld_symbol
,
289 i
.num_shared_lds_symbols
))
292 memcpy(binary
->lds_symbols
.data
, i
.shared_lds_symbols
, binary
->lds_symbols
.size
);
295 util_dynarray_foreach (&binary
->lds_symbols
, struct ac_rtld_symbol
, symbol
)
296 symbol
->part_idx
= ~0u;
298 unsigned max_lds_size
= 64 * 1024;
300 if (i
.info
->chip_class
== GFX6
||
301 (i
.shader_type
!= MESA_SHADER_COMPUTE
&& i
.shader_type
!= MESA_SHADER_FRAGMENT
))
302 max_lds_size
= 32 * 1024;
304 uint64_t shared_lds_size
= 0;
305 if (!layout_symbols(binary
->lds_symbols
.data
, i
.num_shared_lds_symbols
, &shared_lds_size
))
308 if (shared_lds_size
> max_lds_size
) {
309 fprintf(stderr
, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
310 (unsigned)shared_lds_size
, max_lds_size
);
313 binary
->lds_size
= shared_lds_size
;
315 /* First pass over all parts: open ELFs, pre-determine the placement of
316 * sections in the memory image, and collect and layout private LDS symbols. */
317 uint32_t lds_end_align
= 0;
319 if (binary
->options
.halt_at_entry
)
320 pasted_text_size
+= 4;
322 for (unsigned part_idx
= 0; part_idx
< i
.num_parts
; ++part_idx
) {
323 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
324 unsigned part_lds_symbols_begin
=
325 util_dynarray_num_elements(&binary
->lds_symbols
, struct ac_rtld_symbol
);
327 part
->elf
= elf_memory((char *)i
.elf_ptrs
[part_idx
], i
.elf_sizes
[part_idx
]);
328 report_elf_if(!part
->elf
);
330 const Elf64_Ehdr
*ehdr
= elf64_getehdr(part
->elf
);
331 report_elf_if(!ehdr
);
332 report_if(ehdr
->e_machine
!= MY_EM_AMDGPU
);
334 size_t section_str_index
;
336 report_elf_if(elf_getshdrstrndx(part
->elf
, §ion_str_index
) < 0);
337 report_elf_if(elf_getshdrnum(part
->elf
, &num_shdrs
) < 0);
339 part
->num_sections
= num_shdrs
;
340 part
->sections
= calloc(sizeof(*part
->sections
), num_shdrs
);
341 report_if(!part
->sections
);
343 Elf_Scn
*section
= NULL
;
344 while ((section
= elf_nextscn(part
->elf
, section
))) {
345 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
346 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
347 s
->name
= elf_strptr(part
->elf
, section_str_index
, shdr
->sh_name
);
348 report_elf_if(!s
->name
);
350 /* Cannot actually handle linked objects yet */
351 report_elf_if(shdr
->sh_addr
!= 0);
353 /* Alignment must be 0 or a power of two */
354 report_elf_if(shdr
->sh_addralign
& (shdr
->sh_addralign
- 1));
355 uint64_t sh_align
= MAX2(shdr
->sh_addralign
, 1);
357 if (shdr
->sh_flags
& SHF_ALLOC
&& shdr
->sh_type
!= SHT_NOTE
) {
358 report_if(shdr
->sh_flags
& SHF_WRITE
);
362 if (shdr
->sh_flags
& SHF_EXECINSTR
) {
363 report_elf_if(shdr
->sh_size
& 3);
365 if (!strcmp(s
->name
, ".text"))
366 s
->is_pasted_text
= true;
368 exec_size
+= shdr
->sh_size
;
371 if (s
->is_pasted_text
) {
372 s
->offset
= pasted_text_size
;
373 pasted_text_size
+= shdr
->sh_size
;
375 rx_align
= align(rx_align
, sh_align
);
376 rx_size
= align(rx_size
, sh_align
);
378 rx_size
+= shdr
->sh_size
;
380 } else if (shdr
->sh_type
== SHT_SYMTAB
) {
381 if (!read_private_lds_symbols(binary
, part_idx
, section
, &lds_end_align
))
386 uint64_t part_lds_size
= shared_lds_size
;
387 if (!layout_symbols(util_dynarray_element(&binary
->lds_symbols
, struct ac_rtld_symbol
,
388 part_lds_symbols_begin
),
389 util_dynarray_num_elements(&binary
->lds_symbols
, struct ac_rtld_symbol
) -
390 part_lds_symbols_begin
,
393 binary
->lds_size
= MAX2(binary
->lds_size
, part_lds_size
);
396 binary
->rx_end_markers
= pasted_text_size
;
397 pasted_text_size
+= 4 * DEBUGGER_NUM_MARKERS
;
399 /* __lds_end is a special symbol that points at the end of the memory
400 * occupied by other LDS symbols. Its alignment is taken as the
401 * maximum of its alignment over all shader parts where it occurs.
404 binary
->lds_size
= align(binary
->lds_size
, lds_end_align
);
406 struct ac_rtld_symbol
*lds_end
=
407 util_dynarray_grow(&binary
->lds_symbols
, struct ac_rtld_symbol
, 1);
408 lds_end
->name
= "__lds_end";
410 lds_end
->align
= lds_end_align
;
411 lds_end
->offset
= binary
->lds_size
;
412 lds_end
->part_idx
= ~0u;
415 if (binary
->lds_size
> max_lds_size
) {
416 fprintf(stderr
, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
417 (unsigned)binary
->lds_size
, max_lds_size
);
421 /* Second pass: Adjust offsets of non-pasted text sections. */
422 binary
->rx_size
= pasted_text_size
;
423 binary
->rx_size
= align(binary
->rx_size
, rx_align
);
425 for (unsigned part_idx
= 0; part_idx
< i
.num_parts
; ++part_idx
) {
426 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
428 elf_getshdrnum(part
->elf
, &num_shdrs
);
430 for (unsigned j
= 0; j
< num_shdrs
; ++j
) {
431 struct ac_rtld_section
*s
= &part
->sections
[j
];
432 if (s
->is_rx
&& !s
->is_pasted_text
)
433 s
->offset
+= binary
->rx_size
;
437 binary
->rx_size
+= rx_size
;
438 binary
->exec_size
= exec_size
;
440 if (i
.info
->chip_class
>= GFX10
) {
441 /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
442 * ahead of the PC, configurable by SH_MEM_CONFIG and
443 * S_INST_PREFETCH. This can cause two issues:
445 * (1) Crossing a page boundary to an unmapped page. The logic
446 * does not distinguish between a required fetch and a "mere"
447 * prefetch and will fault.
449 * (2) Prefetching instructions that will be changed for a
452 * (2) is not currently an issue because we flush the I$ at IB
453 * boundaries, but (1) needs to be addressed. Due to buffer
454 * suballocation, we just play it safe.
456 binary
->rx_size
= align(binary
->rx_size
+ 3 * 64, 64);
465 ac_rtld_close(binary
);
469 void ac_rtld_close(struct ac_rtld_binary
*binary
)
471 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
472 struct ac_rtld_part
*part
= &binary
->parts
[i
];
473 free(part
->sections
);
477 util_dynarray_fini(&binary
->lds_symbols
);
479 binary
->parts
= NULL
;
480 binary
->num_parts
= 0;
483 static bool get_section_by_name(struct ac_rtld_part
*part
, const char *name
, const char **data
,
486 for (unsigned i
= 0; i
< part
->num_sections
; ++i
) {
487 struct ac_rtld_section
*s
= &part
->sections
[i
];
488 if (s
->name
&& !strcmp(name
, s
->name
)) {
489 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, i
);
490 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
492 report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
496 *data
= target_data
->d_buf
;
497 *nbytes
= target_data
->d_size
;
504 bool ac_rtld_get_section_by_name(struct ac_rtld_binary
*binary
, const char *name
, const char **data
,
507 assert(binary
->num_parts
== 1);
508 return get_section_by_name(&binary
->parts
[0], name
, data
, nbytes
);
511 bool ac_rtld_read_config(const struct radeon_info
*info
, struct ac_rtld_binary
*binary
,
512 struct ac_shader_config
*config
)
514 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
515 struct ac_rtld_part
*part
= &binary
->parts
[i
];
516 const char *config_data
;
517 size_t config_nbytes
;
519 if (!get_section_by_name(part
, ".AMDGPU.config", &config_data
, &config_nbytes
))
522 /* TODO: be precise about scratch use? */
523 struct ac_shader_config c
= {};
524 ac_parse_shader_binary_config(config_data
, config_nbytes
, binary
->wave_size
, true, info
, &c
);
526 config
->num_sgprs
= MAX2(config
->num_sgprs
, c
.num_sgprs
);
527 config
->num_vgprs
= MAX2(config
->num_vgprs
, c
.num_vgprs
);
528 config
->spilled_sgprs
= MAX2(config
->spilled_sgprs
, c
.spilled_sgprs
);
529 config
->spilled_vgprs
= MAX2(config
->spilled_vgprs
, c
.spilled_vgprs
);
530 config
->scratch_bytes_per_wave
=
531 MAX2(config
->scratch_bytes_per_wave
, c
.scratch_bytes_per_wave
);
533 assert(i
== 0 || config
->float_mode
== c
.float_mode
);
534 config
->float_mode
= c
.float_mode
;
536 /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
537 * the main shader part is used. */
538 assert(config
->spi_ps_input_ena
== 0 && config
->spi_ps_input_addr
== 0);
539 config
->spi_ps_input_ena
= c
.spi_ps_input_ena
;
540 config
->spi_ps_input_addr
= c
.spi_ps_input_addr
;
542 /* TODO: consistently use LDS symbols for this */
543 config
->lds_size
= MAX2(config
->lds_size
, c
.lds_size
);
545 /* TODO: Should we combine these somehow? It's currently only
546 * used for radeonsi's compute, where multiple parts aren't used. */
547 assert(config
->rsrc1
== 0 && config
->rsrc2
== 0);
548 config
->rsrc1
= c
.rsrc1
;
549 config
->rsrc2
= c
.rsrc2
;
555 static bool resolve_symbol(const struct ac_rtld_upload_info
*u
, unsigned part_idx
,
556 const Elf64_Sym
*sym
, const char *name
, uint64_t *value
)
558 /* TODO: properly disentangle the undef and the LDS cases once
559 * STT_AMDGPU_LDS is retired. */
560 if (sym
->st_shndx
== SHN_UNDEF
|| sym
->st_shndx
== SHN_AMDGPU_LDS
) {
561 const struct ac_rtld_symbol
*lds_sym
= find_symbol(&u
->binary
->lds_symbols
, name
, part_idx
);
564 *value
= lds_sym
->offset
;
568 /* TODO: resolve from other parts */
570 if (u
->get_external_symbol(u
->cb_data
, name
, value
))
573 report_errorf("symbol %s: unknown", name
);
577 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
578 if (sym
->st_shndx
>= part
->num_sections
) {
579 report_errorf("symbol %s: section out of bounds", name
);
583 struct ac_rtld_section
*s
= &part
->sections
[sym
->st_shndx
];
585 report_errorf("symbol %s: bad section", name
);
589 uint64_t section_base
= u
->rx_va
+ s
->offset
;
591 *value
= section_base
+ sym
->st_value
;
595 static bool apply_relocs(const struct ac_rtld_upload_info
*u
, unsigned part_idx
,
596 const Elf64_Shdr
*reloc_shdr
, const Elf_Data
*reloc_data
)
598 #define report_if(cond) \
601 report_errorf(#cond); \
605 #define report_elf_if(cond) \
608 report_elf_errorf(#cond); \
613 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
614 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_info
);
615 report_elf_if(!target_scn
);
617 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
618 report_elf_if(!target_data
);
620 Elf_Scn
*symbols_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_link
);
621 report_elf_if(!symbols_scn
);
623 Elf64_Shdr
*symbols_shdr
= elf64_getshdr(symbols_scn
);
624 report_elf_if(!symbols_shdr
);
625 uint32_t strtabidx
= symbols_shdr
->sh_link
;
627 Elf_Data
*symbols_data
= elf_getdata(symbols_scn
, NULL
);
628 report_elf_if(!symbols_data
);
630 const Elf64_Sym
*symbols
= symbols_data
->d_buf
;
631 size_t num_symbols
= symbols_data
->d_size
/ sizeof(Elf64_Sym
);
633 struct ac_rtld_section
*s
= &part
->sections
[reloc_shdr
->sh_info
];
634 report_if(!s
->is_rx
);
636 const char *orig_base
= target_data
->d_buf
;
637 char *dst_base
= u
->rx_ptr
+ s
->offset
;
638 uint64_t va_base
= u
->rx_va
+ s
->offset
;
640 Elf64_Rel
*rel
= reloc_data
->d_buf
;
641 size_t num_relocs
= reloc_data
->d_size
/ sizeof(*rel
);
642 for (size_t i
= 0; i
< num_relocs
; ++i
, ++rel
) {
643 size_t r_sym
= ELF64_R_SYM(rel
->r_info
);
644 unsigned r_type
= ELF64_R_TYPE(rel
->r_info
);
646 const char *orig_ptr
= orig_base
+ rel
->r_offset
;
647 char *dst_ptr
= dst_base
+ rel
->r_offset
;
648 uint64_t va
= va_base
+ rel
->r_offset
;
653 if (r_sym
== STN_UNDEF
) {
656 report_elf_if(r_sym
>= num_symbols
);
658 const Elf64_Sym
*sym
= &symbols
[r_sym
];
659 const char *symbol_name
= elf_strptr(part
->elf
, strtabidx
, sym
->st_name
);
660 report_elf_if(!symbol_name
);
662 if (!resolve_symbol(u
, part_idx
, sym
, symbol_name
, &symbol
))
666 /* TODO: Should we also support .rela sections, where the
667 * addend is part of the relocation record? */
669 /* Load the addend from the ELF instead of the destination,
670 * because the destination may be in VRAM. */
673 case R_AMDGPU_ABS32_LO
:
674 case R_AMDGPU_ABS32_HI
:
676 case R_AMDGPU_REL32_LO
:
677 case R_AMDGPU_REL32_HI
:
678 addend
= *(const uint32_t *)orig_ptr
;
682 addend
= *(const uint64_t *)orig_ptr
;
685 report_errorf("unsupported r_type == %u", r_type
);
689 uint64_t abs
= symbol
+ addend
;
693 assert((uint32_t)abs
== abs
);
694 case R_AMDGPU_ABS32_LO
:
695 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
);
697 case R_AMDGPU_ABS32_HI
:
698 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
>> 32);
701 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
);
704 assert((int64_t)(int32_t)(abs
- va
) == (int64_t)(abs
- va
));
705 case R_AMDGPU_REL32_LO
:
706 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
- va
);
708 case R_AMDGPU_REL32_HI
:
709 *(uint32_t *)dst_ptr
= util_cpu_to_le32((abs
- va
) >> 32);
712 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
- va
);
715 unreachable("bad r_type");
726 * Upload the binary or binaries to the provided GPU buffers, including
729 bool ac_rtld_upload(struct ac_rtld_upload_info
*u
)
731 #define report_if(cond) \
734 report_errorf(#cond); \
738 #define report_elf_if(cond) \
741 report_errorf(#cond); \
746 if (u
->binary
->options
.halt_at_entry
) {
748 *(uint32_t *)u
->rx_ptr
= util_cpu_to_le32(0xbf8d0001);
751 /* First pass: upload raw section data and lay out private LDS symbols. */
752 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
753 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
755 Elf_Scn
*section
= NULL
;
756 while ((section
= elf_nextscn(part
->elf
, section
))) {
757 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
758 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
763 report_if(shdr
->sh_type
!= SHT_PROGBITS
);
765 Elf_Data
*data
= elf_getdata(section
, NULL
);
766 report_elf_if(!data
|| data
->d_size
!= shdr
->sh_size
);
767 memcpy(u
->rx_ptr
+ s
->offset
, data
->d_buf
, shdr
->sh_size
);
771 if (u
->binary
->rx_end_markers
) {
772 uint32_t *dst
= (uint32_t *)(u
->rx_ptr
+ u
->binary
->rx_end_markers
);
773 for (unsigned i
= 0; i
< DEBUGGER_NUM_MARKERS
; ++i
)
774 *dst
++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER
);
777 /* Second pass: handle relocations, overwriting uploaded data where
779 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
780 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
781 Elf_Scn
*section
= NULL
;
782 while ((section
= elf_nextscn(part
->elf
, section
))) {
783 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
784 if (shdr
->sh_type
== SHT_REL
) {
785 Elf_Data
*relocs
= elf_getdata(section
, NULL
);
786 report_elf_if(!relocs
|| relocs
->d_size
!= shdr
->sh_size
);
787 if (!apply_relocs(u
, i
, shdr
, relocs
))
789 } else if (shdr
->sh_type
== SHT_RELA
) {
790 report_errorf("SHT_RELA not supported");