/*
 * Copyright 2014-2019 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
33 #include "ac_binary.h"
34 #include "ac_gpu_info.h"
35 #include "util/u_dynarray.h"
36 #include "util/u_math.h"
/* Old distributions may not have this enum constant (EM_AMDGPU). */
#define MY_EM_AMDGPU 224

#ifndef STT_AMDGPU_LDS
#define STT_AMDGPU_LDS 13 /* deprecated -- remove eventually */
#endif

#ifndef SHN_AMDGPU_LDS
#define SHN_AMDGPU_LDS 0xff00
#endif

/* AMDGPU ELF relocation types. */
#define R_AMDGPU_NONE 0
#define R_AMDGPU_ABS32_LO 1
#define R_AMDGPU_ABS32_HI 2
#define R_AMDGPU_ABS64 3
#define R_AMDGPU_REL32 4
#define R_AMDGPU_REL64 5
#define R_AMDGPU_ABS32 6
#define R_AMDGPU_GOTPCREL 7
#define R_AMDGPU_GOTPCREL32_LO 8
#define R_AMDGPU_GOTPCREL32_HI 9
#define R_AMDGPU_REL32_LO 10
#define R_AMDGPU_REL32_HI 11
#define R_AMDGPU_RELATIVE64 13

/* For the UMR disassembler. */
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS 5
69 struct ac_rtld_section
{
71 bool is_pasted_text
: 1;
78 struct ac_rtld_section
*sections
;
79 unsigned num_sections
;
/* Print a printf-style error message to stderr, prefixed with
 * "ac_rtld error: ".
 *
 * \param fmt printf-style format string
 * \param va  arguments for \p fmt
 *
 * Fix: the buffer allocated by vasprintf() was not released after
 * printing; free it, but only when vasprintf succeeded (on failure
 * \c msg points at a string literal and must not be freed). */
static void report_erroraf(const char *fmt, va_list va)
{
	char *msg;
	int ret = vasprintf(&msg, fmt, va);
	if (ret < 0)
		msg = "(vasprintf failed)";

	fprintf(stderr, "ac_rtld error: %s\n", msg);

	if (ret >= 0)
		free(msg);
}
95 static void report_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
97 static void report_errorf(const char *fmt
, ...)
101 report_erroraf(fmt
, va
);
105 static void report_elf_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
107 static void report_elf_errorf(const char *fmt
, ...)
111 report_erroraf(fmt
, va
);
114 fprintf(stderr
, "ELF error: %s\n", elf_errmsg(elf_errno()));
118 * Find a symbol in a dynarray of struct ac_rtld_symbol by \p name and shader
121 static const struct ac_rtld_symbol
*find_symbol(const struct util_dynarray
*symbols
,
122 const char *name
, unsigned part_idx
)
124 util_dynarray_foreach(symbols
, struct ac_rtld_symbol
, symbol
) {
125 if ((symbol
->part_idx
== ~0u || symbol
->part_idx
== part_idx
) &&
126 !strcmp(name
, symbol
->name
))
132 static int compare_symbol_by_align(const void *lhsp
, const void *rhsp
)
134 const struct ac_rtld_symbol
*lhs
= lhsp
;
135 const struct ac_rtld_symbol
*rhs
= rhsp
;
136 if (rhs
->align
> lhs
->align
)
138 if (rhs
->align
< lhs
->align
)
144 * Sort the given symbol list by decreasing alignment and assign offsets.
146 static bool layout_symbols(struct ac_rtld_symbol
*symbols
, unsigned num_symbols
,
147 uint64_t *ptotal_size
)
149 qsort(symbols
, num_symbols
, sizeof(*symbols
), compare_symbol_by_align
);
151 uint64_t total_size
= *ptotal_size
;
153 for (unsigned i
= 0; i
< num_symbols
; ++i
) {
154 struct ac_rtld_symbol
*s
= &symbols
[i
];
155 assert(util_is_power_of_two_nonzero(s
->align
));
157 total_size
= align64(total_size
, s
->align
);
158 s
->offset
= total_size
;
160 if (total_size
+ s
->size
< total_size
) {
161 report_errorf("%s: size overflow", __FUNCTION__
);
165 total_size
+= s
->size
;
168 *ptotal_size
= total_size
;
173 * Read LDS symbols from the given \p section of the ELF of \p part and append
174 * them to the LDS symbols list.
176 * Shared LDS symbols are filtered out.
178 static bool read_private_lds_symbols(struct ac_rtld_binary
*binary
,
181 uint32_t *lds_end_align
)
183 #define report_if(cond) \
186 report_errorf(#cond); \
190 #define report_elf_if(cond) \
193 report_elf_errorf(#cond); \
198 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
199 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
200 uint32_t strtabidx
= shdr
->sh_link
;
201 Elf_Data
*symbols_data
= elf_getdata(section
, NULL
);
202 report_elf_if(!symbols_data
);
204 const Elf64_Sym
*symbol
= symbols_data
->d_buf
;
205 size_t num_symbols
= symbols_data
->d_size
/ sizeof(Elf64_Sym
);
207 for (size_t j
= 0; j
< num_symbols
; ++j
, ++symbol
) {
208 struct ac_rtld_symbol s
= {};
210 if (ELF64_ST_TYPE(symbol
->st_info
) == STT_AMDGPU_LDS
) {
211 /* old-style LDS symbols from initial prototype -- remove eventually */
212 s
.align
= MIN2(1u << (symbol
->st_other
>> 3), 1u << 16);
213 } else if (symbol
->st_shndx
== SHN_AMDGPU_LDS
) {
214 s
.align
= MIN2(symbol
->st_value
, 1u << 16);
215 report_if(!util_is_power_of_two_nonzero(s
.align
));
219 report_if(symbol
->st_size
> 1u << 29);
221 s
.name
= elf_strptr(part
->elf
, strtabidx
, symbol
->st_name
);
222 s
.size
= symbol
->st_size
;
223 s
.part_idx
= part_idx
;
225 if (!strcmp(s
.name
, "__lds_end")) {
226 report_elf_if(s
.size
!= 0);
227 *lds_end_align
= MAX2(*lds_end_align
, s
.align
);
231 const struct ac_rtld_symbol
*shared
=
232 find_symbol(&binary
->lds_symbols
, s
.name
, part_idx
);
234 report_elf_if(s
.align
> shared
->align
);
235 report_elf_if(s
.size
> shared
->size
);
239 util_dynarray_append(&binary
->lds_symbols
, struct ac_rtld_symbol
, s
);
249 * Open a binary consisting of one or more shader parts.
251 * \param binary the uninitialized struct
252 * \param i binary opening parameters
254 bool ac_rtld_open(struct ac_rtld_binary
*binary
,
255 struct ac_rtld_open_info i
)
257 /* One of the libelf implementations
258 * (http://www.mr511.de/software/english.htm) requires calling
259 * elf_version() before elf_memory().
261 elf_version(EV_CURRENT
);
263 memset(binary
, 0, sizeof(*binary
));
264 memcpy(&binary
->options
, &i
.options
, sizeof(binary
->options
));
265 binary
->wave_size
= i
.wave_size
;
266 binary
->num_parts
= i
.num_parts
;
267 binary
->parts
= calloc(sizeof(*binary
->parts
), i
.num_parts
);
271 uint64_t pasted_text_size
= 0;
272 uint64_t rx_align
= 1;
273 uint64_t rx_size
= 0;
274 uint64_t exec_size
= 0;
276 #define report_if(cond) \
279 report_errorf(#cond); \
283 #define report_elf_if(cond) \
286 report_elf_errorf(#cond); \
291 /* Copy and layout shared LDS symbols. */
292 if (i
.num_shared_lds_symbols
) {
293 if (!util_dynarray_resize(&binary
->lds_symbols
, struct ac_rtld_symbol
,
294 i
.num_shared_lds_symbols
))
297 memcpy(binary
->lds_symbols
.data
, i
.shared_lds_symbols
, binary
->lds_symbols
.size
);
300 util_dynarray_foreach(&binary
->lds_symbols
, struct ac_rtld_symbol
, symbol
)
301 symbol
->part_idx
= ~0u;
303 unsigned max_lds_size
= 64 * 1024;
305 if (i
.info
->chip_class
== GFX6
||
306 (i
.shader_type
!= MESA_SHADER_COMPUTE
&&
307 i
.shader_type
!= MESA_SHADER_FRAGMENT
))
308 max_lds_size
= 32 * 1024;
310 uint64_t shared_lds_size
= 0;
311 if (!layout_symbols(binary
->lds_symbols
.data
, i
.num_shared_lds_symbols
, &shared_lds_size
))
314 if (shared_lds_size
> max_lds_size
) {
315 fprintf(stderr
, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
316 (unsigned)shared_lds_size
, max_lds_size
);
319 binary
->lds_size
= shared_lds_size
;
321 /* First pass over all parts: open ELFs, pre-determine the placement of
322 * sections in the memory image, and collect and layout private LDS symbols. */
323 uint32_t lds_end_align
= 0;
325 if (binary
->options
.halt_at_entry
)
326 pasted_text_size
+= 4;
328 for (unsigned part_idx
= 0; part_idx
< i
.num_parts
; ++part_idx
) {
329 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
330 unsigned part_lds_symbols_begin
=
331 util_dynarray_num_elements(&binary
->lds_symbols
, struct ac_rtld_symbol
);
333 part
->elf
= elf_memory((char *)i
.elf_ptrs
[part_idx
], i
.elf_sizes
[part_idx
]);
334 report_elf_if(!part
->elf
);
336 const Elf64_Ehdr
*ehdr
= elf64_getehdr(part
->elf
);
337 report_elf_if(!ehdr
);
338 report_if(ehdr
->e_machine
!= MY_EM_AMDGPU
);
340 size_t section_str_index
;
342 report_elf_if(elf_getshdrstrndx(part
->elf
, §ion_str_index
) < 0);
343 report_elf_if(elf_getshdrnum(part
->elf
, &num_shdrs
) < 0);
345 part
->num_sections
= num_shdrs
;
346 part
->sections
= calloc(sizeof(*part
->sections
), num_shdrs
);
347 report_if(!part
->sections
);
349 Elf_Scn
*section
= NULL
;
350 while ((section
= elf_nextscn(part
->elf
, section
))) {
351 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
352 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
353 s
->name
= elf_strptr(part
->elf
, section_str_index
, shdr
->sh_name
);
354 report_elf_if(!s
->name
);
356 /* Cannot actually handle linked objects yet */
357 report_elf_if(shdr
->sh_addr
!= 0);
359 /* Alignment must be 0 or a power of two */
360 report_elf_if(shdr
->sh_addralign
& (shdr
->sh_addralign
- 1));
361 uint64_t sh_align
= MAX2(shdr
->sh_addralign
, 1);
363 if (shdr
->sh_flags
& SHF_ALLOC
&&
364 shdr
->sh_type
!= SHT_NOTE
) {
365 report_if(shdr
->sh_flags
& SHF_WRITE
);
369 if (shdr
->sh_flags
& SHF_EXECINSTR
) {
370 report_elf_if(shdr
->sh_size
& 3);
372 if (!strcmp(s
->name
, ".text"))
373 s
->is_pasted_text
= true;
375 exec_size
+= shdr
->sh_size
;
378 if (s
->is_pasted_text
) {
379 s
->offset
= pasted_text_size
;
380 pasted_text_size
+= shdr
->sh_size
;
382 rx_align
= align(rx_align
, sh_align
);
383 rx_size
= align(rx_size
, sh_align
);
385 rx_size
+= shdr
->sh_size
;
387 } else if (shdr
->sh_type
== SHT_SYMTAB
) {
388 if (!read_private_lds_symbols(binary
, part_idx
, section
, &lds_end_align
))
393 uint64_t part_lds_size
= shared_lds_size
;
395 util_dynarray_element(&binary
->lds_symbols
, struct ac_rtld_symbol
, part_lds_symbols_begin
),
396 util_dynarray_num_elements(&binary
->lds_symbols
, struct ac_rtld_symbol
) - part_lds_symbols_begin
,
399 binary
->lds_size
= MAX2(binary
->lds_size
, part_lds_size
);
402 binary
->rx_end_markers
= pasted_text_size
;
403 pasted_text_size
+= 4 * DEBUGGER_NUM_MARKERS
;
405 /* __lds_end is a special symbol that points at the end of the memory
406 * occupied by other LDS symbols. Its alignment is taken as the
407 * maximum of its alignment over all shader parts where it occurs.
410 binary
->lds_size
= align(binary
->lds_size
, lds_end_align
);
412 struct ac_rtld_symbol
*lds_end
=
413 util_dynarray_grow(&binary
->lds_symbols
, struct ac_rtld_symbol
, 1);
414 lds_end
->name
= "__lds_end";
416 lds_end
->align
= lds_end_align
;
417 lds_end
->offset
= binary
->lds_size
;
418 lds_end
->part_idx
= ~0u;
421 if (binary
->lds_size
> max_lds_size
) {
422 fprintf(stderr
, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
423 (unsigned)binary
->lds_size
, max_lds_size
);
427 /* Second pass: Adjust offsets of non-pasted text sections. */
428 binary
->rx_size
= pasted_text_size
;
429 binary
->rx_size
= align(binary
->rx_size
, rx_align
);
431 for (unsigned part_idx
= 0; part_idx
< i
.num_parts
; ++part_idx
) {
432 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
434 elf_getshdrnum(part
->elf
, &num_shdrs
);
436 for (unsigned j
= 0; j
< num_shdrs
; ++j
) {
437 struct ac_rtld_section
*s
= &part
->sections
[j
];
438 if (s
->is_rx
&& !s
->is_pasted_text
)
439 s
->offset
+= binary
->rx_size
;
443 binary
->rx_size
+= rx_size
;
444 binary
->exec_size
= exec_size
;
446 if (i
.info
->chip_class
>= GFX10
) {
447 /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
448 * ahead of the PC, configurable by SH_MEM_CONFIG and
449 * S_INST_PREFETCH. This can cause two issues:
451 * (1) Crossing a page boundary to an unmapped page. The logic
452 * does not distinguish between a required fetch and a "mere"
453 * prefetch and will fault.
455 * (2) Prefetching instructions that will be changed for a
458 * (2) is not currently an issue because we flush the I$ at IB
459 * boundaries, but (1) needs to be addressed. Due to buffer
460 * suballocation, we just play it safe.
462 binary
->rx_size
= align(binary
->rx_size
+ 3 * 64, 64);
471 ac_rtld_close(binary
);
475 void ac_rtld_close(struct ac_rtld_binary
*binary
)
477 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
478 struct ac_rtld_part
*part
= &binary
->parts
[i
];
479 free(part
->sections
);
483 util_dynarray_fini(&binary
->lds_symbols
);
485 binary
->parts
= NULL
;
486 binary
->num_parts
= 0;
489 static bool get_section_by_name(struct ac_rtld_part
*part
, const char *name
,
490 const char **data
, size_t *nbytes
)
492 for (unsigned i
= 0; i
< part
->num_sections
; ++i
) {
493 struct ac_rtld_section
*s
= &part
->sections
[i
];
494 if (s
->name
&& !strcmp(name
, s
->name
)) {
495 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, i
);
496 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
498 report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
502 *data
= target_data
->d_buf
;
503 *nbytes
= target_data
->d_size
;
510 bool ac_rtld_get_section_by_name(struct ac_rtld_binary
*binary
, const char *name
,
511 const char **data
, size_t *nbytes
)
513 assert(binary
->num_parts
== 1);
514 return get_section_by_name(&binary
->parts
[0], name
, data
, nbytes
);
517 bool ac_rtld_read_config(struct ac_rtld_binary
*binary
,
518 struct ac_shader_config
*config
)
520 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
521 struct ac_rtld_part
*part
= &binary
->parts
[i
];
522 const char *config_data
;
523 size_t config_nbytes
;
525 if (!get_section_by_name(part
, ".AMDGPU.config",
526 &config_data
, &config_nbytes
))
529 /* TODO: be precise about scratch use? */
530 struct ac_shader_config c
= {};
531 ac_parse_shader_binary_config(config_data
, config_nbytes
,
532 binary
->wave_size
, true, &c
);
534 config
->num_sgprs
= MAX2(config
->num_sgprs
, c
.num_sgprs
);
535 config
->num_vgprs
= MAX2(config
->num_vgprs
, c
.num_vgprs
);
536 config
->spilled_sgprs
= MAX2(config
->spilled_sgprs
, c
.spilled_sgprs
);
537 config
->spilled_vgprs
= MAX2(config
->spilled_vgprs
, c
.spilled_vgprs
);
538 config
->scratch_bytes_per_wave
= MAX2(config
->scratch_bytes_per_wave
,
539 c
.scratch_bytes_per_wave
);
541 assert(i
== 0 || config
->float_mode
== c
.float_mode
);
542 config
->float_mode
= c
.float_mode
;
544 /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
545 * the main shader part is used. */
546 assert(config
->spi_ps_input_ena
== 0 &&
547 config
->spi_ps_input_addr
== 0);
548 config
->spi_ps_input_ena
= c
.spi_ps_input_ena
;
549 config
->spi_ps_input_addr
= c
.spi_ps_input_addr
;
551 /* TODO: consistently use LDS symbols for this */
552 config
->lds_size
= MAX2(config
->lds_size
, c
.lds_size
);
554 /* TODO: Should we combine these somehow? It's currently only
555 * used for radeonsi's compute, where multiple parts aren't used. */
556 assert(config
->rsrc1
== 0 && config
->rsrc2
== 0);
557 config
->rsrc1
= c
.rsrc1
;
558 config
->rsrc2
= c
.rsrc2
;
564 static bool resolve_symbol(const struct ac_rtld_upload_info
*u
,
565 unsigned part_idx
, const Elf64_Sym
*sym
,
566 const char *name
, uint64_t *value
)
568 /* TODO: properly disentangle the undef and the LDS cases once
569 * STT_AMDGPU_LDS is retired. */
570 if (sym
->st_shndx
== SHN_UNDEF
|| sym
->st_shndx
== SHN_AMDGPU_LDS
) {
571 const struct ac_rtld_symbol
*lds_sym
=
572 find_symbol(&u
->binary
->lds_symbols
, name
, part_idx
);
575 *value
= lds_sym
->offset
;
579 /* TODO: resolve from other parts */
581 if (u
->get_external_symbol(u
->cb_data
, name
, value
))
584 report_errorf("symbol %s: unknown", name
);
588 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
589 if (sym
->st_shndx
>= part
->num_sections
) {
590 report_errorf("symbol %s: section out of bounds", name
);
594 struct ac_rtld_section
*s
= &part
->sections
[sym
->st_shndx
];
596 report_errorf("symbol %s: bad section", name
);
600 uint64_t section_base
= u
->rx_va
+ s
->offset
;
602 *value
= section_base
+ sym
->st_value
;
606 static bool apply_relocs(const struct ac_rtld_upload_info
*u
,
607 unsigned part_idx
, const Elf64_Shdr
*reloc_shdr
,
608 const Elf_Data
*reloc_data
)
610 #define report_if(cond) \
613 report_errorf(#cond); \
617 #define report_elf_if(cond) \
620 report_elf_errorf(#cond); \
625 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
626 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_info
);
627 report_elf_if(!target_scn
);
629 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
630 report_elf_if(!target_data
);
632 Elf_Scn
*symbols_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_link
);
633 report_elf_if(!symbols_scn
);
635 Elf64_Shdr
*symbols_shdr
= elf64_getshdr(symbols_scn
);
636 report_elf_if(!symbols_shdr
);
637 uint32_t strtabidx
= symbols_shdr
->sh_link
;
639 Elf_Data
*symbols_data
= elf_getdata(symbols_scn
, NULL
);
640 report_elf_if(!symbols_data
);
642 const Elf64_Sym
*symbols
= symbols_data
->d_buf
;
643 size_t num_symbols
= symbols_data
->d_size
/ sizeof(Elf64_Sym
);
645 struct ac_rtld_section
*s
= &part
->sections
[reloc_shdr
->sh_info
];
646 report_if(!s
->is_rx
);
648 const char *orig_base
= target_data
->d_buf
;
649 char *dst_base
= u
->rx_ptr
+ s
->offset
;
650 uint64_t va_base
= u
->rx_va
+ s
->offset
;
652 Elf64_Rel
*rel
= reloc_data
->d_buf
;
653 size_t num_relocs
= reloc_data
->d_size
/ sizeof(*rel
);
654 for (size_t i
= 0; i
< num_relocs
; ++i
, ++rel
) {
655 size_t r_sym
= ELF64_R_SYM(rel
->r_info
);
656 unsigned r_type
= ELF64_R_TYPE(rel
->r_info
);
658 const char *orig_ptr
= orig_base
+ rel
->r_offset
;
659 char *dst_ptr
= dst_base
+ rel
->r_offset
;
660 uint64_t va
= va_base
+ rel
->r_offset
;
665 if (r_sym
== STN_UNDEF
) {
668 report_elf_if(r_sym
>= num_symbols
);
670 const Elf64_Sym
*sym
= &symbols
[r_sym
];
671 const char *symbol_name
=
672 elf_strptr(part
->elf
, strtabidx
, sym
->st_name
);
673 report_elf_if(!symbol_name
);
675 if (!resolve_symbol(u
, part_idx
, sym
, symbol_name
, &symbol
))
679 /* TODO: Should we also support .rela sections, where the
680 * addend is part of the relocation record? */
682 /* Load the addend from the ELF instead of the destination,
683 * because the destination may be in VRAM. */
686 case R_AMDGPU_ABS32_LO
:
687 case R_AMDGPU_ABS32_HI
:
689 case R_AMDGPU_REL32_LO
:
690 case R_AMDGPU_REL32_HI
:
691 addend
= *(const uint32_t *)orig_ptr
;
695 addend
= *(const uint64_t *)orig_ptr
;
698 report_errorf("unsupported r_type == %u", r_type
);
702 uint64_t abs
= symbol
+ addend
;
706 assert((uint32_t)abs
== abs
);
707 case R_AMDGPU_ABS32_LO
:
708 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
);
710 case R_AMDGPU_ABS32_HI
:
711 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
>> 32);
714 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
);
717 assert((int64_t)(int32_t)(abs
- va
) == (int64_t)(abs
- va
));
718 case R_AMDGPU_REL32_LO
:
719 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
- va
);
721 case R_AMDGPU_REL32_HI
:
722 *(uint32_t *)dst_ptr
= util_cpu_to_le32((abs
- va
) >> 32);
725 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
- va
);
728 unreachable("bad r_type");
739 * Upload the binary or binaries to the provided GPU buffers, including
742 bool ac_rtld_upload(struct ac_rtld_upload_info
*u
)
744 #define report_if(cond) \
747 report_errorf(#cond); \
751 #define report_elf_if(cond) \
754 report_errorf(#cond); \
759 if (u
->binary
->options
.halt_at_entry
) {
761 *(uint32_t *)u
->rx_ptr
= util_cpu_to_le32(0xbf8d0001);
764 /* First pass: upload raw section data and lay out private LDS symbols. */
765 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
766 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
768 Elf_Scn
*section
= NULL
;
769 while ((section
= elf_nextscn(part
->elf
, section
))) {
770 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
771 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
776 report_if(shdr
->sh_type
!= SHT_PROGBITS
);
778 Elf_Data
*data
= elf_getdata(section
, NULL
);
779 report_elf_if(!data
|| data
->d_size
!= shdr
->sh_size
);
780 memcpy(u
->rx_ptr
+ s
->offset
, data
->d_buf
, shdr
->sh_size
);
784 if (u
->binary
->rx_end_markers
) {
785 uint32_t *dst
= (uint32_t *)(u
->rx_ptr
+ u
->binary
->rx_end_markers
);
786 for (unsigned i
= 0; i
< DEBUGGER_NUM_MARKERS
; ++i
)
787 *dst
++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER
);
790 /* Second pass: handle relocations, overwriting uploaded data where
792 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
793 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
794 Elf_Scn
*section
= NULL
;
795 while ((section
= elf_nextscn(part
->elf
, section
))) {
796 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
797 if (shdr
->sh_type
== SHT_REL
) {
798 Elf_Data
*relocs
= elf_getdata(section
, NULL
);
799 report_elf_if(!relocs
|| relocs
->d_size
!= shdr
->sh_size
);
800 if (!apply_relocs(u
, i
, shdr
, relocs
))
802 } else if (shdr
->sh_type
== SHT_RELA
) {
803 report_errorf("SHT_RELA not supported");