2 * Copyright 2014-2019 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 #include "ac_binary.h"
34 #include "ac_gpu_info.h"
35 #include "util/u_dynarray.h"
36 #include "util/u_math.h"
38 // Old distributions may not have this enum constant
39 #define MY_EM_AMDGPU 224
41 #ifndef STT_AMDGPU_LDS
42 #define STT_AMDGPU_LDS 13 // this is deprecated -- remove
45 #ifndef SHN_AMDGPU_LDS
46 #define SHN_AMDGPU_LDS 0xff00
50 #define R_AMDGPU_NONE 0
51 #define R_AMDGPU_ABS32_LO 1
52 #define R_AMDGPU_ABS32_HI 2
53 #define R_AMDGPU_ABS64 3
54 #define R_AMDGPU_REL32 4
55 #define R_AMDGPU_REL64 5
56 #define R_AMDGPU_ABS32 6
57 #define R_AMDGPU_GOTPCREL 7
58 #define R_AMDGPU_GOTPCREL32_LO 8
59 #define R_AMDGPU_GOTPCREL32_HI 9
60 #define R_AMDGPU_REL32_LO 10
61 #define R_AMDGPU_REL32_HI 11
62 #define R_AMDGPU_RELATIVE64 13
65 /* For the UMR disassembler. */
66 #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
67 #define DEBUGGER_NUM_MARKERS 5
69 struct ac_rtld_section
{
71 bool is_pasted_text
: 1;
78 struct ac_rtld_section
*sections
;
79 unsigned num_sections
;
/* Format an error message from an already-started va_list and print it to
 * stderr with an "ac_rtld error:" prefix. */
static void report_erroraf(const char *fmt, va_list va)
{
   char *msg;
   /* BUG FIX: the original called asprintf(&msg, fmt, va), passing a
    * va_list where a variadic argument is expected — undefined behavior.
    * vasprintf is the va_list-taking variant. */
   int ret = vasprintf(&msg, fmt, va);
   if (ret < 0)
      msg = "(vasprintf failed)";

   fprintf(stderr, "ac_rtld error: %s\n", msg);

   /* Only free when vasprintf actually allocated. */
   if (ret >= 0)
      free(msg);
}
95 static void report_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
97 static void report_errorf(const char *fmt
, ...)
101 report_erroraf(fmt
, va
);
105 static void report_elf_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
107 static void report_elf_errorf(const char *fmt
, ...)
111 report_erroraf(fmt
, va
);
114 fprintf(stderr
, "ELF error: %s\n", elf_errmsg(elf_errno()));
118 * Find a symbol in a dynarray of struct ac_rtld_symbol by \p name and shader
121 static const struct ac_rtld_symbol
*find_symbol(const struct util_dynarray
*symbols
,
122 const char *name
, unsigned part_idx
)
124 util_dynarray_foreach(symbols
, struct ac_rtld_symbol
, symbol
) {
125 if ((symbol
->part_idx
== ~0u || symbol
->part_idx
== part_idx
) &&
126 !strcmp(name
, symbol
->name
))
132 static int compare_symbol_by_align(const void *lhsp
, const void *rhsp
)
134 const struct ac_rtld_symbol
*lhs
= lhsp
;
135 const struct ac_rtld_symbol
*rhs
= rhsp
;
136 if (rhs
->align
> lhs
->align
)
138 if (rhs
->align
< lhs
->align
)
144 * Sort the given symbol list by decreasing alignment and assign offsets.
146 static bool layout_symbols(struct ac_rtld_symbol
*symbols
, unsigned num_symbols
,
147 uint64_t *ptotal_size
)
149 qsort(symbols
, num_symbols
, sizeof(*symbols
), compare_symbol_by_align
);
151 uint64_t total_size
= *ptotal_size
;
153 for (unsigned i
= 0; i
< num_symbols
; ++i
) {
154 struct ac_rtld_symbol
*s
= &symbols
[i
];
155 assert(util_is_power_of_two_nonzero(s
->align
));
157 total_size
= align64(total_size
, s
->align
);
158 s
->offset
= total_size
;
160 if (total_size
+ s
->size
< total_size
) {
161 report_errorf("%s: size overflow", __FUNCTION__
);
165 total_size
+= s
->size
;
168 *ptotal_size
= total_size
;
173 * Read LDS symbols from the given \p section of the ELF of \p part and append
174 * them to the LDS symbols list.
176 * Shared LDS symbols are filtered out.
178 static bool read_private_lds_symbols(struct ac_rtld_binary
*binary
,
181 uint32_t *lds_end_align
)
183 #define report_if(cond) \
186 report_errorf(#cond); \
190 #define report_elf_if(cond) \
193 report_elf_errorf(#cond); \
198 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
199 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
200 uint32_t strtabidx
= shdr
->sh_link
;
201 Elf_Data
*symbols_data
= elf_getdata(section
, NULL
);
202 report_elf_if(!symbols_data
);
204 const Elf64_Sym
*symbol
= symbols_data
->d_buf
;
205 size_t num_symbols
= symbols_data
->d_size
/ sizeof(Elf64_Sym
);
207 for (size_t j
= 0; j
< num_symbols
; ++j
, ++symbol
) {
208 struct ac_rtld_symbol s
= {};
210 if (ELF64_ST_TYPE(symbol
->st_info
) == STT_AMDGPU_LDS
) {
211 /* old-style LDS symbols from initial prototype -- remove eventually */
212 s
.align
= MIN2(1u << (symbol
->st_other
>> 3), 1u << 16);
213 } else if (symbol
->st_shndx
== SHN_AMDGPU_LDS
) {
214 s
.align
= MIN2(symbol
->st_value
, 1u << 16);
215 report_if(!util_is_power_of_two_nonzero(s
.align
));
219 report_if(symbol
->st_size
> 1u << 29);
221 s
.name
= elf_strptr(part
->elf
, strtabidx
, symbol
->st_name
);
222 s
.size
= symbol
->st_size
;
223 s
.part_idx
= part_idx
;
225 if (!strcmp(s
.name
, "__lds_end")) {
226 report_elf_if(s
.size
!= 0);
227 *lds_end_align
= MAX2(*lds_end_align
, s
.align
);
231 const struct ac_rtld_symbol
*shared
=
232 find_symbol(&binary
->lds_symbols
, s
.name
, part_idx
);
234 report_elf_if(s
.align
> shared
->align
);
235 report_elf_if(s
.size
> shared
->size
);
239 util_dynarray_append(&binary
->lds_symbols
, struct ac_rtld_symbol
, s
);
249 * Open a binary consisting of one or more shader parts.
251 * \param binary the uninitialized struct
252 * \param i binary opening parameters
254 bool ac_rtld_open(struct ac_rtld_binary
*binary
,
255 struct ac_rtld_open_info i
)
257 /* One of the libelf implementations
258 * (http://www.mr511.de/software/english.htm) requires calling
259 * elf_version() before elf_memory().
261 elf_version(EV_CURRENT
);
263 memset(binary
, 0, sizeof(*binary
));
264 memcpy(&binary
->options
, &i
.options
, sizeof(binary
->options
));
265 binary
->wave_size
= i
.wave_size
;
266 binary
->num_parts
= i
.num_parts
;
267 binary
->parts
= calloc(sizeof(*binary
->parts
), i
.num_parts
);
271 uint64_t pasted_text_size
= 0;
272 uint64_t rx_align
= 1;
273 uint64_t rx_size
= 0;
275 #define report_if(cond) \
278 report_errorf(#cond); \
282 #define report_elf_if(cond) \
285 report_elf_errorf(#cond); \
290 /* Copy and layout shared LDS symbols. */
291 if (i
.num_shared_lds_symbols
) {
292 if (!util_dynarray_resize(&binary
->lds_symbols
, struct ac_rtld_symbol
,
293 i
.num_shared_lds_symbols
))
296 memcpy(binary
->lds_symbols
.data
, i
.shared_lds_symbols
, binary
->lds_symbols
.size
);
299 util_dynarray_foreach(&binary
->lds_symbols
, struct ac_rtld_symbol
, symbol
)
300 symbol
->part_idx
= ~0u;
302 unsigned max_lds_size
= 64 * 1024;
304 if (i
.info
->chip_class
== GFX6
||
305 (i
.shader_type
!= MESA_SHADER_COMPUTE
&&
306 i
.shader_type
!= MESA_SHADER_FRAGMENT
))
307 max_lds_size
= 32 * 1024;
309 uint64_t shared_lds_size
= 0;
310 if (!layout_symbols(binary
->lds_symbols
.data
, i
.num_shared_lds_symbols
, &shared_lds_size
))
313 if (shared_lds_size
> max_lds_size
) {
314 fprintf(stderr
, "ac_rtld error(1): too much LDS (used = %u, max = %u)\n",
315 (unsigned)shared_lds_size
, max_lds_size
);
318 binary
->lds_size
= shared_lds_size
;
320 /* First pass over all parts: open ELFs, pre-determine the placement of
321 * sections in the memory image, and collect and layout private LDS symbols. */
322 uint32_t lds_end_align
= 0;
324 if (binary
->options
.halt_at_entry
)
325 pasted_text_size
+= 4;
327 for (unsigned part_idx
= 0; part_idx
< i
.num_parts
; ++part_idx
) {
328 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
329 unsigned part_lds_symbols_begin
=
330 util_dynarray_num_elements(&binary
->lds_symbols
, struct ac_rtld_symbol
);
332 part
->elf
= elf_memory((char *)i
.elf_ptrs
[part_idx
], i
.elf_sizes
[part_idx
]);
333 report_elf_if(!part
->elf
);
335 const Elf64_Ehdr
*ehdr
= elf64_getehdr(part
->elf
);
336 report_elf_if(!ehdr
);
337 report_if(ehdr
->e_machine
!= MY_EM_AMDGPU
);
339 size_t section_str_index
;
341 report_elf_if(elf_getshdrstrndx(part
->elf
, §ion_str_index
) < 0);
342 report_elf_if(elf_getshdrnum(part
->elf
, &num_shdrs
) < 0);
344 part
->num_sections
= num_shdrs
;
345 part
->sections
= calloc(sizeof(*part
->sections
), num_shdrs
);
346 report_if(!part
->sections
);
348 Elf_Scn
*section
= NULL
;
349 while ((section
= elf_nextscn(part
->elf
, section
))) {
350 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
351 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
352 s
->name
= elf_strptr(part
->elf
, section_str_index
, shdr
->sh_name
);
353 report_elf_if(!s
->name
);
355 /* Cannot actually handle linked objects yet */
356 report_elf_if(shdr
->sh_addr
!= 0);
358 /* Alignment must be 0 or a power of two */
359 report_elf_if(shdr
->sh_addralign
& (shdr
->sh_addralign
- 1));
360 uint64_t sh_align
= MAX2(shdr
->sh_addralign
, 1);
362 if (shdr
->sh_flags
& SHF_ALLOC
&&
363 shdr
->sh_type
!= SHT_NOTE
) {
364 report_if(shdr
->sh_flags
& SHF_WRITE
);
368 if (shdr
->sh_flags
& SHF_EXECINSTR
) {
369 report_elf_if(shdr
->sh_size
& 3);
371 if (!strcmp(s
->name
, ".text"))
372 s
->is_pasted_text
= true;
375 if (s
->is_pasted_text
) {
376 s
->offset
= pasted_text_size
;
377 pasted_text_size
+= shdr
->sh_size
;
379 rx_align
= align(rx_align
, sh_align
);
380 rx_size
= align(rx_size
, sh_align
);
382 rx_size
+= shdr
->sh_size
;
384 } else if (shdr
->sh_type
== SHT_SYMTAB
) {
385 if (!read_private_lds_symbols(binary
, part_idx
, section
, &lds_end_align
))
390 uint64_t part_lds_size
= shared_lds_size
;
392 util_dynarray_element(&binary
->lds_symbols
, struct ac_rtld_symbol
, part_lds_symbols_begin
),
393 util_dynarray_num_elements(&binary
->lds_symbols
, struct ac_rtld_symbol
) - part_lds_symbols_begin
,
396 binary
->lds_size
= MAX2(binary
->lds_size
, part_lds_size
);
399 binary
->rx_end_markers
= pasted_text_size
;
400 pasted_text_size
+= 4 * DEBUGGER_NUM_MARKERS
;
402 /* __lds_end is a special symbol that points at the end of the memory
403 * occupied by other LDS symbols. Its alignment is taken as the
404 * maximum of its alignment over all shader parts where it occurs.
407 binary
->lds_size
= align(binary
->lds_size
, lds_end_align
);
409 struct ac_rtld_symbol
*lds_end
=
410 util_dynarray_grow(&binary
->lds_symbols
, struct ac_rtld_symbol
, 1);
411 lds_end
->name
= "__lds_end";
413 lds_end
->align
= lds_end_align
;
414 lds_end
->offset
= binary
->lds_size
;
415 lds_end
->part_idx
= ~0u;
418 if (binary
->lds_size
> max_lds_size
) {
419 fprintf(stderr
, "ac_rtld error(2): too much LDS (used = %u, max = %u)\n",
420 (unsigned)binary
->lds_size
, max_lds_size
);
424 /* Second pass: Adjust offsets of non-pasted text sections. */
425 binary
->rx_size
= pasted_text_size
;
426 binary
->rx_size
= align(binary
->rx_size
, rx_align
);
428 for (unsigned part_idx
= 0; part_idx
< i
.num_parts
; ++part_idx
) {
429 struct ac_rtld_part
*part
= &binary
->parts
[part_idx
];
431 elf_getshdrnum(part
->elf
, &num_shdrs
);
433 for (unsigned j
= 0; j
< num_shdrs
; ++j
) {
434 struct ac_rtld_section
*s
= &part
->sections
[j
];
435 if (s
->is_rx
&& !s
->is_pasted_text
)
436 s
->offset
+= binary
->rx_size
;
440 binary
->rx_size
+= rx_size
;
442 if (i
.info
->chip_class
>= GFX10
) {
443 /* In gfx10, the SQ fetches up to 3 cache lines of 16 dwords
444 * ahead of the PC, configurable by SH_MEM_CONFIG and
445 * S_INST_PREFETCH. This can cause two issues:
447 * (1) Crossing a page boundary to an unmapped page. The logic
448 * does not distinguish between a required fetch and a "mere"
449 * prefetch and will fault.
451 * (2) Prefetching instructions that will be changed for a
454 * (2) is not currently an issue because we flush the I$ at IB
455 * boundaries, but (1) needs to be addressed. Due to buffer
456 * suballocation, we just play it safe.
458 binary
->rx_size
= align(binary
->rx_size
+ 3 * 64, 64);
467 ac_rtld_close(binary
);
471 void ac_rtld_close(struct ac_rtld_binary
*binary
)
473 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
474 struct ac_rtld_part
*part
= &binary
->parts
[i
];
475 free(part
->sections
);
479 util_dynarray_fini(&binary
->lds_symbols
);
481 binary
->parts
= NULL
;
482 binary
->num_parts
= 0;
485 static bool get_section_by_name(struct ac_rtld_part
*part
, const char *name
,
486 const char **data
, size_t *nbytes
)
488 for (unsigned i
= 0; i
< part
->num_sections
; ++i
) {
489 struct ac_rtld_section
*s
= &part
->sections
[i
];
490 if (s
->name
&& !strcmp(name
, s
->name
)) {
491 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, i
);
492 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
494 report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
498 *data
= target_data
->d_buf
;
499 *nbytes
= target_data
->d_size
;
506 bool ac_rtld_get_section_by_name(struct ac_rtld_binary
*binary
, const char *name
,
507 const char **data
, size_t *nbytes
)
509 assert(binary
->num_parts
== 1);
510 return get_section_by_name(&binary
->parts
[0], name
, data
, nbytes
);
513 bool ac_rtld_read_config(struct ac_rtld_binary
*binary
,
514 struct ac_shader_config
*config
)
516 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
517 struct ac_rtld_part
*part
= &binary
->parts
[i
];
518 const char *config_data
;
519 size_t config_nbytes
;
521 if (!get_section_by_name(part
, ".AMDGPU.config",
522 &config_data
, &config_nbytes
))
525 /* TODO: be precise about scratch use? */
526 struct ac_shader_config c
= {};
527 ac_parse_shader_binary_config(config_data
, config_nbytes
,
528 binary
->wave_size
, true, &c
);
530 config
->num_sgprs
= MAX2(config
->num_sgprs
, c
.num_sgprs
);
531 config
->num_vgprs
= MAX2(config
->num_vgprs
, c
.num_vgprs
);
532 config
->spilled_sgprs
= MAX2(config
->spilled_sgprs
, c
.spilled_sgprs
);
533 config
->spilled_vgprs
= MAX2(config
->spilled_vgprs
, c
.spilled_vgprs
);
534 config
->scratch_bytes_per_wave
= MAX2(config
->scratch_bytes_per_wave
,
535 c
.scratch_bytes_per_wave
);
537 assert(i
== 0 || config
->float_mode
== c
.float_mode
);
538 config
->float_mode
= c
.float_mode
;
540 /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
541 * the main shader part is used. */
542 assert(config
->spi_ps_input_ena
== 0 &&
543 config
->spi_ps_input_addr
== 0);
544 config
->spi_ps_input_ena
= c
.spi_ps_input_ena
;
545 config
->spi_ps_input_addr
= c
.spi_ps_input_addr
;
547 /* TODO: consistently use LDS symbols for this */
548 config
->lds_size
= MAX2(config
->lds_size
, c
.lds_size
);
550 /* TODO: Should we combine these somehow? It's currently only
551 * used for radeonsi's compute, where multiple parts aren't used. */
552 assert(config
->rsrc1
== 0 && config
->rsrc2
== 0);
553 config
->rsrc1
= c
.rsrc1
;
554 config
->rsrc2
= c
.rsrc2
;
560 static bool resolve_symbol(const struct ac_rtld_upload_info
*u
,
561 unsigned part_idx
, const Elf64_Sym
*sym
,
562 const char *name
, uint64_t *value
)
564 /* TODO: properly disentangle the undef and the LDS cases once
565 * STT_AMDGPU_LDS is retired. */
566 if (sym
->st_shndx
== SHN_UNDEF
|| sym
->st_shndx
== SHN_AMDGPU_LDS
) {
567 const struct ac_rtld_symbol
*lds_sym
=
568 find_symbol(&u
->binary
->lds_symbols
, name
, part_idx
);
571 *value
= lds_sym
->offset
;
575 /* TODO: resolve from other parts */
577 if (u
->get_external_symbol(u
->cb_data
, name
, value
))
580 report_errorf("symbol %s: unknown", name
);
584 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
585 if (sym
->st_shndx
>= part
->num_sections
) {
586 report_errorf("symbol %s: section out of bounds", name
);
590 struct ac_rtld_section
*s
= &part
->sections
[sym
->st_shndx
];
592 report_errorf("symbol %s: bad section", name
);
596 uint64_t section_base
= u
->rx_va
+ s
->offset
;
598 *value
= section_base
+ sym
->st_value
;
602 static bool apply_relocs(const struct ac_rtld_upload_info
*u
,
603 unsigned part_idx
, const Elf64_Shdr
*reloc_shdr
,
604 const Elf_Data
*reloc_data
)
606 #define report_if(cond) \
609 report_errorf(#cond); \
613 #define report_elf_if(cond) \
616 report_elf_errorf(#cond); \
621 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
622 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_info
);
623 report_elf_if(!target_scn
);
625 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
626 report_elf_if(!target_data
);
628 Elf_Scn
*symbols_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_link
);
629 report_elf_if(!symbols_scn
);
631 Elf64_Shdr
*symbols_shdr
= elf64_getshdr(symbols_scn
);
632 report_elf_if(!symbols_shdr
);
633 uint32_t strtabidx
= symbols_shdr
->sh_link
;
635 Elf_Data
*symbols_data
= elf_getdata(symbols_scn
, NULL
);
636 report_elf_if(!symbols_data
);
638 const Elf64_Sym
*symbols
= symbols_data
->d_buf
;
639 size_t num_symbols
= symbols_data
->d_size
/ sizeof(Elf64_Sym
);
641 struct ac_rtld_section
*s
= &part
->sections
[reloc_shdr
->sh_info
];
642 report_if(!s
->is_rx
);
644 const char *orig_base
= target_data
->d_buf
;
645 char *dst_base
= u
->rx_ptr
+ s
->offset
;
646 uint64_t va_base
= u
->rx_va
+ s
->offset
;
648 Elf64_Rel
*rel
= reloc_data
->d_buf
;
649 size_t num_relocs
= reloc_data
->d_size
/ sizeof(*rel
);
650 for (size_t i
= 0; i
< num_relocs
; ++i
, ++rel
) {
651 size_t r_sym
= ELF64_R_SYM(rel
->r_info
);
652 unsigned r_type
= ELF64_R_TYPE(rel
->r_info
);
654 const char *orig_ptr
= orig_base
+ rel
->r_offset
;
655 char *dst_ptr
= dst_base
+ rel
->r_offset
;
656 uint64_t va
= va_base
+ rel
->r_offset
;
661 if (r_sym
== STN_UNDEF
) {
664 report_elf_if(r_sym
>= num_symbols
);
666 const Elf64_Sym
*sym
= &symbols
[r_sym
];
667 const char *symbol_name
=
668 elf_strptr(part
->elf
, strtabidx
, sym
->st_name
);
669 report_elf_if(!symbol_name
);
671 if (!resolve_symbol(u
, part_idx
, sym
, symbol_name
, &symbol
))
675 /* TODO: Should we also support .rela sections, where the
676 * addend is part of the relocation record? */
678 /* Load the addend from the ELF instead of the destination,
679 * because the destination may be in VRAM. */
682 case R_AMDGPU_ABS32_LO
:
683 case R_AMDGPU_ABS32_HI
:
685 case R_AMDGPU_REL32_LO
:
686 case R_AMDGPU_REL32_HI
:
687 addend
= *(const uint32_t *)orig_ptr
;
691 addend
= *(const uint64_t *)orig_ptr
;
694 report_errorf("unsupported r_type == %u", r_type
);
698 uint64_t abs
= symbol
+ addend
;
702 assert((uint32_t)abs
== abs
);
703 case R_AMDGPU_ABS32_LO
:
704 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
);
706 case R_AMDGPU_ABS32_HI
:
707 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
>> 32);
710 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
);
713 assert((int64_t)(int32_t)(abs
- va
) == (int64_t)(abs
- va
));
714 case R_AMDGPU_REL32_LO
:
715 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
- va
);
717 case R_AMDGPU_REL32_HI
:
718 *(uint32_t *)dst_ptr
= util_cpu_to_le32((abs
- va
) >> 32);
721 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
- va
);
724 unreachable("bad r_type");
735 * Upload the binary or binaries to the provided GPU buffers, including
738 bool ac_rtld_upload(struct ac_rtld_upload_info
*u
)
740 #define report_if(cond) \
743 report_errorf(#cond); \
747 #define report_elf_if(cond) \
750 report_errorf(#cond); \
755 if (u
->binary
->options
.halt_at_entry
) {
757 *(uint32_t *)u
->rx_ptr
= util_cpu_to_le32(0xbf8d0001);
760 /* First pass: upload raw section data and lay out private LDS symbols. */
761 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
762 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
764 Elf_Scn
*section
= NULL
;
765 while ((section
= elf_nextscn(part
->elf
, section
))) {
766 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
767 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
772 report_if(shdr
->sh_type
!= SHT_PROGBITS
);
774 Elf_Data
*data
= elf_getdata(section
, NULL
);
775 report_elf_if(!data
|| data
->d_size
!= shdr
->sh_size
);
776 memcpy(u
->rx_ptr
+ s
->offset
, data
->d_buf
, shdr
->sh_size
);
780 if (u
->binary
->rx_end_markers
) {
781 uint32_t *dst
= (uint32_t *)(u
->rx_ptr
+ u
->binary
->rx_end_markers
);
782 for (unsigned i
= 0; i
< DEBUGGER_NUM_MARKERS
; ++i
)
783 *dst
++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER
);
786 /* Second pass: handle relocations, overwriting uploaded data where
788 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
789 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
790 Elf_Scn
*section
= NULL
;
791 while ((section
= elf_nextscn(part
->elf
, section
))) {
792 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
793 if (shdr
->sh_type
== SHT_REL
) {
794 Elf_Data
*relocs
= elf_getdata(section
, NULL
);
795 report_elf_if(!relocs
|| relocs
->d_size
!= shdr
->sh_size
);
796 if (!apply_relocs(u
, i
, shdr
, relocs
))
798 } else if (shdr
->sh_type
== SHT_RELA
) {
799 report_errorf("SHT_RELA not supported");