/*
 * Copyright 2014-2019 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
33 #include "ac_binary.h"
34 #include "util/u_math.h"
/* Old distributions may not have this enum constant yet. */
#define MY_EM_AMDGPU 224

/* AMDGPU ELF relocation types (values match the AMDGPU processor ABI). */
#define R_AMDGPU_NONE           0
#define R_AMDGPU_ABS32_LO       1
#define R_AMDGPU_ABS32_HI       2
#define R_AMDGPU_ABS64          3
#define R_AMDGPU_REL32          4
#define R_AMDGPU_REL64          5
#define R_AMDGPU_ABS32          6
#define R_AMDGPU_GOTPCREL      7
#define R_AMDGPU_GOTPCREL32_LO 8
#define R_AMDGPU_GOTPCREL32_HI 9
#define R_AMDGPU_REL32_LO      10
#define R_AMDGPU_REL32_HI      11
#define R_AMDGPU_RELATIVE64    13

/* For the UMR disassembler. */
#define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */
#define DEBUGGER_NUM_MARKERS 5
59 struct ac_rtld_section
{
61 bool is_pasted_text
: 1;
68 struct ac_rtld_section
*sections
;
69 unsigned num_sections
;
/* Print a formatted "ac_rtld error: ..." message to stderr.
 *
 * \param fmt printf-style format string
 * \param va  argument list matching \p fmt
 */
static void report_erroraf(const char *fmt, va_list va)
{
   char *msg;
   /* Must be vasprintf, not asprintf: passing a va_list where a variadic
    * argument is expected is undefined behavior. */
   int ret = vasprintf(&msg, fmt, va);
   if (ret < 0)
      msg = "(vasprintf failed)";

   fprintf(stderr, "ac_rtld error: %s\n", msg);

   /* Only free on success; on failure msg points at a string literal. */
   if (ret >= 0)
      free(msg);
}
85 static void report_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
87 static void report_errorf(const char *fmt
, ...)
91 report_erroraf(fmt
, va
);
95 static void report_elf_errorf(const char *fmt
, ...) PRINTFLIKE(1, 2);
97 static void report_elf_errorf(const char *fmt
, ...)
101 report_erroraf(fmt
, va
);
104 fprintf(stderr
, "ELF error: %s\n", elf_errmsg(elf_errno()));
108 * Open a binary consisting of one or more shader parts.
110 * \param binary the uninitialized struct
111 * \param num_parts number of shader parts
112 * \param elf_ptrs pointers to the in-memory ELF objects for each shader part
113 * \param elf_sizes sizes (in bytes) of the in-memory ELF objects
115 bool ac_rtld_open(struct ac_rtld_binary
*binary
, unsigned num_parts
,
116 const char * const *elf_ptrs
,
117 const size_t *elf_sizes
)
119 /* One of the libelf implementations
120 * (http://www.mr511.de/software/english.htm) requires calling
121 * elf_version() before elf_memory().
123 elf_version(EV_CURRENT
);
125 memset(binary
, 0, sizeof(*binary
));
126 binary
->num_parts
= num_parts
;
127 binary
->parts
= calloc(sizeof(*binary
->parts
), num_parts
);
131 uint64_t pasted_text_size
= 0;
132 uint64_t rx_align
= 1;
133 uint64_t rx_size
= 0;
135 #define report_if(cond) \
138 report_errorf(#cond); \
142 #define report_elf_if(cond) \
145 report_elf_errorf(#cond); \
150 /* First pass over all parts: open ELFs and determine the placement of
151 * sections in the memory image. */
152 for (unsigned i
= 0; i
< num_parts
; ++i
) {
153 struct ac_rtld_part
*part
= &binary
->parts
[i
];
154 part
->elf
= elf_memory((char *)elf_ptrs
[i
], elf_sizes
[i
]);
155 report_elf_if(!part
->elf
);
157 const Elf64_Ehdr
*ehdr
= elf64_getehdr(part
->elf
);
158 report_elf_if(!ehdr
);
159 report_if(ehdr
->e_machine
!= MY_EM_AMDGPU
);
161 size_t section_str_index
;
163 report_elf_if(elf_getshdrstrndx(part
->elf
, §ion_str_index
) < 0);
164 report_elf_if(elf_getshdrnum(part
->elf
, &num_shdrs
) < 0);
166 part
->num_sections
= num_shdrs
;
167 part
->sections
= calloc(sizeof(*part
->sections
), num_shdrs
);
168 report_if(!part
->sections
);
170 Elf_Scn
*section
= NULL
;
171 while ((section
= elf_nextscn(part
->elf
, section
))) {
172 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
173 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
174 s
->name
= elf_strptr(part
->elf
, section_str_index
, shdr
->sh_name
);
175 report_elf_if(!s
->name
);
177 /* Cannot actually handle linked objects yet */
178 report_elf_if(shdr
->sh_addr
!= 0);
180 /* Alignment must be 0 or a power of two */
181 report_elf_if(shdr
->sh_addralign
& (shdr
->sh_addralign
- 1));
182 uint64_t sh_align
= MAX2(shdr
->sh_addralign
, 1);
184 if (shdr
->sh_flags
& SHF_ALLOC
&&
185 shdr
->sh_type
!= SHT_NOTE
) {
186 report_if(shdr
->sh_flags
& SHF_WRITE
);
190 if (shdr
->sh_flags
& SHF_EXECINSTR
) {
191 report_elf_if(shdr
->sh_size
& 3);
193 if (!strcmp(s
->name
, ".text"))
194 s
->is_pasted_text
= true;
197 if (s
->is_pasted_text
) {
198 s
->offset
= pasted_text_size
;
199 pasted_text_size
+= shdr
->sh_size
;
201 rx_align
= align(rx_align
, sh_align
);
202 rx_size
= align(rx_size
, sh_align
);
204 rx_size
+= shdr
->sh_size
;
210 binary
->rx_end_markers
= pasted_text_size
;
211 pasted_text_size
+= 4 * DEBUGGER_NUM_MARKERS
;
213 /* Second pass: Adjust offsets of non-pasted text sections. */
214 binary
->rx_size
= pasted_text_size
;
215 binary
->rx_size
= align(binary
->rx_size
, rx_align
);
217 for (unsigned i
= 0; i
< num_parts
; ++i
) {
218 struct ac_rtld_part
*part
= &binary
->parts
[i
];
220 elf_getshdrnum(part
->elf
, &num_shdrs
);
222 for (unsigned j
= 0; j
< num_shdrs
; ++j
) {
223 struct ac_rtld_section
*s
= &part
->sections
[j
];
224 if (s
->is_rx
&& !s
->is_pasted_text
)
225 s
->offset
+= binary
->rx_size
;
229 binary
->rx_size
+= rx_size
;
237 ac_rtld_close(binary
);
241 void ac_rtld_close(struct ac_rtld_binary
*binary
)
243 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
244 struct ac_rtld_part
*part
= &binary
->parts
[i
];
245 free(part
->sections
);
250 binary
->parts
= NULL
;
251 binary
->num_parts
= 0;
254 static bool get_section_by_name(struct ac_rtld_part
*part
, const char *name
,
255 const char **data
, size_t *nbytes
)
257 for (unsigned i
= 0; i
< part
->num_sections
; ++i
) {
258 struct ac_rtld_section
*s
= &part
->sections
[i
];
259 if (s
->name
&& !strcmp(name
, s
->name
)) {
260 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, i
);
261 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
263 report_elf_errorf("ac_rtld: get_section_by_name: elf_getdata");
267 *data
= target_data
->d_buf
;
268 *nbytes
= target_data
->d_size
;
275 bool ac_rtld_get_section_by_name(struct ac_rtld_binary
*binary
, const char *name
,
276 const char **data
, size_t *nbytes
)
278 assert(binary
->num_parts
== 1);
279 return get_section_by_name(&binary
->parts
[0], name
, data
, nbytes
);
282 bool ac_rtld_read_config(struct ac_rtld_binary
*binary
,
283 struct ac_shader_config
*config
)
285 for (unsigned i
= 0; i
< binary
->num_parts
; ++i
) {
286 struct ac_rtld_part
*part
= &binary
->parts
[i
];
287 const char *config_data
;
288 size_t config_nbytes
;
290 if (!get_section_by_name(part
, ".AMDGPU.config",
291 &config_data
, &config_nbytes
))
294 /* TODO: be precise about scratch use? */
295 struct ac_shader_config c
= {};
296 ac_parse_shader_binary_config(config_data
, config_nbytes
, true, &c
);
298 config
->num_sgprs
= MAX2(config
->num_sgprs
, c
.num_sgprs
);
299 config
->num_vgprs
= MAX2(config
->num_vgprs
, c
.num_vgprs
);
300 config
->spilled_sgprs
= MAX2(config
->spilled_sgprs
, c
.spilled_sgprs
);
301 config
->spilled_vgprs
= MAX2(config
->spilled_vgprs
, c
.spilled_vgprs
);
302 config
->scratch_bytes_per_wave
= MAX2(config
->scratch_bytes_per_wave
,
303 c
.scratch_bytes_per_wave
);
305 assert(i
== 0 || config
->float_mode
== c
.float_mode
);
306 config
->float_mode
= c
.float_mode
;
308 /* SPI_PS_INPUT_ENA/ADDR can't be combined. Only the value from
309 * the main shader part is used. */
310 assert(config
->spi_ps_input_ena
== 0 &&
311 config
->spi_ps_input_addr
== 0);
312 config
->spi_ps_input_ena
= c
.spi_ps_input_ena
;
313 config
->spi_ps_input_addr
= c
.spi_ps_input_addr
;
315 /* TODO: consistently use LDS symbols for this */
316 config
->lds_size
= MAX2(config
->lds_size
, c
.lds_size
);
318 /* TODO: Should we combine these somehow? It's currently only
319 * used for radeonsi's compute, where multiple parts aren't used. */
320 assert(config
->rsrc1
== 0 && config
->rsrc2
== 0);
321 config
->rsrc1
= c
.rsrc1
;
322 config
->rsrc2
= c
.rsrc2
;
328 static bool resolve_symbol(const struct ac_rtld_upload_info
*u
,
329 unsigned part_idx
, const Elf64_Sym
*sym
,
330 const char *name
, uint64_t *value
)
332 if (sym
->st_shndx
== SHN_UNDEF
) {
333 /* TODO: resolve from other parts */
335 if (u
->get_external_symbol(u
->cb_data
, name
, value
))
338 report_errorf("symbol %s: unknown", name
);
342 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
343 if (sym
->st_shndx
>= part
->num_sections
) {
344 report_errorf("symbol %s: section out of bounds", name
);
348 struct ac_rtld_section
*s
= &part
->sections
[sym
->st_shndx
];
350 report_errorf("symbol %s: bad section", name
);
354 uint64_t section_base
= u
->rx_va
+ s
->offset
;
356 *value
= section_base
+ sym
->st_value
;
360 static bool apply_relocs(const struct ac_rtld_upload_info
*u
,
361 unsigned part_idx
, const Elf64_Shdr
*reloc_shdr
,
362 const Elf_Data
*reloc_data
)
364 #define report_if(cond) \
367 report_errorf(#cond); \
371 #define report_elf_if(cond) \
374 report_elf_errorf(#cond); \
379 struct ac_rtld_part
*part
= &u
->binary
->parts
[part_idx
];
380 Elf_Scn
*target_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_info
);
381 report_elf_if(!target_scn
);
383 Elf_Data
*target_data
= elf_getdata(target_scn
, NULL
);
384 report_elf_if(!target_data
);
386 Elf_Scn
*symbols_scn
= elf_getscn(part
->elf
, reloc_shdr
->sh_link
);
387 report_elf_if(!symbols_scn
);
389 Elf64_Shdr
*symbols_shdr
= elf64_getshdr(symbols_scn
);
390 report_elf_if(!symbols_shdr
);
391 uint32_t strtabidx
= symbols_shdr
->sh_link
;
393 Elf_Data
*symbols_data
= elf_getdata(symbols_scn
, NULL
);
394 report_elf_if(!symbols_data
);
396 const Elf64_Sym
*symbols
= symbols_data
->d_buf
;
397 size_t num_symbols
= symbols_data
->d_size
/ sizeof(Elf64_Sym
);
399 struct ac_rtld_section
*s
= &part
->sections
[reloc_shdr
->sh_info
];
400 report_if(!s
->is_rx
);
402 const char *orig_base
= target_data
->d_buf
;
403 char *dst_base
= u
->rx_ptr
+ s
->offset
;
404 uint64_t va_base
= u
->rx_va
+ s
->offset
;
406 Elf64_Rel
*rel
= reloc_data
->d_buf
;
407 size_t num_relocs
= reloc_data
->d_size
/ sizeof(*rel
);
408 for (size_t i
= 0; i
< num_relocs
; ++i
, ++rel
) {
409 size_t r_sym
= ELF64_R_SYM(rel
->r_info
);
410 unsigned r_type
= ELF64_R_TYPE(rel
->r_info
);
412 const char *orig_ptr
= orig_base
+ rel
->r_offset
;
413 char *dst_ptr
= dst_base
+ rel
->r_offset
;
414 uint64_t va
= va_base
+ rel
->r_offset
;
419 if (r_sym
== STN_UNDEF
) {
422 report_elf_if(r_sym
>= num_symbols
);
424 const Elf64_Sym
*sym
= &symbols
[r_sym
];
425 const char *symbol_name
=
426 elf_strptr(part
->elf
, strtabidx
, sym
->st_name
);
427 report_elf_if(!symbol_name
);
429 if (!resolve_symbol(u
, part_idx
, sym
, symbol_name
, &symbol
))
433 /* TODO: Should we also support .rela sections, where the
434 * addend is part of the relocation record? */
436 /* Load the addend from the ELF instead of the destination,
437 * because the destination may be in VRAM. */
440 case R_AMDGPU_ABS32_LO
:
441 case R_AMDGPU_ABS32_HI
:
443 case R_AMDGPU_REL32_LO
:
444 case R_AMDGPU_REL32_HI
:
445 addend
= *(const uint32_t *)orig_ptr
;
449 addend
= *(const uint64_t *)orig_ptr
;
452 report_errorf("unsupported r_type == %u", r_type
);
456 uint64_t abs
= symbol
+ addend
;
460 assert((uint32_t)abs
== abs
);
461 case R_AMDGPU_ABS32_LO
:
462 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
);
464 case R_AMDGPU_ABS32_HI
:
465 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
>> 32);
468 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
);
471 assert((int64_t)(int32_t)(abs
- va
) == (int64_t)(abs
- va
));
472 case R_AMDGPU_REL32_LO
:
473 *(uint32_t *)dst_ptr
= util_cpu_to_le32(abs
- va
);
475 case R_AMDGPU_REL32_HI
:
476 *(uint32_t *)dst_ptr
= util_cpu_to_le32((abs
- va
) >> 32);
479 *(uint64_t *)dst_ptr
= util_cpu_to_le64(abs
- va
);
482 unreachable("bad r_type");
493 * Upload the binary or binaries to the provided GPU buffers, including
496 bool ac_rtld_upload(struct ac_rtld_upload_info
*u
)
498 #define report_if(cond) \
501 report_errorf(#cond); \
505 #define report_elf_if(cond) \
508 report_errorf(#cond); \
513 /* First pass: upload raw section data. */
514 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
515 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
516 Elf_Scn
*section
= NULL
;
517 while ((section
= elf_nextscn(part
->elf
, section
))) {
518 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
519 struct ac_rtld_section
*s
= &part
->sections
[elf_ndxscn(section
)];
524 report_if(shdr
->sh_type
!= SHT_PROGBITS
);
526 Elf_Data
*data
= elf_getdata(section
, NULL
);
527 report_elf_if(!data
|| data
->d_size
!= shdr
->sh_size
);
528 memcpy(u
->rx_ptr
+ s
->offset
, data
->d_buf
, shdr
->sh_size
);
532 if (u
->binary
->rx_end_markers
) {
533 uint32_t *dst
= (uint32_t *)(u
->rx_ptr
+ u
->binary
->rx_end_markers
);
534 for (unsigned i
= 0; i
< DEBUGGER_NUM_MARKERS
; ++i
)
535 *dst
++ = util_cpu_to_le32(DEBUGGER_END_OF_CODE_MARKER
);
538 /* Second pass: handle relocations, overwriting uploaded data where
540 for (unsigned i
= 0; i
< u
->binary
->num_parts
; ++i
) {
541 struct ac_rtld_part
*part
= &u
->binary
->parts
[i
];
542 Elf_Scn
*section
= NULL
;
543 while ((section
= elf_nextscn(part
->elf
, section
))) {
544 Elf64_Shdr
*shdr
= elf64_getshdr(section
);
545 if (shdr
->sh_type
== SHT_REL
) {
546 Elf_Data
*relocs
= elf_getdata(section
, NULL
);
547 report_elf_if(!relocs
|| relocs
->d_size
!= shdr
->sh_size
);
548 if (!apply_relocs(u
, i
, shdr
, relocs
))
550 } else if (shdr
->sh_type
== SHT_RELA
) {
551 report_errorf("SHT_RELA not supported");