Now that V3D has 8-byte-per-pixel formats exposed, we've got stride==32
utiles to load and store. Just handle them through the non-NEON paths for
now.
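For orientation, here is a minimal sketch of the control flow the load path ends up with after this patch. The asm bodies are elided, and the function name and guard macro are placeholders rather than the driver's own; the 64-byte utile size is taken from the fallback loop in the diff below. A stride==32 utile matches neither NEON case, so it falls through to the memcpy loop:

#include <stdint.h>
#include <string.h>

/* Hypothetical sketch, not the driver's code: NEON bodies elided. */
static void
load_utile_sketch(uint8_t *cpu, uint32_t cpu_stride,
                  const uint8_t *gpu, uint32_t gpu_stride)
{
#if defined(PIPE_ARCH_AARCH64) /* or the ARM NEON guard, as in the diff */
        if (gpu_stride == 8) {
                /* NEON: load the whole 64-byte utile at once, then store
                 * eight 8-byte rows out at cpu_stride.
                 */
                return;
        } else if (gpu_stride == 16) {
                /* NEON: same, but each 16-byte row is stored in two
                 * d-register halves.
                 */
                return;
        }
#endif
        /* stride==32 (and non-NEON builds) takes the plain copy:
         * two rows of 32 bytes each.
         */
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(cpu, gpu + gpu_offset, gpu_stride);
                cpu += cpu_stride;
        }
}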
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
-        } else {
-                assert(gpu_stride == 16);
+                return;
+        } else if (gpu_stride == 16) {
                 __asm__ volatile (
                         /* Load from the GPU in one shot, no interleave, to
                          * d0-d7.
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
+                return;
         }
 #elif defined (PIPE_ARCH_AARCH64)
         if (gpu_stride == 8) {
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
-        } else {
-                assert(gpu_stride == 16);
+                return;
+        } else if (gpu_stride == 16) {
                 __asm__ volatile (
                         /* Load from the GPU in one shot, no interleave, to
                          * d0-d7.
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
+                return;
         }
 #endif

         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(cpu, gpu + gpu_offset, gpu_stride);
                 cpu += cpu_stride;
         }
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
-        } else {
-                assert(gpu_stride == 16);
+                return;
+        } else if (gpu_stride == 16) {
                 __asm__ volatile (
                         /* Load each 16-byte line in 2 parts from the cpu-side
                          * destination.  (vld1 can only store one d-register
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
+                return;
         }
 #elif defined (PIPE_ARCH_AARCH64)
         if (gpu_stride == 8) {
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
-        } else {
-                assert(gpu_stride == 16);
+                return;
+        } else if (gpu_stride == 16) {
                 __asm__ volatile (
                         /* Load each 16-byte line in 2 parts from the cpu-side
                          * destination.  (vld1 can only store one d-register
                         :
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
+                return;
         }
 #endif

         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(gpu + gpu_offset, cpu, gpu_stride);
                 cpu += cpu_stride;
         }
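As a sanity check on the fallback, here is a small stand-alone round trip through the two memcpy loops above for a stride==32 utile. This is a hypothetical test harness, not part of the patch; the buffer sizes and the 128-byte cpu_stride are made up for illustration:

#include <assert.h>
#include <stdint.h>
#include <string.h>

int
main(void)
{
        uint8_t gpu[64], back[64], cpu[2 * 128];
        uint32_t gpu_stride = 32, cpu_stride = 128;
        uint8_t *c;

        for (int i = 0; i < 64; i++)
                gpu[i] = i;

        /* Load path: tiled GPU memory -> strided CPU memory,
         * two 32-byte rows.
         */
        c = cpu;
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(c, gpu + gpu_offset, gpu_stride);
                c += cpu_stride;
        }

        /* Store path: strided CPU memory -> tiled GPU memory. */
        c = cpu;
        for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                memcpy(back + gpu_offset, c, gpu_stride);
                c += cpu_stride;
        }

        /* The utile should survive the round trip byte for byte. */
        assert(memcmp(gpu, back, 64) == 0);
        return 0;
}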