From 48a9ba63f5c9751052e472f8d7fb195ce874199d Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Wed, 12 Mar 2014 12:00:58 -0400 Subject: [PATCH] nv50/ir/gk110: add implementations of div u32/s32 Signed-off-by: Ilia Mirkin --- .../nouveau/codegen/target_lib_nvf0.asm | 86 +++++++++++++++++++ .../nouveau/codegen/target_lib_nvf0.asm.h | 81 +++++++++++++++-- 2 files changed, 162 insertions(+), 5 deletions(-) create mode 100644 src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm new file mode 100644 index 00000000000..a0c5497524a --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm @@ -0,0 +1,86 @@ +// +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 30 * 8 bytes (26 instructions + 4 sched words) +// +sched 0x28282804280428 +bfind u32 $r2 $r1 +xor b32 $r2 $r2 0x1f +mov b32 $r3 0x1 +shl b32 $r2 $r3 clamp $r2 +cvt u32 $r1 neg u32 $r1 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +sched 0x28282828282828 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +sched 0x042c2828042804 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mov b32 $r3 $r0 +mul high $r0 u32 $r0 u32 $r2 +cvt u32 $r2 neg u32 $r1 +add $r1 (mul u32 $r1 u32 $r0) $r3 +set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +sched 0x20282e20042c28 +$p0 add b32 $r0 $r0 0x1 +$p0 set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +ret +// +// DIV S32, like DIV U32 after taking ABS(inputs); the result is negated if $p3 (input signs differ) and the modulus if $p2 (dividend < 0) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 -
$r3, $p0 - $p3 +// +set $p2 0x1 lt s32 $r0 0x0 +set $p3 0x1 lt s32 $r1 0x0 xor $p2 +sched 0x28042804282820 +cvt s32 $r0 abs s32 $r0 +cvt s32 $r1 abs s32 $r1 +bfind u32 $r2 $r1 +xor b32 $r2 $r2 0x1f +mov b32 $r3 0x1 +shl b32 $r2 $r3 clamp $r2 +cvt u32 $r1 neg u32 $r1 +sched 0x28282828282828 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +sched 0x28280428042828 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mov b32 $r3 $r0 +mul high $r0 u32 $r0 u32 $r2 +cvt u32 $r2 neg u32 $r1 +add $r1 (mul u32 $r1 u32 $r0) $r3 +sched 0x2028042c28042c +set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +$p0 set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +$p3 cvt s32 $r0 neg s32 $r0 +sched 0x2c200428042e04 +$p2 cvt s32 $r1 neg s32 $r1 +ret diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h index d10b6b07693..02c1ec646aa 100644 --- a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h +++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h @@ -1,13 +1,84 @@ +// Assembled from target_lib_nvf0.asm by envyas -m gk110 -W.
+ static const uint64_t nvf0_builtin_code[] = { - 0x19000000001c003cULL, +// DIV U32 +0x08a0a0a010a010a0ULL, +0xe1800000009c000aULL, +0x220000000f9c0808ULL, +0x74000000009fc00eULL, +0xe2400000011c0c0aULL, +0xe6010000009c2806ULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0x08a0a0a0a0a0a0a0ULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0xe1c00000011c040eULL, +0x0810b0a0a010a010ULL, +0xd2000800019c080aULL, +0xe4c03c00001c000eULL, +0xe1c00400011c0002ULL, +0xe6010000009c280aULL, +0xd0000c00001c0406ULL, +0xdb601c00011c041eULL, +0xe088000001000406ULL, +0x0880a0b88010b0a0ULL, +0x4000000000800001ULL, +0xdb601c000100041eULL, +0xe088000001000406ULL, +0x4000000000800001ULL, +0x19000000001c003cULL, +// DIV S32 +0xdb181c007f9c005eULL, +0xdb1a08007f9c047eULL, +0x08a010a010a0a080ULL, +0xe6100000001ce802ULL, +0xe6100000009ce806ULL, +0xe1800000009c000aULL, +0x220000000f9c0808ULL, +0x74000000009fc00eULL, +0xe2400000011c0c0aULL, +0xe6010000009c2806ULL, +0x08a0a0a0a0a0a0a0ULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0xe1c00000011c040eULL, +0x08a0a010a010a0a0ULL, +0xd2000800019c080aULL, +0xe1c00000011c040eULL, +0xd2000800019c080aULL, +0xe4c03c00001c000eULL, +0xe1c00400011c0002ULL, +0xe6010000009c280aULL, +0xd0000c00001c0406ULL, +0x0880a010b0a010b0ULL, +0xdb601c00011c041eULL, +0xe088000001000406ULL, +0x4000000000800001ULL, +0xdb601c000100041eULL, +0xe088000001000406ULL, +0x4000000000800001ULL, +0xe6010000000ce802ULL, +0x08b08010a010b810ULL, +0xe60100000088e806ULL, +0x19000000001c003cULL, }; static const uint16_t nvf0_builtin_offsets[NVC0_BUILTIN_COUNT] = { - 0, - 0, - 0, - 0 + 0x0000, + 0x00f0, + /* Just point at a ret instruction for now (0x00f0 - 8 is the final word of DIV U32). */ + 0x00f0 - 8, + 0x00f0 - 8 }; -- 2.30.2