diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
index 45bc93ed782..fbf83a08c94 100644
--- a/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
+++ b/src/gallium/drivers/radeonsi/si_shaderlib_tgsi.c
@@ -61,7 +61,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
 		return NULL;
 
 	/* Tell the shader to load VS inputs from SGPRs: */
-	ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS, vs_blit_property);
+	ureg_property(ureg, TGSI_PROPERTY_VS_BLIT_SGPRS_AMD, vs_blit_property);
 	ureg_property(ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, true);
 
 	/* This is just a pass-through shader with 1-3 MOV instructions. */
@@ -105,9 +105,9 @@ void *si_create_fixed_func_tcs(struct si_context *sctx)
 		return NULL;
 
 	outer = ureg_DECL_system_value(ureg,
-				       TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI, 0);
+				       TGSI_SEMANTIC_TESS_DEFAULT_OUTER_LEVEL, 0);
 	inner = ureg_DECL_system_value(ureg,
-				       TGSI_SEMANTIC_DEFAULT_TESSINNER_SI, 0);
+				       TGSI_SEMANTIC_TESS_DEFAULT_INNER_LEVEL, 0);
 
 	tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0);
 	tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0);
@@ -119,6 +119,190 @@ void *si_create_fixed_func_tcs(struct si_context *sctx)
 	return ureg_create_shader_and_destroy(ureg, &sctx->b);
 }
 
+/* Create a compute shader implementing clear_buffer or copy_buffer. */
+void *si_create_dma_compute_shader(struct pipe_context *ctx,
+				   unsigned num_dwords_per_thread,
+				   bool dst_stream_cache_policy, bool is_copy)
+{
+	struct si_screen *sscreen = (struct si_screen *)ctx->screen;
+	assert(util_is_power_of_two_nonzero(num_dwords_per_thread));
+
+	unsigned store_qualifier = TGSI_MEMORY_COHERENT | TGSI_MEMORY_RESTRICT;
+	if (dst_stream_cache_policy)
+		store_qualifier |= TGSI_MEMORY_STREAM_CACHE_POLICY;
+
+	/* Don't cache loads, because there is no reuse.
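+	 * (STREAM_CACHE_POLICY is the SLC "streaming" bit on AMD hardware,
+	 * so these one-touch loads bypass the L2 cache entirely.)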
*/ + unsigned load_qualifier = store_qualifier | TGSI_MEMORY_STREAM_CACHE_POLICY; + + unsigned num_mem_ops = MAX2(1, num_dwords_per_thread / 4); + unsigned *inst_dwords = alloca(num_mem_ops * sizeof(unsigned)); + + for (unsigned i = 0; i < num_mem_ops; i++) { + if (i*4 < num_dwords_per_thread) + inst_dwords[i] = MIN2(4, num_dwords_per_thread - i*4); + } + + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, sscreen->compute_wave_size); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + struct ureg_src value; + if (!is_copy) { + ureg_property(ureg, TGSI_PROPERTY_CS_USER_DATA_COMPONENTS_AMD, inst_dwords[0]); + value = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_CS_USER_DATA_AMD, 0); + } + + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst store_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst load_addr = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + struct ureg_dst dstbuf = ureg_dst(ureg_DECL_buffer(ureg, 0, false)); + struct ureg_src srcbuf; + struct ureg_src *values = NULL; + + if (is_copy) { + srcbuf = ureg_DECL_buffer(ureg, 1, false); + values = malloc(num_mem_ops * sizeof(struct ureg_src)); + } + + /* If there are multiple stores, the first store writes into 0*wavesize+tid, + * the 2nd store writes into 1*wavesize+tid, the 3rd store writes into 2*wavesize+tid, etc. + */ + ureg_UMAD(ureg, store_addr, blk, + ureg_imm1u(ureg, sscreen->compute_wave_size * num_mem_ops), tid); + /* Convert from a "store size unit" into bytes. */ + ureg_UMUL(ureg, store_addr, ureg_src(store_addr), + ureg_imm1u(ureg, 4 * inst_dwords[0])); + ureg_MOV(ureg, load_addr, ureg_src(store_addr)); + + /* Distance between a load and a store for latency hiding. */ + unsigned load_store_distance = is_copy ? 8 : 0; + + for (unsigned i = 0; i < num_mem_ops + load_store_distance; i++) { + int d = i - load_store_distance; + + if (is_copy && i < num_mem_ops) { + if (i) { + ureg_UADD(ureg, load_addr, ureg_src(load_addr), + ureg_imm1u(ureg, 4 * inst_dwords[i] * + sscreen->compute_wave_size)); + } + + values[i] = ureg_src(ureg_DECL_temporary(ureg)); + struct ureg_dst dst = + ureg_writemask(ureg_dst(values[i]), + u_bit_consecutive(0, inst_dwords[i])); + struct ureg_src srcs[] = {srcbuf, ureg_src(load_addr)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dst, 1, srcs, 2, + load_qualifier, TGSI_TEXTURE_BUFFER, 0); + } + + if (d >= 0) { + if (d) { + ureg_UADD(ureg, store_addr, ureg_src(store_addr), + ureg_imm1u(ureg, 4 * inst_dwords[d] * + sscreen->compute_wave_size)); + } + + struct ureg_dst dst = + ureg_writemask(dstbuf, u_bit_consecutive(0, inst_dwords[d])); + struct ureg_src srcs[] = + {ureg_src(store_addr), is_copy ? values[d] : value}; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dst, 1, srcs, 2, + store_qualifier, TGSI_TEXTURE_BUFFER, 0); + } + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + ureg_free_tokens(state.prog); + + free(values); + return cs; +} + +/* Create a compute shader that copies DCC from one buffer to another + * where each DCC buffer has a different layout. 
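+ * Each thread reads one set of offsets from the remap table and moves two
+ * DCC bytes, so a 64-thread group relocates 128 bytes. The byte mapping
+ * depends on each surface's swizzle mode, which is why the table is
+ * precomputed by the caller rather than derived in the shader.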
+ * + * image[0]: offset remap table (pairs of ), + * 2 pairs are read + * image[1]: DCC source buffer, typed r8_uint + * image[2]: DCC destination buffer, typed r8_uint + */ +void *si_create_dcc_retile_cs(struct pipe_context *ctx) +{ + struct ureg_program *ureg = ureg_create(PIPE_SHADER_COMPUTE); + if (!ureg) + return NULL; + + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH, 64); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT, 1); + ureg_property(ureg, TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH, 1); + + /* Compute the global thread ID (in idx). */ + struct ureg_src tid = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_THREAD_ID, 0); + struct ureg_src blk = ureg_DECL_system_value(ureg, TGSI_SEMANTIC_BLOCK_ID, 0); + struct ureg_dst idx = ureg_writemask(ureg_DECL_temporary(ureg), + TGSI_WRITEMASK_X); + ureg_UMAD(ureg, idx, blk, ureg_imm1u(ureg, 64), tid); + + /* Load 2 pairs of offsets for DCC load & store. */ + struct ureg_src map = ureg_DECL_image(ureg, 0, TGSI_TEXTURE_BUFFER, 0, false, false); + struct ureg_dst offsets = ureg_DECL_temporary(ureg); + struct ureg_src map_load_args[] = {map, ureg_src(idx)}; + + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &offsets, 1, map_load_args, 2, + TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); + + struct ureg_src dcc_src = ureg_DECL_image(ureg, 1, TGSI_TEXTURE_BUFFER, + 0, false, false); + struct ureg_dst dcc_dst = ureg_dst(ureg_DECL_image(ureg, 2, TGSI_TEXTURE_BUFFER, + 0, true, false)); + struct ureg_dst dcc_value[2]; + + /* Copy DCC values: + * dst[offsets.y] = src[offsets.x]; + * dst[offsets.w] = src[offsets.z]; + */ + for (unsigned i = 0; i < 2; i++) { + dcc_value[i] = ureg_writemask(ureg_DECL_temporary(ureg), TGSI_WRITEMASK_X); + + struct ureg_src load_args[] = + {dcc_src, ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_X + i*2)}; + ureg_memory_insn(ureg, TGSI_OPCODE_LOAD, &dcc_value[i], 1, load_args, 2, + TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); + } + + dcc_dst = ureg_writemask(dcc_dst, TGSI_WRITEMASK_X); + + for (unsigned i = 0; i < 2; i++) { + struct ureg_src store_args[] = { + ureg_scalar(ureg_src(offsets), TGSI_SWIZZLE_Y + i*2), + ureg_src(dcc_value[i]) + }; + ureg_memory_insn(ureg, TGSI_OPCODE_STORE, &dcc_dst, 1, store_args, 2, + TGSI_MEMORY_RESTRICT, TGSI_TEXTURE_BUFFER, 0); + } + ureg_END(ureg); + + struct pipe_compute_state state = {}; + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = ureg_get_tokens(ureg, NULL); + + void *cs = ctx->create_compute_state(ctx, &state); + ureg_destroy(ureg); + return cs; +} + /* Create the compute shader that is used to collect the results. * * One compute grid with a single thread is launched for every query result @@ -334,3 +518,374 @@ void *si_create_query_result_cs(struct si_context *sctx) return sctx->b.create_compute_state(&sctx->b, &state); } + +/* Create a compute shader implementing copy_image. + * Luckily, this works with all texture targets except 1D_ARRAY. 
+ */ +void *si_create_copy_image_compute_shader(struct pipe_context *ctx) +{ + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL IMAGE[1], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..4], LOCAL\n" + "IMM[0] UINT32 {8, 1, 0, 0}\n" + "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" + "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" + "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "MOV TEMP[4].xyz, CONST[0][1].xyzw\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[4].xyzx\n" + "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); +} + +void *si_create_copy_image_compute_shader_1d_array(struct pipe_context *ctx) +{ + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL IMAGE[1], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..4], LOCAL\n" + "IMM[0] UINT32 {64, 1, 0, 0}\n" + "MOV TEMP[0].xy, CONST[0][0].xzzw\n" + "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" + "LOAD TEMP[3], IMAGE[0], TEMP[2].xyzx, 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "MOV TEMP[4].xy, CONST[0][1].xzzw\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[4].xyzx\n" + "STORE IMAGE[1], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); +} + +void *si_clear_render_target_shader(struct pipe_context *ctx) +{ + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 8\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 8\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..3], LOCAL\n" + "IMM[0] UINT32 {8, 1, 0, 0}\n" + "MOV TEMP[0].xyz, CONST[0][0].xyzw\n" + "UMAD TEMP[1].xyz, SV[1].xyzz, IMM[0].xxyy, SV[0].xyzz\n" + "UADD TEMP[2].xyz, TEMP[1].xyzx, TEMP[0].xyzx\n" + "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" + "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 2D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); +} + +/* TODO: Didn't really test 1D_ARRAY */ 
+void *si_clear_render_target_shader_1d_array(struct pipe_context *ctx) +{ + static const char text[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 64\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL SV[0], THREAD_ID\n" + "DCL SV[1], BLOCK_ID\n" + "DCL IMAGE[0], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT, WR\n" + "DCL CONST[0][0..1]\n" // 0:xyzw 1:xyzw + "DCL TEMP[0..3], LOCAL\n" + "IMM[0] UINT32 {64, 1, 0, 0}\n" + "MOV TEMP[0].xy, CONST[0][0].xzzw\n" + "UMAD TEMP[1].xy, SV[1].xyzz, IMM[0].xyyy, SV[0].xyzz\n" + "UADD TEMP[2].xy, TEMP[1].xyzx, TEMP[0].xyzx\n" + "MOV TEMP[3].xyzw, CONST[0][1].xyzw\n" + "STORE IMAGE[0], TEMP[2].xyzz, TEMP[3], 1D_ARRAY, PIPE_FORMAT_R32G32B32A32_FLOAT\n" + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {0}; + + if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return ctx->create_compute_state(ctx, &state); +} + +/* Create the compute shader that is used to collect the results of gfx10+ + * shader queries. + * + * One compute grid with a single thread is launched for every query result + * buffer. The thread (optionally) reads a previous summary buffer, then + * accumulates data from the query result buffer, and writes the result either + * to a summary buffer to be consumed by the next grid invocation or to the + * user-supplied buffer. + * + * Data layout: + * + * BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem) + * BUFFER[1] = previous summary buffer + * BUFFER[2] = next summary buffer or user-supplied buffer + * + * CONST + * 0.x = config; the low 3 bits indicate the mode: + * 0: sum up counts + * 1: determine result availability and write it as a boolean + * 2: SO_OVERFLOW + * 3: SO_ANY_OVERFLOW + * the remaining bits form a bitfield: + * 8: write result as a 64-bit value + * 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode + * 0.z = chain bit field: + * 1: have previous summary buffer + * 2: write next summary buffer + * 0.w = result_count + */ +void *gfx10_create_sh_query_result_cs(struct si_context *sctx) +{ + /* TEMP[0].x = accumulated result so far + * TEMP[0].y = result missing + * TEMP[0].z = whether we're in overflow mode + */ + static const char text_tmpl[] = + "COMP\n" + "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" + "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" + "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" + "DCL BUFFER[0]\n" + "DCL BUFFER[1]\n" + "DCL BUFFER[2]\n" + "DCL CONST[0][0..0]\n" + "DCL TEMP[0..5]\n" + "IMM[0] UINT32 {0, 7, 0, 4294967295}\n" + "IMM[1] UINT32 {1, 2, 4, 8}\n" + "IMM[2] UINT32 {16, 32, 64, 128}\n" + + /* + acc_result = 0; + acc_missing = 0; + if (chain & 1) { + acc_result = buffer[1][0]; + acc_missing = buffer[1][1]; + } + */ + "MOV TEMP[0].xy, IMM[0].xxxx\n" + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n" + "ENDIF\n" + + /* + is_overflow (TEMP[0].z) = (config & 7) >= 2; + result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 
0 : result_count; + base_offset (TEMP[1].y) = 0; + for (;;) { + if (!result_remaining) + break; + result_remaining--; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n" + + "AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n" + "UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + + "BGNLOOP\n" + "USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n" + + /* + fence = buffer[0]@(base_offset + 32); + if (!fence) { + acc_missing = ~0u; + break; + } + */ + "UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].yyyy\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].y, TEMP[5].xxxx\n" + "BRK\n" + "ENDIF\n" + + /* + stream_offset (TEMP[2].x) = base_offset + offset; + + if (!(config & 7)) { + acc_result += buffer[0]@stream_offset; + } + */ + "UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n" + + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n" + "UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n" + "ENDIF\n" + + /* + if ((config & 7) >= 2) { + count (TEMP[2].y) = (config & 1) ? 4 : 1; + */ + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n" + "USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n" + "UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n" + + /* + do { + generated = buffer[0]@stream_offset; + emitted = buffer[0]@(stream_offset + 16); + if (generated != emitted) { + acc_result = 1; + result_remaining = 0; + break; + } + + stream_offset += 4; + } while (--count); + */ + "BGNLOOP\n" + "UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n" + "LOAD TEMP[4].x, BUFFER[0], TEMP[2].xxxx\n" + "LOAD TEMP[4].y, BUFFER[0], TEMP[5].xxxx\n" + "USNE TEMP[5], TEMP[4].xxxx, TEMP[4].yyyy\n" + "UIF TEMP[5]\n" + "MOV TEMP[0].x, IMM[1].xxxx\n" + "MOV TEMP[1].y, IMM[0].xxxx\n" + "BRK\n" + "ENDIF\n" + + "UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n" + "USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "BRK\n" + "ENDIF\n" + "UADD TEMP[2].x, TEMP[2].xxxx, IMM[1].zzzz\n" + "ENDLOOP\n" + "ENDIF\n" + + /* + base_offset += 64; + } // end outer loop + */ + "UADD TEMP[1].y, TEMP[1].yyyy, IMM[2].zzzz\n" + "ENDLOOP\n" + + /* + if (chain & 2) { + buffer[2][0] = acc_result; + buffer[2][1] = acc_missing; + } else { + */ + "AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n" + "ELSE\n" + + /* + if ((config & 7) == 1) { + acc_result = acc_missing ? 
0 : 1; + acc_missing = 0; + } + */ + "AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n" + "USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n" + "UIF TEMP[5]\n" + "UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n" + "MOV TEMP[0].y, IMM[0].xxxx\n" + "ENDIF\n" + + /* + if (!acc_missing) { + buffer[2][0] = acc_result; + if (config & 8) + buffer[2][1] = 0; + } + */ + "USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n" + + "AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n" + "UIF TEMP[5]\n" + "STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n" + "ENDIF\n" + "ENDIF\n" + "ENDIF\n" + + "END\n"; + + struct tgsi_token tokens[1024]; + struct pipe_compute_state state = {}; + + if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) { + assert(false); + return NULL; + } + + state.ir_type = PIPE_SHADER_IR_TGSI; + state.prog = tokens; + + return sctx->b.create_compute_state(&sctx->b, &state); +}
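+
+/* Usage note (illustrative, not part of this patch): to sum up counts and
+ * write a 64-bit result with no chaining, the caller would set
+ *
+ *   CONST[0].x (config) = 0 | 8    -- mode 0, plus the "64-bit result" bit
+ *   CONST[0].y (offset) = <byte offset of the counts being summed>
+ *   CONST[0].z (chain)  = 0        -- no previous/next summary buffer
+ *   CONST[0].w (result_count) = <number of 64-byte result records>
+ *
+ * bind the query memory as BUFFER[0] and the output as BUFFER[2], and
+ * launch a single 1x1x1 grid (the shader runs as one thread).
+ */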