src/gallium/drivers/cell/ppu/cell_vertex_fetch.c

   1 /*
   2  * (C) Copyright IBM Corporation 2008
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
  19  * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include <inttypes.h>
  26 #include "pipe/p_defines.h"
  27 #include "pipe/p_context.h"
  28 #include "pipe/p_format.h"
  29
  30 #include "../auxiliary/draw/draw_context.h"
  31 #include "../auxiliary/draw/draw_private.h"
  32
  33 #include "cell_context.h"
  34 #include "rtasm/rtasm_ppc_spe.h"
  35
  36
  37 /**
  38  * Emit a 4x4 matrix transpose operation
  39  *
  40  * \param p         Function that the transpose operation is to be appended to
  41  * \param row0      Register containing row 0 of the source matrix
  42  * \param row1      Register containing row 1 of the source matrix
  43  * \param row2      Register containing row 2 of the source matrix
  44  * \param row3      Register containing row 3 of the source matrix
  45  * \param dest_ptr  Register containing the address of the destination matrix
  46  * \param shuf_ptr  Register containing the address of the shuffled data
  47  * \param count     Number of colums to actually be written to the destination
  48  *
  49  * \note
  50  * This function assumes that the registers named by \c row0, \c row1,
  51  * \c row2, and \c row3 are scratch and can be modified by the generated code.
  52  * Furthermore, these registers will be released, via calls to
  53  * \c release_register, by this function.
  54  *
  55  * \note
  56  * This function requires that four temporary are available on entry.
  57  */
  58 static void
  59 emit_matrix_transpose(struct spe_function *p,
  60                       unsigned row0, unsigned row1, unsigned row2,
  61                       unsigned row3, unsigned dest_ptr,
  62                       unsigned shuf_ptr, unsigned count)
  63 {
  64    int shuf_hi = spe_allocate_available_register(p);
  65    int shuf_lo = spe_allocate_available_register(p);
  66    int t1 = spe_allocate_available_register(p);
  67    int t2 = spe_allocate_available_register(p);
  68    int t3;
  69    int t4;
  70    int col0;
  71    int col1;
  72    int col2;
  73    int col3;
  74
  75
  76    spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
  77    spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
  78    spe_shufb(p, t1, row0, row2, shuf_hi);
  79    spe_shufb(p, t2, row0, row2, shuf_lo);
  80
  81
  82    /* row0 and row2 are now no longer needed.  Re-use those registers as
  83     * temporaries.
  84     */
  85    t3 = row0;
  86    t4 = row2;
  87
  88    spe_shufb(p, t3, row1, row3, shuf_hi);
  89    spe_shufb(p, t4, row1, row3, shuf_lo);
  90
  91
  92    /* row1 and row3 are now no longer needed.  Re-use those registers as
  93     * temporaries.
  94     */
  95    col0 = row1;
  96    col1 = row3;
  97
  98    spe_shufb(p, col0, t1, t3, shuf_hi);
  99    if (count > 1) {
 100       spe_shufb(p, col1, t1, t3, shuf_lo);
 101    }
 102
 103    /* t1 and t3 are now no longer needed.  Re-use those registers as
 104     * temporaries.
 105     */
 106    col2 = t1;
 107    col3 = t3;
 108
 109    if (count > 2) {
 110       spe_shufb(p, col2, t2, t4, shuf_hi);
 111    }
 112
 113    if (count > 3) {
 114       spe_shufb(p, col3, t2, t4, shuf_lo);
 115    }
 116
 117
 118    /* Store the results.  Remember that the stqd instruction is encoded using
 119     * the qword offset (stand-alone assemblers to the byte-offset to
 120     * qword-offset conversion for you), so the byte-offset needs be divided by
 121     * 16.
 122     */
 123    switch (count) {
 124    case 4:
 125       spe_stqd(p, col3, dest_ptr, 3 * 16);
 126    case 3:
 127       spe_stqd(p, col2, dest_ptr, 2 * 16);
 128    case 2:
 129       spe_stqd(p, col1, dest_ptr, 1 * 16);
 130    case 1:
 131       spe_stqd(p, col0, dest_ptr, 0 * 16);
 132    }
 133
 134
 135    /* Release all of the temporary registers used.
 136     */
 137    spe_release_register(p, col0);
 138    spe_release_register(p, col1);
 139    spe_release_register(p, col2);
 140    spe_release_register(p, col3);
 141    spe_release_register(p, shuf_hi);
 142    spe_release_register(p, shuf_lo);
 143    spe_release_register(p, t2);
 144    spe_release_register(p, t4);
 145 }
 146
 147
 148 #if 0
 149 /* This appears to not be used currently */
 150 static void
 151 emit_fetch(struct spe_function *p,
 152            unsigned in_ptr, unsigned *offset,
 153            unsigned out_ptr, unsigned shuf_ptr,
 154            enum pipe_format format)
 155 {
 156    const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0)
 157        + (pf_size_z(format) != 0) + (pf_size_w(format) != 0);
 158    const unsigned type = pf_type(format);
 159    const unsigned bytes = pf_size_x(format);
 160
 161    int v0 = spe_allocate_available_register(p);
 162    int v1 = spe_allocate_available_register(p);
 163    int v2 = spe_allocate_available_register(p);
 164    int v3 = spe_allocate_available_register(p);
 165    int tmp = spe_allocate_available_register(p);
 166    int float_zero = -1;
 167    int float_one = -1;
 168    float scale_signed = 0.0;
 169    float scale_unsigned = 0.0;
 170
 171    spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
 172    spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
 173    spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
 174    spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
 175    offset[0] += 4;
 176
 177    switch (bytes) {
 178    case 1:
 179       scale_signed = 1.0f / 127.0f;
 180       scale_unsigned = 1.0f / 255.0f;
 181       spe_lqd(p, tmp, shuf_ptr, 1 * 16);
 182       spe_shufb(p, v0, v0, v0, tmp);
 183       spe_shufb(p, v1, v1, v1, tmp);
 184       spe_shufb(p, v2, v2, v2, tmp);
 185       spe_shufb(p, v3, v3, v3, tmp);
 186       break;
 187    case 2:
 188       scale_signed = 1.0f / 32767.0f;
 189       scale_unsigned = 1.0f / 65535.0f;
 190       spe_lqd(p, tmp, shuf_ptr, 2 * 16);
 191       spe_shufb(p, v0, v0, v0, tmp);
 192       spe_shufb(p, v1, v1, v1, tmp);
 193       spe_shufb(p, v2, v2, v2, tmp);
 194       spe_shufb(p, v3, v3, v3, tmp);
 195       break;
 196    case 4:
 197       scale_signed = 1.0f / 2147483647.0f;
 198       scale_unsigned = 1.0f / 4294967295.0f;
 199       break;
 200    default:
 201       assert(0);
 202       break;
 203    }
 204
 205    switch (type) {
 206    case PIPE_FORMAT_TYPE_FLOAT:
 207       break;
 208    case PIPE_FORMAT_TYPE_UNORM:
 209       spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16);
 210       spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff);
 211       spe_cuflt(p, v0, v0, 0);
 212       spe_fm(p, v0, v0, tmp);
 213       break;
 214    case PIPE_FORMAT_TYPE_SNORM:
 215       spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16);
 216       spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff);
 217       spe_csflt(p, v0, v0, 0);
 218       spe_fm(p, v0, v0, tmp);
 219       break;
 220    case PIPE_FORMAT_TYPE_USCALED:
 221       spe_cuflt(p, v0, v0, 0);
 222       break;
 223    case PIPE_FORMAT_TYPE_SSCALED:
 224       spe_csflt(p, v0, v0, 0);
 225       break;
 226    }
 227
 228
 229    if (count < 4) {
 230       float_one = spe_allocate_available_register(p);
 231       spe_il(p, float_one, 1);
 232       spe_cuflt(p, float_one, float_one, 0);
 233
 234       if (count < 3) {
 235          float_zero = spe_allocate_available_register(p);
 236          spe_il(p, float_zero, 0);
 237       }
 238    }
 239
 240    spe_release_register(p, tmp);
 241
 242    emit_matrix_transpose(p, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
 243
 244    switch (count) {
 245    case 1:
 246       spe_stqd(p, float_zero, out_ptr, 1 * 16);
 247    case 2:
 248       spe_stqd(p, float_zero, out_ptr, 2 * 16);
 249    case 3:
 250       spe_stqd(p, float_one, out_ptr, 3 * 16);
 251    }
 252
 253    if (float_zero != -1) {
 254       spe_release_register(p, float_zero);
 255    }
 256
 257    if (float_one != -1) {
 258       spe_release_register(p, float_one);
 259    }
 260 }
 261 #endif
 262
 263
 264 void cell_update_vertex_fetch(struct draw_context *draw)
 265 {
 266 #if 0
 267    struct cell_context *const cell =
 268        (struct cell_context *) draw->driver_private;
 269    struct spe_function *p = &cell->attrib_fetch;
 270    unsigned function_index[PIPE_MAX_ATTRIBS];
 271    unsigned unique_attr_formats;
 272    int out_ptr;
 273    int in_ptr;
 274    int shuf_ptr;
 275    unsigned i;
 276    unsigned j;
 277
 278
 279    /* Determine how many unique input attribute formats there are.  At the
 280     * same time, store the index of the lowest numbered attribute that has
 281     * the same format as any non-unique format.
 282     */
 283    unique_attr_formats = 1;
 284    function_index[0] = 0;
 285    for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) {
 286       const enum pipe_format curr_fmt = draw->vertex_element[i].src_format;
 287
 288       for (j = 0; j < i; j++) {
 289          if (curr_fmt == draw->vertex_element[j].src_format) {
 290             break;
 291          }
 292       }
 293
 294       if (j == i) {
 295          unique_attr_formats++;
 296       }
 297
 298       function_index[i] = j;
 299    }
 300
 301
 302    /* Each fetch function can be a maximum of 34 instructions (note: this is
 303     * actually a slight over-estimate).
 304     */
 305    spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);
 306
 307
 308    /* Allocate registers for the function's input parameters.
 309     */
 310    out_ptr = spe_allocate_register(p, 3);
 311    in_ptr = spe_allocate_register(p, 4);
 312    shuf_ptr = spe_allocate_register(p, 5);
 313
 314
 315    /* Generate code for the individual attribute fetch functions.
 316     */
 317    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
 318       unsigned offset;
 319
 320       if (function_index[i] == i) {
 321          cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr
 322                                                      - (void *) p->store);
 323
 324          offset = 0;
 325          emit_fetch(p, in_ptr, &offset, out_ptr, shuf_ptr,
 326                     draw->vertex_element[i].src_format);
 327          spe_bi(p, 0, 0, 0);
 328
 329          /* Round up to the next 16-byte boundary.
 330           */
 331          if ((((unsigned) p->store) & 0x0f) != 0) {
 332             const unsigned align = ((unsigned) p->store) & 0x0f;
 333             p->store = (uint32_t *) (((void *) p->store) + align);
 334          }
 335       } else {
 336          /* Use the same function entry-point as a previously seen attribute
 337           * with the same format.
 338           */
 339          cell->attrib_fetch_offsets[i] =
 340              cell->attrib_fetch_offsets[function_index[i]];
 341       }
 342    }
 343 #else
 344    assert(0);
 345 #endif
 346 }