src/gallium/drivers/cell/ppu/cell_vertex_fetch.c

   1 /*
   2  * (C) Copyright IBM Corporation 2008
   3  * All Rights Reserved.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * on the rights to use, copy, modify, merge, publish, distribute, sub
   9  * license, and/or sell copies of the Software, and to permit persons to whom
  10  * the Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
  19  * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  20  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  21  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  22  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  23  */
  24
  25 #include <inttypes.h>
  26 #include "pipe/p_defines.h"
  27 #include "pipe/p_context.h"
  28 #include "pipe/p_format.h"
  29
  30 #include "../auxiliary/draw/draw_context.h"
  31 #include "../auxiliary/draw/draw_private.h"
  32
  33 #include "cell_context.h"
  34 #include "ppc/rtasm/spe_asm.h"
  35
  36 typedef uint64_t register_mask;
  37
  38 int allocate_available_register(register_mask *m)
  39 {
  40    unsigned i;
  41    for (i = 0; i < 64; i++) {
  42       const uint64_t mask = (1ULL << i);
  43
  44       if ((m[0] & mask) != 0) {
  45          m[0] &= ~mask;
  46          return i;
  47       }
  48    }
  49
  50    return -1;
  51 }
  52
  53
  54 int allocate_register(register_mask *m, unsigned reg)
  55 {
  56    assert((m[0] & (1ULL << reg)) != 0);
  57
  58    m[0] &= ~(1ULL << reg);
  59    return reg;
  60 }
  61
  62
  63 void release_register(register_mask *m, unsigned reg)
  64 {
  65    assert((m[0] & (1ULL << reg)) == 0);
  66
  67    m[0] |= (1ULL << reg);
  68 }
  69
  70
  71 /**
  72  * Emit a 4x4 matrix transpose operation
  73  *
  74  * \param p         Function that the transpose operation is to be appended to
  75  * \param m         Live register mask
  76  * \param row0      Register containing row 0 of the source matrix
  77  * \param row1      Register containing row 1 of the source matrix
  78  * \param row2      Register containing row 2 of the source matrix
  79  * \param row3      Register containing row 3 of the source matrix
  80  * \param dest_ptr  Register containing the address of the destination matrix
  81  * \param shuf_ptr  Register containing the address of the shuffled data
  82  * \param count     Number of colums to actually be written to the destination
  83  *
  84  * \note
  85  * This function assumes that the registers named by \c row0, \c row1,
  86  * \c row2, and \c row3 are scratch and can be modified by the generated code.
  87  * Furthermore, these registers will be released, via calls to
  88  * \c release_register, by this function.
  89  *
  90  * \note
  91  * This function requires that four temporary are available on entry.
  92  */
  93 static void
  94 emit_matrix_transpose(struct spe_function *p, register_mask *m,
  95                       unsigned row0, unsigned row1, unsigned row2,
  96                       unsigned row3, unsigned dest_ptr,
  97                       unsigned shuf_ptr, unsigned count)
  98 {
  99    int shuf_hi = allocate_available_register(m);
 100    int shuf_lo = allocate_available_register(m);
 101    int t1 = allocate_available_register(m);
 102    int t2 = allocate_available_register(m);
 103    int t3;
 104    int t4;
 105    int col0;
 106    int col1;
 107    int col2;
 108    int col3;
 109
 110
 111    spe_lqd(p, shuf_hi, shuf_ptr, 3);
 112    spe_lqd(p, shuf_lo, shuf_ptr, 4);
 113    spe_shufb(p, t1, row0, row2, shuf_hi);
 114    spe_shufb(p, t2, row0, row2, shuf_lo);
 115
 116
 117    /* row0 and row2 are now no longer needed.  Re-use those registers as
 118     * temporaries.
 119     */
 120    t3 = row0;
 121    t4 = row2;
 122
 123    spe_shufb(p, t3, row1, row3, shuf_hi);
 124    spe_shufb(p, t4, row1, row3, shuf_lo);
 125
 126
 127    /* row1 and row3 are now no longer needed.  Re-use those registers as
 128     * temporaries.
 129     */
 130    col0 = row1;
 131    col1 = row3;
 132
 133    spe_shufb(p, col0, t1, t3, shuf_hi);
 134    if (count > 1) {
 135       spe_shufb(p, col1, t1, t3, shuf_lo);
 136    }
 137
 138    /* t1 and t3 are now no longer needed.  Re-use those registers as
 139     * temporaries.
 140     */
 141    col2 = t1;
 142    col3 = t3;
 143
 144    if (count > 2) {
 145       spe_shufb(p, col2, t2, t4, shuf_hi);
 146    }
 147
 148    if (count > 3) {
 149       spe_shufb(p, col3, t2, t4, shuf_lo);
 150    }
 151
 152
 153    /* Store the results.  Remember that the stqd instruction is encoded using
 154     * the qword offset (stand-alone assemblers to the byte-offset to
 155     * qword-offset conversion for you), so the byte-offset needs be divided by
 156     * 16.
 157     */
 158    switch (count) {
 159    case 4:
 160       spe_stqd(p, col3, dest_ptr, 3);
 161    case 3:
 162       spe_stqd(p, col2, dest_ptr, 2);
 163    case 2:
 164       spe_stqd(p, col1, dest_ptr, 1);
 165    case 1:
 166       spe_stqd(p, col0, dest_ptr, 0);
 167    }
 168
 169
 170    /* Release all of the temporary registers used.
 171     */
 172    release_register(m, col0);
 173    release_register(m, col1);
 174    release_register(m, col2);
 175    release_register(m, col3);
 176    release_register(m, shuf_hi);
 177    release_register(m, shuf_lo);
 178    release_register(m, t2);
 179    release_register(m, t4);
 180 }
 181
 182
 183 static void
 184 emit_fetch(struct spe_function *p, register_mask *m,
 185            unsigned in_ptr, unsigned *offset,
 186            unsigned out_ptr, unsigned shuf_ptr,
 187            enum pipe_format format)
 188 {
 189    const unsigned count = (pf_size_x(format) != 0) + (pf_size_y(format) != 0)
 190        + (pf_size_z(format) != 0) + (pf_size_w(format) != 0);
 191    const unsigned type = pf_type(format);
 192    const unsigned bytes = pf_size_x(format);
 193
 194    int v0 = allocate_available_register(m);
 195    int v1 = allocate_available_register(m);
 196    int v2 = allocate_available_register(m);
 197    int v3 = allocate_available_register(m);
 198    int tmp = allocate_available_register(m);
 199    int float_zero = -1;
 200    int float_one = -1;
 201    float scale_signed = 0.0;
 202    float scale_unsigned = 0.0;
 203
 204    spe_lqd(p, v0, in_ptr, 0 + offset[0]);
 205    spe_lqd(p, v1, in_ptr, 1 + offset[0]);
 206    spe_lqd(p, v2, in_ptr, 2 + offset[0]);
 207    spe_lqd(p, v3, in_ptr, 3 + offset[0]);
 208    offset[0] += 4;
 209
 210    switch (bytes) {
 211    case 1:
 212       scale_signed = 1.0f / 127.0f;
 213       scale_unsigned = 1.0f / 255.0f;
 214       spe_lqd(p, tmp, shuf_ptr, 1);
 215       spe_shufb(p, v0, v0, v0, tmp);
 216       spe_shufb(p, v1, v1, v1, tmp);
 217       spe_shufb(p, v2, v2, v2, tmp);
 218       spe_shufb(p, v3, v3, v3, tmp);
 219       break;
 220    case 2:
 221       scale_signed = 1.0f / 32767.0f;
 222       scale_unsigned = 1.0f / 65535.0f;
 223       spe_lqd(p, tmp, shuf_ptr, 2);
 224       spe_shufb(p, v0, v0, v0, tmp);
 225       spe_shufb(p, v1, v1, v1, tmp);
 226       spe_shufb(p, v2, v2, v2, tmp);
 227       spe_shufb(p, v3, v3, v3, tmp);
 228       break;
 229    case 4:
 230       scale_signed = 1.0f / 2147483647.0f;
 231       scale_unsigned = 1.0f / 4294967295.0f;
 232       break;
 233    default:
 234       assert(0);
 235       break;
 236    }
 237
 238    switch (type) {
 239    case PIPE_FORMAT_TYPE_FLOAT:
 240       break;
 241    case PIPE_FORMAT_TYPE_UNORM:
 242       spe_ilhu(p, tmp, ((unsigned) scale_unsigned) >> 16);
 243       spe_iohl(p, tmp, ((unsigned) scale_unsigned) & 0x0ffff);
 244       spe_cuflt(p, v0, v0, 0);
 245       spe_fm(p, v0, v0, tmp);
 246       break;
 247    case PIPE_FORMAT_TYPE_SNORM:
 248       spe_ilhu(p, tmp, ((unsigned) scale_signed) >> 16);
 249       spe_iohl(p, tmp, ((unsigned) scale_signed) & 0x0ffff);
 250       spe_csflt(p, v0, v0, 0);
 251       spe_fm(p, v0, v0, tmp);
 252       break;
 253    case PIPE_FORMAT_TYPE_USCALED:
 254       spe_cuflt(p, v0, v0, 0);
 255       break;
 256    case PIPE_FORMAT_TYPE_SSCALED:
 257       spe_csflt(p, v0, v0, 0);
 258       break;
 259    }
 260
 261
 262    if (count < 4) {
 263       float_one = allocate_available_register(m);
 264       spe_il(p, float_one, 1);
 265       spe_cuflt(p, float_one, float_one, 0);
 266
 267       if (count < 3) {
 268          float_zero = allocate_available_register(m);
 269          spe_il(p, float_zero, 0);
 270       }
 271    }
 272
 273    release_register(m, tmp);
 274
 275    emit_matrix_transpose(p, m, v0, v1, v2, v3, out_ptr, shuf_ptr, count);
 276
 277    switch (count) {
 278    case 1:
 279       spe_stqd(p, float_zero, out_ptr, 1);
 280    case 2:
 281       spe_stqd(p, float_zero, out_ptr, 2);
 282    case 3:
 283       spe_stqd(p, float_one, out_ptr, 3);
 284    }
 285
 286    if (float_zero != -1) {
 287       release_register(m, float_zero);
 288    }
 289
 290    if (float_one != -1) {
 291       release_register(m, float_one);
 292    }
 293 }
 294
 295
 296 void cell_update_vertex_fetch(struct draw_context *draw)
 297 {
 298    struct cell_context *const cell =
 299        (struct cell_context *) draw->driver_private;
 300    register_mask m = ~0;
 301    struct spe_function *p = &cell->attrib_fetch;
 302    unsigned function_index[PIPE_ATTRIB_MAX];
 303    unsigned unique_attr_formats;
 304    int out_ptr;
 305    int in_ptr;
 306    int shuf_ptr;
 307    unsigned i;
 308    unsigned j;
 309
 310
 311    /* Determine how many unique input attribute formats there are.  At the
 312     * same time, store the index of the lowest numbered attribute that has
 313     * the same format as any non-unique format.
 314     */
 315    unique_attr_formats = 1;
 316    function_index[0] = 0;
 317    for (i = 1; i < draw->vertex_fetch.nr_attrs; i++) {
 318       const enum pipe_format curr_fmt = draw->vertex_element[i].src_format;
 319
 320       for (j = 0; j < i; j++) {
 321          if (curr_fmt == draw->vertex_element[j].src_format) {
 322             break;
 323          }
 324       }
 325
 326       if (j == i) {
 327          unique_attr_formats++;
 328       }
 329
 330       function_index[i] = j;
 331    }
 332
 333
 334    /* Each fetch function can be a maximum of 34 instructions (note: this is
 335     * actually a slight over-estimate).  That means (34 * 4) = 136 bytes
 336     * each maximum.
 337     */
 338    spe_init_func(p, 136 * unique_attr_formats);
 339
 340
 341    /* Registers 0, 1, and 2 are reserved by the ABI.
 342     */
 343    allocate_register(&m, 0);
 344    allocate_register(&m, 1);
 345    allocate_register(&m, 2);
 346
 347
 348    /* Allocate registers for the function's input parameters.
 349     */
 350    out_ptr = allocate_register(&m, 3);
 351    in_ptr = allocate_register(&m, 4);
 352    shuf_ptr = allocate_register(&m, 5);
 353
 354
 355    /* Generate code for the individual attribute fetch functions.
 356     */
 357    for (i = 0; i < draw->vertex_fetch.nr_attrs; i++) {
 358       unsigned offset;
 359
 360       if (function_index[i] == i) {
 361          cell->attrib_fetch_offsets[i] = (unsigned) ((void *) p->csr
 362                                                      - (void *) p->store);
 363
 364          offset = 0;
 365          emit_fetch(p, & m, in_ptr, &offset, out_ptr, shuf_ptr,
 366                     draw->vertex_element[i].src_format);
 367          spe_bi(p, 0, 0, 0);
 368
 369          /* Round up to the next 16-byte boundary.
 370           */
 371          if ((((unsigned) p->store) & 0x0f) != 0) {
 372             const unsigned align = ((unsigned) p->store) & 0x0f;
 373             p->store = (uint32_t *) (((void *) p->store) + align);
 374          }
 375       } else {
 376          /* Use the same function entry-point as a previously seen attribute
 377           * with the same format.
 378           */
 379          cell->attrib_fetch_offsets[i] =
 380              cell->attrib_fetch_offsets[function_index[i]];
 381       }
 382    }
 383 }