#define NSEC_PER_SEC 1000000000ull
 #define WAIT_TIMEOUT 5
 
-/* It seems like sample counts need to be copied over to 16-byte aligned
- * memory. */
+/* Depending on the query type, a slot may hold one or two 64-bit values,
+ * e.g. for VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+ *   values[0]: primitives written, values[1]: primitives generated.
+ * Two values also keep each slot_value at 16 bytes, the same size as
+ * before.
+ */
 struct PACKED slot_value {
-   uint64_t value;
-   uint64_t __padding;
+   uint64_t values[2];
 };
 
 struct PACKED query_slot {
 
 /* Returns the IOVA of a given uint64_t field in a given slot of a query
  * pool. */
-#define query_iova(type, pool, query, field)                         \
+#define query_iova(type, pool, query, field, value_index)            \
    pool->bo.iova + pool->stride * query + offsetof(type, field) +    \
-         offsetof(struct slot_value, value)
+         offsetof(struct slot_value, values[value_index])
 
 #define occlusion_query_iova(pool, query, field)                     \
-   query_iova(struct occlusion_query_slot, pool, query, field)
+   query_iova(struct occlusion_query_slot, pool, query, field, 0)
 
 #define query_available_iova(pool, query)                            \
-   query_iova(struct query_slot, pool, query, available)
+   query_iova(struct query_slot, pool, query, available, 0)
 
-#define query_result_iova(pool, query)                               \
-   query_iova(struct query_slot, pool, query, result)
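+/* e.g. query_result_iova(pool, q, 1) resolves to
+ *   pool->bo.iova + pool->stride * q + offsetof(struct query_slot, result) + 8,
+ * since offsetof(struct slot_value, values[1]) == sizeof(uint64_t).
+ */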
+#define query_result_iova(pool, query, i)                            \
+   query_iova(struct query_slot, pool, query, result, i)
 
-#define query_is_available(slot) slot->available.value
+#define query_is_available(slot) slot->available.values[0]
 
 /*
  * Returns a pointer to a given slot in a query pool.
    vk_free2(&device->alloc, pAllocator, pool);
 }
 
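+/* Number of integer result values a query of the pool's type writes.
+ * In the data returned to the user, each query's results occupy
+ * indices 0..result_count-1, with the availability value (when
+ * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set) at index result_count. */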
+static uint32_t
+get_result_count(struct tu_query_pool *pool)
+{
+   switch (pool->type) {
+   /* Occlusion and timestamp queries write one integer value */
+   case VK_QUERY_TYPE_OCCLUSION:
+   case VK_QUERY_TYPE_TIMESTAMP:
+      return 1;
+   /* Transform feedback queries write two integer values */
+   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
+      return 2;
+   default:
+      assert(!"Invalid query type");
+      return 0;
+   }
+}
+
 /* Wait on the availability status of a query up until a timeout. */
 static VkResult
 wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
       uint32_t query = firstQuery + i;
       struct query_slot *slot = slot_address(pool, query);
       bool available = query_is_available(slot);
+      uint32_t result_count = get_result_count(pool);
+
       if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
          VkResult wait_result = wait_for_available(device, pool, query);
          if (wait_result != VK_SUCCESS)
          }
       }
 
-      if (available)
-         write_query_value_cpu(result_base, 0, slot->result.value, flags);
-      else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
-          /* From the Vulkan 1.1.130 spec:
-           *
-           *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
-           *   is not set, and the query’s status is unavailable, an
-           *   intermediate result value between zero and the final result
-           *   value is written to pData for that query.
-           *
-           * Just return 0 here for simplicity since it's a valid result.
-           */
-         write_query_value_cpu(result_base, 0, 0, flags);
+      for (uint32_t k = 0; k < result_count; k++) {
+         if (available)
+            write_query_value_cpu(result_base, k, slot->result.values[k], flags);
+         else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
+             /* From the Vulkan 1.1.130 spec:
+              *
+              *   If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
+              *   is not set, and the query’s status is unavailable, an
+              *   intermediate result value between zero and the final result
+              *   value is written to pData for that query.
+              *
+              * Just return 0 here for simplicity since it's a valid result.
+              */
+            write_query_value_cpu(result_base, k, 0, flags);
+      }
 
       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
          /* From the Vulkan 1.1.130 spec:
           *    integer value written for each query is non-zero if the query’s
           *    status was available or zero if the status was unavailable.
           */
-         write_query_value_cpu(result_base, 1, available, flags);
+         write_query_value_cpu(result_base, result_count, available, flags);
 
       result_base += stride;
    }
    for (uint32_t i = 0; i < queryCount; i++) {
       uint32_t query = firstQuery + i;
       uint64_t available_iova = query_available_iova(pool, query);
-      uint64_t result_iova = query_result_iova(pool, query);
       uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
+      uint32_t result_count = get_result_count(pool);
 
       /* Wait for the available bit to be set if executed with the
        * VK_QUERY_RESULT_WAIT_BIT flag. */
          tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
       }
 
-      if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
-         /* Unconditionally copying the bo->result into the buffer here is
-          * valid because we only set bo->result on vkCmdEndQuery. Thus, even
-          * if the query is unavailable, this will copy the correct partial
-          * value of 0.
-          */
-         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
-                              0 /* offset */, flags);
-      } else {
-         /* Conditionally copy bo->result into the buffer based on whether the
-          * query is available.
-          *
-          * NOTE: For the conditional packets to be executed, CP_COND_EXEC
-          * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
-          * that 0 < available < 2, aka available == 1.
-          */
-         tu_cs_reserve(cs, 7 + 6);
-         tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
-         tu_cs_emit_qw(cs, available_iova);
-         tu_cs_emit_qw(cs, available_iova);
-         tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
-         tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
-
-         /* Start of conditional execution */
-         copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
-                              0 /* offset */, flags);
-         /* End of conditional execution */
+      for (uint32_t k = 0; k < result_count; k++) {
+         uint64_t result_iova = query_result_iova(pool, query, k);
+
+         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
+            /* Unconditionally copying the bo->result into the buffer here is
+             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
+             * if the query is unavailable, this will copy the correct partial
+             * value of 0.
+             */
+            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
+                                 k /* offset */, flags);
+         } else {
+            /* Conditionally copy bo->result into the buffer based on whether the
+             * query is available.
+             *
+             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
+             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
+             * that 0 < available < 2, aka available == 1.
+             */
+            tu_cs_reserve(cs, 7 + 6);
+            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
+            tu_cs_emit_qw(cs, available_iova);
+            tu_cs_emit_qw(cs, available_iova);
+            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
+            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */
+
+            /* Start of conditional execution */
+            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
+                                 k /* offset */, flags);
+            /* End of conditional execution */
+         }
       }
 
       if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
          copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
-                              1 /* offset */, flags);
+                              result_count /* offset */, flags);
       }
    }
 
       tu_cs_emit_qw(cs, 0x0);
 
       tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
-      tu_cs_emit_qw(cs, query_result_iova(pool, query));
+      tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
+      tu_cs_emit_qw(cs, 0x0);
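+      /* Also clear values[1]; every slot reserves space for two values,
+       * so this is safe even for query types that only write values[0]. */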
+      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
+      tu_cs_emit_qw(cs, query_result_iova(pool, query, 1));
       tu_cs_emit_qw(cs, 0x0);
    }
 }
    uint64_t available_iova = query_available_iova(pool, query);
    uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
    uint64_t end_iova = occlusion_query_iova(pool, query, end);
-   uint64_t result_iova = query_result_iova(pool, query);
+   uint64_t result_iova = query_result_iova(pool, query, 0);
    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
    tu_cs_emit_qw(cs, end_iova);
    tu_cs_emit_qw(cs, 0xffffffffffffffffull);
    tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                   CP_REG_TO_MEM_0_CNT(2) |
                   CP_REG_TO_MEM_0_64B);
-   tu_cs_emit_qw(cs, query_result_iova(pool, query));
+   tu_cs_emit_qw(cs, query_result_iova(pool, query, 0));
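+   /* CNT(2) with the 64B flag stores the LO/HI register pair as one
+    * 64-bit value at values[0]; timestamps only report a single value. */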
 
    tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
    tu_cs_emit_qw(cs, query_available_iova(pool, query));