nvc0: change prefix of MP performance counters to HW_SM
src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
#ifndef __NVC0_SCREEN_H__
#define __NVC0_SCREEN_H__

#include "nouveau_screen.h"
#include "nouveau_mm.h"
#include "nouveau_fence.h"
#include "nouveau_heap.h"

#include "nv_object.xml.h"

#include "nvc0/nvc0_winsys.h"
#include "nvc0/nvc0_stateobj.h"

#define NVC0_TIC_MAX_ENTRIES 2048
#define NVC0_TSC_MAX_ENTRIES 2048

/* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
#define NVC0_MAX_PIPE_CONSTBUFS 14
#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE 7

#define NVC0_MAX_SURFACE_SLOTS 16

#define NVC0_MAX_VIEWPORTS 16


struct nvc0_context;

struct nvc0_blitter;

struct nvc0_graph_state {
   bool flushed;
   bool rasterizer_discard;
   bool early_z_forced;
   bool prim_restart;
   uint32_t instance_elts; /* bitmask of per-instance elements */
   uint32_t instance_base;
   uint32_t constant_vbos;
   uint32_t constant_elts;
   int32_t index_bias;
   uint16_t scissor;
   uint8_t patch_vertices;
   uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
   uint8_t num_vtxbufs;
   uint8_t num_vtxelts;
   uint8_t num_textures[6];
   uint8_t num_samplers[6];
   uint8_t tls_required; /* bitmask of shader types using l[] */
   uint8_t c14_bound; /* whether immediate array constbuf is bound */
   uint8_t clip_enable;
   uint32_t clip_mode;
   uint32_t uniform_buffer_bound[5];
   struct nvc0_transform_feedback_state *tfb;
};

struct nvc0_screen {
   struct nouveau_screen base;

   struct nvc0_context *cur_ctx;
   struct nvc0_graph_state save_state;

   int num_occlusion_queries_active;

   struct nouveau_bo *text;
   struct nouveau_bo *parm; /* for COMPUTE */
   struct nouveau_bo *uniform_bo; /* for 3D */
   struct nouveau_bo *tls;
   struct nouveau_bo *txc; /* TIC (offset 0) and TSC (65536) */
   struct nouveau_bo *poly_cache;

   uint16_t mp_count;
   uint16_t mp_count_compute; /* magic reg can make compute use fewer MPs */

   struct nouveau_heap *text_heap;
   struct nouveau_heap *lib_code; /* allocated from text_heap */

   struct nvc0_blitter *blitter;

   struct {
      void **entries;
      int next;
      uint32_t lock[NVC0_TIC_MAX_ENTRIES / 32];
   } tic;

   struct {
      void **entries;
      int next;
      uint32_t lock[NVC0_TSC_MAX_ENTRIES / 32];
   } tsc;

   struct {
      struct nouveau_bo *bo;
      uint32_t *map;
   } fence;

   struct {
      struct nvc0_program *prog; /* compute state object to read MP counters */
      struct pipe_query *mp_counter[8]; /* counter to query allocation */
      uint8_t num_hw_sm_active[2];
      bool mp_counters_enabled;
   } pm;

   struct nouveau_object *eng3d; /* sqrt(1/2)|kepler> + sqrt(1/2)|fermi> */
   struct nouveau_object *eng2d;
   struct nouveau_object *m2mf;
   struct nouveau_object *compute;
   struct nouveau_object *nvsw;
};

static inline struct nvc0_screen *
nvc0_screen(struct pipe_screen *screen)
{
   return (struct nvc0_screen *)screen;
}

/*
 * Performance counter groups:
 */
#define NVC0_QUERY_MP_COUNTER_GROUP 0
#define NVC0_QUERY_DRV_STAT_GROUP   1

/* Performance counter queries:
 */
#define NVE4_HW_SM_QUERY(i)   (PIPE_QUERY_DRIVER_SPECIFIC + (i))
#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1)
enum nve4_pm_queries
{
   NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0,
   NVE4_HW_SM_QUERY_ACTIVE_WARPS,
   NVE4_HW_SM_QUERY_ATOM_COUNT,
   NVE4_HW_SM_QUERY_BRANCH,
   NVE4_HW_SM_QUERY_DIVERGENT_BRANCH,
   NVE4_HW_SM_QUERY_GLD_REQUEST,
   NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
   NVE4_HW_SM_QUERY_GST_TRANSACTIONS,
   NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
   NVE4_HW_SM_QUERY_GRED_COUNT,
   NVE4_HW_SM_QUERY_GST_REQUEST,
   NVE4_HW_SM_QUERY_INST_EXECUTED,
   NVE4_HW_SM_QUERY_INST_ISSUED,
   NVE4_HW_SM_QUERY_INST_ISSUED1,
   NVE4_HW_SM_QUERY_INST_ISSUED2,
   NVE4_HW_SM_QUERY_L1_GLD_HIT,
   NVE4_HW_SM_QUERY_L1_GLD_MISS,
   NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT,
   NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS,
   NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT,
   NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS,
   NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
   NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
   NVE4_HW_SM_QUERY_LOCAL_LD,
   NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
   NVE4_HW_SM_QUERY_LOCAL_ST,
   NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_0,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_1,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_2,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_3,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_4,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_5,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_6,
   NVE4_HW_SM_QUERY_PROF_TRIGGER_7,
   NVE4_HW_SM_QUERY_SHARED_LD,
   NVE4_HW_SM_QUERY_SHARED_LD_REPLAY,
   NVE4_HW_SM_QUERY_SHARED_ST,
   NVE4_HW_SM_QUERY_SHARED_ST_REPLAY,
   NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED,
   NVE4_HW_SM_QUERY_THREADS_LAUNCHED,
   NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
   NVE4_HW_SM_QUERY_WARPS_LAUNCHED,
   NVE4_HW_SM_QUERY_METRIC_IPC,
   NVE4_HW_SM_QUERY_METRIC_IPAC,
   NVE4_HW_SM_QUERY_METRIC_IPEC,
   NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY,
   NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY,
   NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD,
   NVE4_HW_SM_QUERY_COUNT
};
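
/*
 * Illustrative sketch (editor's note, not part of the driver): the
 * NVE4_HW_SM_QUERY(i) macro simply offsets an enum value into the
 * PIPE_QUERY_DRIVER_SPECIFIC range, so the pipe query type for a given
 * counter can be derived directly from the enum:
 *
 *    unsigned q = NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_BRANCH);
 *    // == PIPE_QUERY_DRIVER_SPECIFIC + 3, since BRANCH is the fourth entry
 *
 * Recovering the enum index from a query type is just the reverse:
 * q - PIPE_QUERY_DRIVER_SPECIFIC.
 */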

#define NVC0_HW_SM_QUERY(i)   (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))
#define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1)
enum nvc0_pm_queries
{
   NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0,
   NVC0_HW_SM_QUERY_ACTIVE_WARPS,
   NVC0_HW_SM_QUERY_ATOM_COUNT,
   NVC0_HW_SM_QUERY_BRANCH,
   NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
   NVC0_HW_SM_QUERY_GLD_REQUEST,
   NVC0_HW_SM_QUERY_GRED_COUNT,
   NVC0_HW_SM_QUERY_GST_REQUEST,
   NVC0_HW_SM_QUERY_INST_EXECUTED,
   NVC0_HW_SM_QUERY_INST_ISSUED1_0,
   NVC0_HW_SM_QUERY_INST_ISSUED1_1,
   NVC0_HW_SM_QUERY_INST_ISSUED2_0,
   NVC0_HW_SM_QUERY_INST_ISSUED2_1,
   NVC0_HW_SM_QUERY_LOCAL_LD,
   NVC0_HW_SM_QUERY_LOCAL_ST,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
   NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
   NVC0_HW_SM_QUERY_SHARED_LD,
   NVC0_HW_SM_QUERY_SHARED_ST,
   NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
   NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
   NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
   NVC0_HW_SM_QUERY_COUNT
};
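
/*
 * Illustrative note (editor's sketch): the three driver-specific query
 * families are kept disjoint by their offsets into the
 * PIPE_QUERY_DRIVER_SPECIFIC space: NVE4 SM counters start at +0, driver
 * statistics at +1024 (see below) and NVC0 SM counters at +2048. A query
 * type could therefore be classified roughly like this (helper name is
 * hypothetical, not part of this header):
 *
 *    static inline bool
 *    query_is_hw_sm(unsigned type)
 *    {
 *       return (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) ||
 *              (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST);
 *    }
 */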

/* Driver statistics queries:
 */
#define NVC0_QUERY_DRV_STAT(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i))
#define NVC0_QUERY_DRV_STAT_LAST  NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1)
enum nvc0_drv_stats_queries
{
#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
   NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0,
   NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES,
   NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT,
   NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID,
   NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS,
   NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ,
   NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE,
   NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT,
   NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT,
   NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT,
   NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ,
   NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE,
   NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID,
   NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT,
   NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID,
   NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS,
   NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES,
   NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT,
   NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT,
   NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT,
   NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT,
   NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY,
   NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED,
   NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT,
   NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES,
   NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT,
   NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES,
   NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT,
   NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT,
#endif
   NVC0_QUERY_DRV_STAT_COUNT
};

int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
                                      struct pipe_driver_query_info *);

int nvc0_screen_get_driver_query_group_info(struct pipe_screen *, unsigned,
                                            struct pipe_driver_query_group_info *);

bool nvc0_blitter_create(struct nvc0_screen *);
void nvc0_blitter_destroy(struct nvc0_screen *);

void nvc0_screen_make_buffers_resident(struct nvc0_screen *);

int nvc0_screen_tic_alloc(struct nvc0_screen *, void *);
int nvc0_screen_tsc_alloc(struct nvc0_screen *, void *);

int nve4_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);
int nvc0_screen_compute_setup(struct nvc0_screen *, struct nouveau_pushbuf *);

bool nvc0_screen_resize_tls_area(struct nvc0_screen *, uint32_t lpos,
                                 uint32_t lneg, uint32_t cstack);

static inline void
nvc0_resource_fence(struct nv04_resource *res, uint32_t flags)
{
   struct nvc0_screen *screen = nvc0_screen(res->base.screen);

   if (res->mm) {
      nouveau_fence_ref(screen->base.fence.current, &res->fence);
      if (flags & NOUVEAU_BO_WR)
         nouveau_fence_ref(screen->base.fence.current, &res->fence_wr);
   }
}

static inline void
nvc0_resource_validate(struct nv04_resource *res, uint32_t flags)
{
   if (likely(res->bo)) {
      if (flags & NOUVEAU_BO_WR)
         res->status |= NOUVEAU_BUFFER_STATUS_GPU_WRITING |
            NOUVEAU_BUFFER_STATUS_DIRTY;
      if (flags & NOUVEAU_BO_RD)
         res->status |= NOUVEAU_BUFFER_STATUS_GPU_READING;

      nvc0_resource_fence(res, flags);
   }
}
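
/*
 * Illustrative usage sketch (editor's note, assumed call site rather than
 * one taken from this file): a resource about to be written by the GPU
 * would be validated with NOUVEAU_BO_WR, which marks it dirty/GPU-writing
 * and references the current fence for both read and write tracking:
 *
 *    struct nv04_resource *res = ...;   // buffer bound for GPU writes
 *    nvc0_resource_validate(res, NOUVEAU_BO_WR);
 *
 * Read-only bindings would pass NOUVEAU_BO_RD instead, which only sets
 * NOUVEAU_BUFFER_STATUS_GPU_READING before fencing.
 */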

struct nvc0_format {
   uint32_t rt;
   uint32_t tic;
   uint32_t vtx;
   uint32_t usage;
};

extern const struct nvc0_format nvc0_format_table[];

static inline void
nvc0_screen_tic_unlock(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
{
   if (tic->id >= 0)
      screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
}

static inline void
nvc0_screen_tsc_unlock(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
{
   if (tsc->id >= 0)
      screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
}

static inline void
nvc0_screen_tic_free(struct nvc0_screen *screen, struct nv50_tic_entry *tic)
{
   if (tic->id >= 0) {
      screen->tic.entries[tic->id] = NULL;
      screen->tic.lock[tic->id / 32] &= ~(1 << (tic->id % 32));
   }
}

static inline void
nvc0_screen_tsc_free(struct nvc0_screen *screen, struct nv50_tsc_entry *tsc)
{
   if (tsc->id >= 0) {
      screen->tsc.entries[tsc->id] = NULL;
      screen->tsc.lock[tsc->id / 32] &= ~(1 << (tsc->id % 32));
   }
}
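
/*
 * Illustrative note (editor's sketch): tic.lock and tsc.lock are bitmasks
 * with one bit per TIC/TSC slot (NVC0_TIC_MAX_ENTRIES / 32 words of 32 bits
 * each); the unlock/free helpers above clear the bit belonging to a given
 * entry. For example, an entry with id 37 lives in word 37 / 32 == 1 at bit
 * 37 % 32 == 5, so unlocking it clears bit 5 of lock[1].
 */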

#endif