/*
 * Copyright (c) 2014-2015 Advanced Micro Devices, Inc.
 * All rights reserved.
 *
 * For use for simulation and test purposes only
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Author: John Kalamatianos, Sooraj Puthoor
 */
#include "gpu-compute/global_memory_pipeline.hh"

#include "debug/GPUMem.hh"
#include "debug/GPUReg.hh"
#include "gpu-compute/compute_unit.hh"
#include "gpu-compute/gpu_dyn_inst.hh"
#include "gpu-compute/shader.hh"
#include "gpu-compute/vector_register_file.hh"
#include "gpu-compute/wavefront.hh"
GlobalMemPipeline::GlobalMemPipeline(const ComputeUnitParams *p) :
    computeUnit(nullptr), gmQueueSize(p->global_mem_queue_size),
    inflightStores(0), inflightLoads(0)
{
}
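// Note: gmQueueSize (the global_mem_queue_size Python parameter) is used
// below as an independent cap on inflightLoads and inflightStores, so up to
// 2 * gmQueueSize requests can be outstanding at once.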
void
GlobalMemPipeline::init(ComputeUnit *cu)
{
    computeUnit = cu;
    globalMemSize = computeUnit->shader->globalMemSize;
    _name = computeUnit->name() + ".GlobalMemPipeline";
}
void
GlobalMemPipeline::exec()
{
    // apply any returned global memory operations
    GPUDynInstPtr m = !gmReturnedLoads.empty() ? gmReturnedLoads.front() :
        !gmReturnedStores.empty() ? gmReturnedStores.front() : nullptr;
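    // Returned loads (and atomic returns) are drained ahead of returned
    // stores: the store queue is only consulted when no load is waiting.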
    bool accessVrf = true;
    // check the VRF to see if the operands of a load (or load component
    // of an atomic) are accessible
    if ((m) && (m->m_op == Enums::MO_LD || MO_A(m->m_op))) {
        Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];

        accessVrf =
            w->computeUnit->vrf[m->simdId]->
                vrfOperandAccessReady(m->seqNum(), w, m,
                                      VrfAccessType::WRITE);
    }
    if ((!gmReturnedStores.empty() || !gmReturnedLoads.empty()) &&
        m->latency.rdy() && computeUnit->glbMemToVrfBus.rdy() &&
        accessVrf && m->statusBitVector == VectorMask(0) &&
        (computeUnit->shader->coissue_return ||
         computeUnit->wfWait.at(m->pipeId).rdy())) {
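        // Dispatch on the destination VGPR width (v_type) and the memory
        // element type (m_type): doGmReturn<c0, c1> reads c1-typed elements
        // out of the request's data buffer and writes them back as c0-typed
        // register values, which is effectively where sign/zero extension
        // and int/float conversion happen.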
        if (m->v_type == VT_32 && m->m_type == Enums::M_U8)
            doGmReturn<uint32_t, uint8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U16)
            doGmReturn<uint32_t, uint16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_U32)
            doGmReturn<uint32_t, uint32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S8)
            doGmReturn<int32_t, int8_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S16)
            doGmReturn<int32_t, int16_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_S32)
            doGmReturn<int32_t, int32_t>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F16)
            doGmReturn<float, Float16>(m);
        else if (m->v_type == VT_32 && m->m_type == Enums::M_F32)
            doGmReturn<float, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U8)
            doGmReturn<uint64_t, uint8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U16)
            doGmReturn<uint64_t, uint16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U32)
            doGmReturn<uint64_t, uint32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_U64)
            doGmReturn<uint64_t, uint64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S8)
            doGmReturn<int64_t, int8_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S16)
            doGmReturn<int64_t, int16_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S32)
            doGmReturn<int64_t, int32_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_S64)
            doGmReturn<int64_t, int64_t>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F16)
            doGmReturn<double, Float16>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F32)
            doGmReturn<double, float>(m);
        else if (m->v_type == VT_64 && m->m_type == Enums::M_F64)
            doGmReturn<double, double>(m);
    }
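    // Any v_type/m_type combination not listed above falls through without
    // the return being processed; there is no else branch or fatal() here.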
    // If pipeline has executed a global memory instruction
    // execute global memory packets and issue global
    // memory packets to DTLB
    if (!gmIssuedRequests.empty()) {
        GPUDynInstPtr mp = gmIssuedRequests.front();
        if (mp->m_op == Enums::MO_LD ||
            (mp->m_op >= Enums::MO_AAND && mp->m_op <= Enums::MO_AMIN) ||
            (mp->m_op >= Enums::MO_ANRAND && mp->m_op <= Enums::MO_ANRMIN)) {
            // load or atomic: stall if the load queue is full,
            // otherwise claim an in-flight slot
            if (inflightLoads >= gmQueueSize) {
                return;
            } else {
                ++inflightLoads;
            }
        } else {
            // store: same check against the store queue
            if (inflightStores >= gmQueueSize) {
                return;
            } else {
                ++inflightStores;
            }
        }

        mp->initiateAcc(mp);
        gmIssuedRequests.pop();

        DPRINTF(GPUMem, "CU%d: WF[%d][%d] Popping 0 mem_op = %s\n",
                computeUnit->cu_id, mp->simdId, mp->wfSlotId,
                Enums::MemOpTypeStrings[mp->m_op]);
    }
}
template<typename c0, typename c1>
void
GlobalMemPipeline::doGmReturn(GPUDynInstPtr m)
{
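    // Completion side: pop the returned request off its queue, free its
    // in-flight slot, write any load/atomic return data into the VRF, and
    // release the per-wavefront outstanding-request counters.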
    Wavefront *w = computeUnit->wfList[m->simdId][m->wfSlotId];

    // Return data to registers
    if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
        gmReturnedLoads.pop();
        assert(inflightLoads > 0);
        --inflightLoads;
        if (m->m_op == Enums::MO_LD || MO_A(m->m_op)) {
            std::vector<uint32_t> regVec;
            // iterate over number of destination register operands since
            // this is a load or atomic operation
            for (int k = 0; k < m->n_reg; ++k) {
                assert((sizeof(c1) * m->n_reg) <= MAX_WIDTH_FOR_MEM_INST);
                int dst = m->dst_reg + k;

                if (m->n_reg > MAX_REGS_FOR_NON_VEC_MEM_INST)
                    dst = m->dst_reg_vec[k];
                // virtual->physical VGPR mapping
                int physVgpr = w->remap(dst, sizeof(c0), 1);
                // save the physical VGPR index
                regVec.push_back(physVgpr);
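                // d_data holds the return data as n_reg consecutive arrays
                // of VSZ (wavefront-width) c1 elements; p1 points at lane 0
                // of the k-th register slice and is advanced per lane below.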
                c1 *p1 = &((c1 *)m->d_data)[k * VSZ];

                for (int i = 0; i < VSZ; ++i) {
                    if (m->exec_mask[i]) {
                        DPRINTF(GPUReg, "CU%d, WF[%d][%d], lane %d: "
                                "$%s%d <- %d global ld done (src = wavefront "
                                "ld inst)\n", w->computeUnit->cu_id, w->simdId,
                                w->wfSlotId, i, sizeof(c0) == 4 ? "s" : "d",
                                dst, *p1);
                        // write the value into the physical VGPR. This is a
                        // purely functional operation. No timing is modeled.
                        w->computeUnit->vrf[w->simdId]->write<c0>(physVgpr,
                                                                  *p1, i);
                    }
                    ++p1;
                }
            }
            // Schedule the write operation of the load data on the VRF.
            // This simply models the timing aspect of the VRF write operation.
            // It does not modify the physical VGPR.
            loadVrfBankConflictCycles +=
                w->computeUnit->vrf[w->simdId]->exec(m->seqNum(),
                                                     w, regVec, sizeof(c0),
                                                     m->time);
        }
    } else {
        gmReturnedStores.pop();
        assert(inflightStores > 0);
        --inflightStores;
    }
    // Decrement outstanding register count; the updates are scheduled at
    // m->time rather than applied immediately
    computeUnit->shader->ScheduleAdd(&w->outstanding_reqs, m->time, -1);

    if (m->m_op == Enums::MO_ST || MO_A(m->m_op) || MO_ANR(m->m_op) ||
        MO_H(m->m_op)) {
        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_wr_gm, m->time,
                                         -1);
    }

    if (m->m_op == Enums::MO_LD || MO_A(m->m_op) || MO_ANR(m->m_op)) {
        computeUnit->shader->ScheduleAdd(&w->outstanding_reqs_rd_gm, m->time,
                                         -1);
    }

    // Mark write bus busy for appropriate amount of time
    computeUnit->glbMemToVrfBus.set(m->time);
    if (!computeUnit->shader->coissue_return)
        w->computeUnit->wfWait.at(m->pipeId).set(m->time);
}
void
GlobalMemPipeline::regStats()
{
    loadVrfBankConflictCycles
        .name(name() + ".load_vrf_bank_conflict_cycles")
        .desc("total number of cycles GM data are delayed before updating "
              "the VRF")
        ;
}