1use core::marker::PhantomData;
2use core::ops::{Deref, DerefMut};
5
6#[cfg(not(feature = "codegen_tests"))]
7use cuda_bindings::SafeGpuConfig;
8use num_traits::AsPrimitive;
9
10use crate::assert_ptr;
11use crate::cg::Block;
12use crate::chunk::{ScopeUniqueMap, ScopeUniqueMapProvidedMethods};
13use crate::chunk_scope::{Block2ThreadScope, ChainedMap, ChainedScope, ChunkScope, Thread};
14
/// Wrapper type marking a value as living in GPU shared memory.
///
/// The `rustc_diagnostic_item` lets the compiler/codegen pass recognize this
/// type specially. `repr(C, align(16))` fixes the layout (single field at
/// offset 0) and a 16-byte alignment for the backing storage.
#[rustc_diagnostic_item = "gpu::GpuShared"]
#[repr(C, align(16))]
pub struct GpuShared<T: ?Sized> {
    // Private: reads go through the `Deref`/`Index` impls; direct mutable
    // access is deliberately unavailable (see the `!DerefMut` impl).
    value: T,
}
23
impl<T> GpuShared<T> {
    /// Declares a shared-memory allocation for a `T`.
    ///
    /// Codegen stub: the host body is `unimplemented!()`, so this must never
    /// actually execute — calls are presumably rewritten by the device code
    /// generator, which finds them via the `gpu::new_shared_mem` diagnostic
    /// item. `#[inline(never)]` keeps the call site visible for that rewrite.
    /// NOTE(review): the bare `sync_data` attribute's meaning is defined by
    /// the gpu_codegen crate — confirm against its documentation.
    #[rustc_diagnostic_item = "gpu::new_shared_mem"]
    #[gpu_codegen::device]
    #[gpu_codegen::sync_data]
    #[inline(never)]
    pub const fn zero() -> Self {
        unimplemented!();
    }
}
33
impl<T> Deref for GpuShared<T> {
    type Target = T;

    /// Read-only access to the wrapped value.
    ///
    /// NOTE(review): the `memspace_shared(0, 1000)` arguments are consumed by
    /// the gpu_codegen pass; their exact meaning (an address-space range?) is
    /// not visible in this file — confirm against the attribute's definition.
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[inline(always)]
    fn deref(&self) -> &T {
        &self.value
    }
}
44
45impl<T> !DerefMut for GpuShared<T> {}
46
/// Bump-allocator state over the dynamically-sized shared-memory region.
#[allow(dead_code)]
pub struct DynamicSharedAlloc {
    // Remaining unallocated bytes; decremented by `alloc`.
    size: usize,
}
52
impl DynamicSharedAlloc {
    /// Codegen stub for the base address of the dynamic shared-memory
    /// region. The host body is `unimplemented!()`; calls are presumably
    /// rewritten by the device code generator via the
    /// `gpu::base_dynamic_shared` diagnostic item.
    #[rustc_diagnostic_item = "gpu::base_dynamic_shared"]
    #[inline(never)]
    #[gpu_codegen::memspace_shared(1000)]
    unsafe fn base_ptr() -> *const u8 {
        unimplemented!()
    }

    /// Carves `len` elements of `T` off the *end* of the remaining dynamic
    /// shared-memory region and returns them wrapped as `GpuShared<[T]>`.
    ///
    /// Asserts that the request fits in the remaining `self.size` bytes.
    ///
    /// NOTE(review): `size_of::<T>() * len` can overflow before the assert,
    /// and the returned pointer is only as aligned as `base + self.size`
    /// happens to be (no alignment adjustment for `T`, and `GpuShared` is
    /// align(16)). Also, the `'static` lifetime allows overlapping live
    /// `&mut` borrows across calls. Confirm the gpu_codegen pass owns these
    /// invariants before relying on this from new call sites.
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(1000)]
    #[gpu_codegen::sync_data(1)] #[gpu_codegen::ret_sync_data(1000)] pub fn alloc<T: Sized>(&mut self, len: usize) -> &'static mut GpuShared<[T]> {
        let size = core::mem::size_of::<T>() * len;
        assert!(size <= self.size);
        // Allocate top-down: shrink the remaining size first; the new block
        // then starts at `base + self.size`.
        self.size -= size;
        #[allow(clippy::transmute_ptr_to_ref)]
        // SAFETY(review): relies on `base_ptr()` addressing at least the
        // original `self.size` bytes, and on `GpuShared<[T]>` being
        // layout-compatible with `[T]` (repr(C), single field).
        unsafe {
            let raw = core::intrinsics::offset(Self::base_ptr(), self.size);
            let slice = core::ptr::slice_from_raw_parts_mut(raw as *mut T, len);
            core::mem::transmute(slice)
        }
    }
}
77
/// Factory for a `DynamicSharedAlloc` sized from a launch configuration.
pub trait DynamicSharedAllocBuilder {
    fn smem_alloc(&self) -> DynamicSharedAlloc;
}
82
83#[cfg(not(feature = "codegen_tests"))]
84impl<Config: SafeGpuConfig> DynamicSharedAllocBuilder for Config {
85 fn smem_alloc(&self) -> DynamicSharedAlloc {
87 DynamicSharedAlloc { size: self.shared_size() as usize }
88 }
89}
90
91#[cfg(not(feature = "codegen_tests"))]
92unsafe impl cuda_bindings::AsHostKernelParams for DynamicSharedAlloc {
93 fn as_kernel_param_data(&self, args: &mut alloc::vec::Vec<*mut ::core::ffi::c_void>) {
94 args.push(self as *const _ as _);
95 }
96}
97
impl<T> core::ops::Index<usize> for GpuShared<[T]> {
    type Output = GpuShared<T>;

    /// Indexes into the shared slice, re-wrapping the element as
    /// `GpuShared<T>` so it keeps the read-only shared-memory API.
    #[inline(always)]
    #[gpu_codegen::device]
    fn index(&self, idx: usize) -> &GpuShared<T> {
        // SAFETY(review): `GpuShared<T>` is repr(C) with a single `T` field,
        // so `&T -> &GpuShared<T>` is layout-compatible. However `GpuShared`
        // is align(16) while a slice element need not be — confirm the
        // codegen pass tolerates the under-aligned reference.
        unsafe { core::mem::transmute(&self.value[idx]) }
    }
}
107
/// A view of a `GpuShared` buffer chunked over scope `CS`, with `Map`
/// translating scope-local indices into positions in the buffer.
#[allow(private_bounds)]
pub struct SMemThreadChunk<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>> {
    // Borrowed backing storage, plus the (public) index-mapping parameters.
    data: &'a mut GpuShared<T>, pub map_params: Map,
    // `CS` only parameterizes the type; it carries no runtime data.
    dummy: core::marker::PhantomData<CS>,
}
116
impl<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>>
    SMemThreadChunk<'a, T, CS, Map>
{
    /// Re-chunks this view to a further scope `CS2`, composing the existing
    /// map with `map_params` via `ChainedMap`. Aborts if the new map's
    /// precondition does not hold.
    ///
    /// The `CS: ChunkScope<ToScope = CS2::FromScope>` bound ensures the two
    /// scopes compose end-to-end.
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[gpu_codegen::sync_data(1, 2)] pub fn chunk_to_scope<CS2: ChunkScope, Map2: ScopeUniqueMap<CS2>>(
        self,
        _scope: CS2,
        map_params: Map2,
    ) -> SMemThreadChunk<'a, T, ChainedScope<CS, CS2>, ChainedMap<CS, CS2, Map, Map2>>
    where
        Map: ScopeUniqueMap<CS>,
        CS: ChunkScope<ToScope = CS2::FromScope>,
        Map2::GlobalIndexType: AsPrimitive<Map::IndexType>,
    {
        if !map_params.precondition() {
            // Invalid map parameters: trap instead of producing bad indices.
            core::intrinsics::abort();
        }
        SMemThreadChunk {
            data: self.data,
            map_params: ChainedMap::new(self.map_params, map_params),
            dummy: PhantomData,
        }
    }

    /// Translates a scope-local index to the global buffer index,
    /// discarding the per-index validity flag the map also returns.
    #[gpu_codegen::device]
    #[inline]
    pub fn local2global(
        &self,
        idx: <Map as ScopeUniqueMap<CS>>::IndexType,
    ) -> Map::GlobalIndexType {
        // `.1` is the mapped index; `.0` (the validity bit) is dropped here —
        // callers wanting checked access should use the `Index` impls.
        self.map_params.local_to_global_index(idx).1
    }
}
153
154trait PrivateTraitGuard {}
155
/// Sealed trait (via `PrivateTraitGuard`) for shared-memory payloads that
/// can be viewed as a slice of `Elem` — implemented below for `[T]` and
/// `[T; N]`.
#[expect(private_bounds)]
pub trait AsSharedSlice: PrivateTraitGuard {
    type Elem;
    /// Mutable slice view of the payload.
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_mut_slice(&mut self) -> &mut [Self::Elem];

    /// Shared slice view of the payload.
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_slice(&self) -> &[Self::Elem];
}
167
impl<T> PrivateTraitGuard for [T] {}
// Unsized slices are already slices, so both views are the identity.
impl<T> AsSharedSlice for [T] {
    type Elem = T;
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_mut_slice(&mut self) -> &mut [Self::Elem] {
        self
    }

    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_slice(&self) -> &[Self::Elem] {
        self
    }
}
185
impl<T, const N: usize> PrivateTraitGuard for [T; N] {}
// Fixed-size arrays: `self` unsize-coerces to the slice type on return.
impl<T, const N: usize> AsSharedSlice for [T; N] {
    type Elem = T;
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_mut_slice(&mut self) -> &mut [Self::Elem] {
        self
    }

    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_slice(&self) -> &[Self::Elem] {
        self
    }
}
203
impl<T: ?Sized + AsSharedSlice> GpuShared<T> {
    /// Chunks this shared buffer across the threads of a block
    /// (`Block2ThreadScope`), producing the only mutable view of shared
    /// data. Aborts if the map's precondition does not hold.
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[gpu_codegen::sync_data(0, 1)]
    #[gpu_codegen::ret_sync_data(0, 1000)]
    #[rustc_diagnostic_item = "gpu::shared_chunk_mut"]
    pub fn chunk_mut<'a, Map: ScopeUniqueMap<Block2ThreadScope>>(
        &'a mut self,
        map_params: Map,
    ) -> SMemThreadChunk<'a, T, Block2ThreadScope, Map> {
        if !map_params.precondition() {
            // Invalid map parameters: trap instead of producing bad indices.
            core::intrinsics::abort();
        }
        SMemThreadChunk { data: self, map_params, dummy: PhantomData }
    }

    /// Like `chunk_mut`, but chunks to an arbitrary scope `CS` originating
    /// at a `Block`. Note `IndexMut` on the result is only available once
    /// the chunk reaches `ToScope = Thread`.
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[gpu_codegen::sync_data(0, 1, 2)]
    #[gpu_codegen::ret_sync_data(0, 1000)]
    pub fn chunk_to_scope<'a, CS, Map: ScopeUniqueMap<CS>>(
        &'a mut self,
        _scope: CS,
        map_params: Map,
    ) -> SMemThreadChunk<'a, T, CS, Map>
    where
        CS: ChunkScope<FromScope = Block>,
    {
        if !map_params.precondition() {
            core::intrinsics::abort();
        }
        SMemThreadChunk { data: self, map_params, dummy: PhantomData }
    }
}
240
impl<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>>
    core::ops::Index<Map::IndexType> for SMemThreadChunk<'a, T, CS, Map>
{
    type Output = T::Elem;

    /// Reads the element at scope-local `idx`, translated through the map.
    #[inline(always)]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(1000)]
    fn index(&self, idx: Map::IndexType) -> &Self::Output {
        let (idx_precondition, idx) = self.map_params.local_to_global_index(idx);
        // Narrow the global index to a slice index via `AsPrimitive`.
        let idx = idx.as_();
        // Non-short-circuiting `&`: both conditions are always evaluated
        // (presumably to keep the device code branchless — confirm).
        let valid = self.map_params.precondition() & idx_precondition;
        // `assert_ptr` gates the reference on `valid`.
        assert_ptr(valid, &self.data.value.as_slice()[idx])
    }
}
256
impl<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>>
    core::ops::IndexMut<Map::IndexType> for SMemThreadChunk<'a, T, CS, Map>
where
    // Mutation is only allowed once the chunk is scoped down to a single
    // thread, so each thread writes a disjoint part of the buffer.
    CS: ChunkScope<ToScope = Thread>,
{
    /// Mutable access to the element at scope-local `idx`.
    #[inline(always)]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(1000)]
    fn index_mut(&mut self, idx: Map::IndexType) -> &mut Self::Output {
        let (idx_precondition, idx) = self.map_params.local_to_global_index(idx);
        // Narrow the global index to a slice index via `AsPrimitive`.
        let idx = idx.as_();
        // Non-short-circuiting `&`, mirroring the `Index` impl.
        let valid = self.map_params.precondition() & idx_precondition;
        assert_ptr(valid, &mut self.data.value.as_mut_slice()[idx])
    }
}