gpu/
shared.rs

1use core::marker::PhantomData;
// This file contains shared memory related APIs.
//
4use core::ops::{Deref, DerefMut};
5
6#[cfg(not(feature = "codegen_tests"))]
7use cuda_bindings::SafeGpuConfig;
8use num_traits::AsPrimitive;
9
10use crate::assert_ptr;
11use crate::cg::Block;
12use crate::chunk::{ScopeUniqueMap, ScopeUniqueMapProvidedMethods};
13use crate::chunk_scope::{Block2ThreadScope, ChainedMap, ChainedScope, ChunkScope, Thread};
14
/// Static GPU shared memory.
/// NVCC always aligns shared memory to 16 bytes,
/// so we also align to 16 bytes here.
#[rustc_diagnostic_item = "gpu::GpuShared"]
#[repr(C, align(16))]
pub struct GpuShared<T: ?Sized> {
    // The wrapped value. Kept private so every access goes through the
    // `Deref`/`Index`/chunk APIs below, which carry the `gpu_codegen`
    // memory-space annotations.
    value: T,
}
23
impl<T> GpuShared<T> {
    /// Declares a zero-initialized static shared-memory slot.
    ///
    /// The body is intentionally a stub: the actual allocation is
    /// materialized by the GPU code generator (see the
    /// `rustc_diagnostic_item` and `gpu_codegen` attributes). Calling the
    /// stub directly panics via `unimplemented!`.
    #[rustc_diagnostic_item = "gpu::new_shared_mem"]
    #[gpu_codegen::device]
    #[gpu_codegen::sync_data]
    #[inline(never)]
    pub const fn zero() -> Self {
        unimplemented!();
    }
}
33
impl<T> Deref for GpuShared<T> {
    type Target = T;

    /// Read-only access to the value held in shared memory.
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[inline(always)]
    fn deref(&self) -> &T {
        &self.value
    }
}
44
45impl<T> !DerefMut for GpuShared<T> {}
46
/// Dynamic GPU shared memory allocation.
#[allow(dead_code)]
pub struct DynamicSharedAlloc {
    // Remaining capacity, in bytes, of the dynamic shared memory region.
    // `alloc` bump-allocates by shrinking this value toward zero.
    size: usize,
}
52
53impl DynamicSharedAlloc {
54    #[rustc_diagnostic_item = "gpu::base_dynamic_shared"]
55    #[inline(never)]
56    #[gpu_codegen::memspace_shared(1000)]
57    unsafe fn base_ptr() -> *const u8 {
58        unimplemented!()
59    }
60
61    #[gpu_codegen::device]
62    #[gpu_codegen::memspace_shared(1000)]
63    #[gpu_codegen::sync_data(1)] // len is non-divergent
64    #[gpu_codegen::ret_sync_data(1000)] // return pointer is divergent
65    pub fn alloc<T: Sized>(&mut self, len: usize) -> &'static mut GpuShared<[T]> {
66        let size = core::mem::size_of::<T>() * len;
67        assert!(size <= self.size);
68        self.size -= size;
69        #[allow(clippy::transmute_ptr_to_ref)]
70        unsafe {
71            let raw = core::intrinsics::offset(Self::base_ptr(), self.size);
72            let slice = core::ptr::slice_from_raw_parts_mut(raw as *mut T, len);
73            core::mem::transmute(slice)
74        }
75    }
76}
77
/// This trait is implemented for kernel config struct to provide dynamic shared memory allocation.
pub trait DynamicSharedAllocBuilder {
    /// Returns a fresh allocator over the kernel's dynamic shared memory region.
    fn smem_alloc(&self) -> DynamicSharedAlloc;
}
82
83#[cfg(not(feature = "codegen_tests"))]
84impl<Config: SafeGpuConfig> DynamicSharedAllocBuilder for Config {
85    // This is host-side function.
86    fn smem_alloc(&self) -> DynamicSharedAlloc {
87        DynamicSharedAlloc { size: self.shared_size() as usize }
88    }
89}
90
91#[cfg(not(feature = "codegen_tests"))]
92unsafe impl cuda_bindings::AsHostKernelParams for DynamicSharedAlloc {
93    fn as_kernel_param_data(&self, args: &mut alloc::vec::Vec<*mut ::core::ffi::c_void>) {
94        args.push(self as *const _ as _);
95    }
96}
97
impl<T> core::ops::Index<usize> for GpuShared<[T]> {
    type Output = GpuShared<T>;

    /// Indexes one element of a shared slice, re-wrapping the reference as
    /// `&GpuShared<T>` so element access keeps the shared-memory marker type.
    #[inline(always)]
    #[gpu_codegen::device]
    fn index(&self, idx: usize) -> &GpuShared<T> {
        // NOTE(review): `GpuShared` is `repr(align(16))` but `&self.value[idx]`
        // is only guaranteed `align_of::<T>()`-aligned; presumably the GPU
        // codegen treats the wrapper purely as a marker here — confirm.
        unsafe { core::mem::transmute(&self.value[idx]) }
    }
}
107
/// N:core::ops::Index dimension, 1, 2, 3
/// Map: Mapping strategy
#[allow(private_bounds)]
pub struct SMemThreadChunk<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>> {
    // Exclusive view of the underlying shared buffer. Kept private so all
    // element access funnels through the Index/IndexMut impls below, which
    // validate the mapping preconditions via `assert_ptr`.
    data: &'a mut GpuShared<T>, // Must be private.
    // Parameters of the scope-unique local-to-global index mapping.
    pub map_params: Map,
    // Carries the chunk scope `CS` at the type level only.
    dummy: core::marker::PhantomData<CS>,
}
116
impl<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>>
    SMemThreadChunk<'a, T, CS, Map>
{
    /// Narrows this chunk to a finer scope `CS2`, chaining the index maps.
    ///
    /// The current scope's target must equal the new scope's source
    /// (`CS::ToScope == CS2::FromScope`), so the composed map remains a
    /// valid scope-unique mapping. Aborts if `map_params`'s precondition
    /// does not hold.
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[gpu_codegen::sync_data(1, 2)] // self is guaranteed to be non-divergent and so no check is required
    pub fn chunk_to_scope<CS2: ChunkScope, Map2: ScopeUniqueMap<CS2>>(
        self,
        _scope: CS2,
        map_params: Map2,
    ) -> SMemThreadChunk<'a, T, ChainedScope<CS, CS2>, ChainedMap<CS, CS2, Map, Map2>>
    where
        Map: ScopeUniqueMap<CS>,
        CS: ChunkScope<ToScope = CS2::FromScope>,
        Map2::GlobalIndexType: AsPrimitive<Map::IndexType>,
    {
        if !map_params.precondition() {
            core::intrinsics::abort();
        }
        SMemThreadChunk {
            data: self.data,
            map_params: ChainedMap::new(self.map_params, map_params),
            dummy: PhantomData,
        }
    }

    /// Translates a scope-local index to its global index. Discards the
    /// mapping's validity flag — see the `Index` impl for the checked path.
    #[gpu_codegen::device]
    #[inline]
    pub fn local2global(
        &self,
        idx: <Map as ScopeUniqueMap<CS>>::IndexType,
    ) -> Map::GlobalIndexType {
        self.map_params.local_to_global_index(idx).1
    }
}
153
154trait PrivateTraitGuard {}
155
/// Uniform slice view over shared-memory backing storage (`[T]` or `[T; N]`).
/// Sealed via `PrivateTraitGuard`, so only the impls in this file exist.
#[expect(private_bounds)]
pub trait AsSharedSlice: PrivateTraitGuard {
    type Elem;
    /// Mutable element-slice view of the backing storage.
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_mut_slice(&mut self) -> &mut [Self::Elem];

    /// Shared element-slice view of the backing storage.
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_slice(&self) -> &[Self::Elem];
}
167
impl<T> PrivateTraitGuard for [T] {}
// Unsized slices: both views are identity conversions.
impl<T> AsSharedSlice for [T] {
    type Elem = T;
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_mut_slice(&mut self) -> &mut [Self::Elem] {
        self
    }

    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_slice(&self) -> &[Self::Elem] {
        self
    }
}
185
impl<T, const N: usize> PrivateTraitGuard for [T; N] {}
// Fixed-size arrays: both views are plain array-to-slice coercions.
impl<T, const N: usize> AsSharedSlice for [T; N] {
    type Elem = T;
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_mut_slice(&mut self) -> &mut [Self::Elem] {
        self
    }

    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    fn as_slice(&self) -> &[Self::Elem] {
        self
    }
}
203
impl<T: ?Sized + AsSharedSlice> GpuShared<T> {
    /// Splits this shared buffer into per-thread chunks using `map_params`
    /// (block-to-thread scope). Aborts if the mapping precondition fails.
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[gpu_codegen::sync_data(0, 1)]
    #[gpu_codegen::ret_sync_data(0, 1000)]
    #[rustc_diagnostic_item = "gpu::shared_chunk_mut"]
    pub fn chunk_mut<'a, Map: ScopeUniqueMap<Block2ThreadScope>>(
        &'a mut self,
        map_params: Map,
    ) -> SMemThreadChunk<'a, T, Block2ThreadScope, Map> {
        if !map_params.precondition() {
            core::intrinsics::abort();
        }
        SMemThreadChunk { data: self, map_params, dummy: PhantomData }
    }

    /// Like [`Self::chunk_mut`] but chunks to an arbitrary scope `CS` whose
    /// source scope is the block. Aborts if the mapping precondition fails.
    #[inline]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(0, 1000)]
    #[gpu_codegen::sync_data(0, 1, 2)]
    #[gpu_codegen::ret_sync_data(0, 1000)]
    pub fn chunk_to_scope<'a, CS, Map: ScopeUniqueMap<CS>>(
        &'a mut self,
        _scope: CS,
        map_params: Map,
    ) -> SMemThreadChunk<'a, T, CS, Map>
    where
        CS: ChunkScope<FromScope = Block>,
    {
        if !map_params.precondition() {
            core::intrinsics::abort();
        }
        SMemThreadChunk { data: self, map_params, dummy: PhantomData }
    }
}
240
impl<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>>
    core::ops::Index<Map::IndexType> for SMemThreadChunk<'a, T, CS, Map>
{
    type Output = T::Elem;

    /// Checked read access: translates the scope-local index to a global
    /// element index, then guards the resulting reference with `assert_ptr`
    /// on both the map precondition and the per-index precondition.
    #[inline(always)]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(1000)]
    fn index(&self, idx: Map::IndexType) -> &Self::Output {
        let (idx_precondition, idx) = self.map_params.local_to_global_index(idx);
        let idx = idx.as_();
        let valid = self.map_params.precondition() & idx_precondition;
        assert_ptr(valid, &self.data.value.as_slice()[idx])
    }
}
256
impl<'a, T: ?Sized + AsSharedSlice, CS: ChunkScope, Map: ScopeUniqueMap<CS>>
    core::ops::IndexMut<Map::IndexType> for SMemThreadChunk<'a, T, CS, Map>
where
    // Mutable access is only offered when the chunk scope bottoms out at a
    // single thread, i.e. each element is owned by exactly one thread.
    CS: ChunkScope<ToScope = Thread>,
{
    /// Checked write access: same index translation and `assert_ptr`
    /// validation as `index`, but yields a mutable reference.
    #[inline(always)]
    #[gpu_codegen::device]
    #[gpu_codegen::memspace_shared(1000)]
    fn index_mut(&mut self, idx: Map::IndexType) -> &mut Self::Output {
        let (idx_precondition, idx) = self.map_params.local_to_global_index(idx);
        let idx = idx.as_();
        let valid = self.map_params.precondition() & idx_precondition;
        assert_ptr(valid, &mut self.data.value.as_mut_slice()[idx])
    }
}