//! gpu/global.rs — wrapper type for the GPU global (device) memory space.
1use core::ptr::slice_from_raw_parts_mut;
2
3#[cfg(not(feature = "codegen_tests"))]
4use cuda_bindings::TensorViewMut;
5
6use crate::chunk::ScopeUniqueMap;
7use crate::chunk_scope::{ChunkScope, Grid};
8use crate::{GlobalGroupChunk, VecFlatten};
9
/// Used to distinguish different memory spaces in GPU programming.
/// `GpuGlobal` represents the global (device) memory space.
/// See `shared::GpuShared` for the shared memory space.
///
/// When chunking or atomic operations are needed, the `GpuGlobal` is owned by
/// the chunk or atomic struct. Combined with the negative `Deref`/`DerefMut`
/// impls at the bottom of this file, this ensures that the user cannot access
/// the data without going through chunk or atomic operations.
#[rustc_diagnostic_item = "gpu::global::GpuGlobal"]
pub struct GpuGlobal<'a, T: ?Sized> {
    // Deliberately private: accessed only by the chunk or atomic constructors,
    // never exposed to user code directly.
    pub(crate) data: &'a mut T, // Accessed only by chunk or atomic constructor.
}
21
impl<'a, T: ?Sized> GpuGlobal<'a, T> {
    /// Wrap a device tensor view as a `GpuGlobal`.
    ///
    /// This is a host-side function (compiled out under `codegen_tests`).
    ///
    /// # Safety notes
    /// The `unsafe` block reinterprets the flat device pointer from
    /// `as_flat_devptr` as a `&mut T`. Soundness relies on `TensorViewMut`
    /// holding exclusive access to device memory valid for `'a` — presumably
    /// guaranteed by `cuda_bindings`; confirm against that crate's contract.
    // NOTE(review): the `'b: 'a` lifetime parameter is never used in the
    // signature or body — it looks removable, but dropping it would break any
    // caller that names it via turbofish, so it is left as-is.
    #[cfg(not(feature = "codegen_tests"))]
    pub fn new<'b: 'a>(slice: TensorViewMut<'a, T>) -> Self {
        unsafe { GpuGlobal { data: &mut *(slice.as_flat_devptr() as *mut T) } }
    }
}
29
impl<'a, T> GpuGlobal<'a, [T]> {
    /// Convert `GpuGlobal` to `GlobalGroupChunk` in one step.
    /// See `ChunkScope` for more details about chunk scope.
    ///
    /// Consumes `self`, so the raw global view can no longer be touched once
    /// it has been handed to the chunking machinery; `_scope` is a marker
    /// value used only for type inference of `CS`.
    // NOTE(review): the argument indices in `sync_data(0, 1, 2)` and the
    // `1000` in `ret_sync_data` are codegen-protocol constants — their
    // meaning should be confirmed against the `gpu_codegen` macro crate.
    #[gpu_codegen::device]
    #[gpu_codegen::sync_data(0, 1, 2)]
    #[gpu_codegen::ret_sync_data(1000)]
    #[inline(always)]
    pub fn chunk_to_scope<CS, Map: ScopeUniqueMap<CS>>(
        self,
        _scope: CS,
        m: Map,
    ) -> GlobalGroupChunk<'a, T, CS, Map>
    where
        // The chunk scope must be derived from the whole grid.
        CS: ChunkScope<FromScope = Grid>,
    {
        GlobalGroupChunk::new(self, m)
    }
}
48
49impl<'a, T> GpuGlobal<'a, [T]> {
50 /// Useful to optimize code with vector load/store.
51 /// If length of the slice is not a multiple of N,
52 /// the remaining elements will be ignored.
53 /// For now, we only use flatten for global memory.
54 /// For shared memory, user can use GpuShared<[[T; N]]> directly.
55 #[gpu_codegen::device]
56 #[inline(always)]
57 pub fn flatten<T2>(self) -> GpuGlobal<'a, [T2]>
58 where
59 &'a [T]: VecFlatten<T2>,
60 {
61 // SAFETY: the returned slice will be at same size or shorter, so it is safe.
62 assert!(size_of::<T>() >= size_of::<T2>(), "T2 is larger than T");
63 assert!(align_of::<T>() >= align_of::<T2>(), "T2 has stricter alignment than T");
64 unsafe {
65 GpuGlobal {
66 data: &mut *slice_from_raw_parts_mut(
67 self.data.as_mut_ptr() as _,
68 self.len() * size_of::<T>() / size_of::<T2>(),
69 ),
70 }
71 }
72 }
73
74 #[inline(always)]
75 #[gpu_codegen::device]
76 pub fn is_empty(&self) -> bool {
77 self.data.is_empty()
78 }
79
80 #[inline(always)]
81 #[gpu_codegen::device]
82 pub fn len(&self) -> usize {
83 self.data.len()
84 }
85}
86
/// Never implement Deref to prevent direct read access to mutable data.
/// When the global mem is immutable, use &T directly instead of &mut T which will be converted to GpuGlobal.
///
/// Can I read global data before writing to a unique chunk?
/// Yes, but it is not common and requires us to synchronize the read for all running threads from future write access;
/// otherwise, the read may get old or new data indeterministically.
/// This is not a common pattern in GPU programming.
/// So we disallow it for simplicity.
///
/// Can I read the cross-thread global data after writing to a unique chunk?
/// Yes, but it requires us to synchronize the read for all running threads after write access;
/// otherwise, the read may get old or new data indeterministically.
/// This is not a common pattern in GPU programming.
/// So we disallow it for simplicity.
// Hidden from rustdoc: presumably because negative impls render poorly /
// are rejected by rustdoc — confirm before removing the cfg.
#[cfg(not(doc))]
impl<'a, T: ?Sized> !core::ops::Deref for GpuGlobal<'a, T> {}
103
104/// Never implement DerefMut to prevent direct mutable access to the data.
105/// This ensures that the user cannot access the data without using chunk or
106/// atomic operations.
107impl<'a, T: Sized> !core::ops::DerefMut for GpuGlobal<'a, T> {}