gpu/
global.rs

1use core::ptr::slice_from_raw_parts_mut;
2
3#[cfg(not(feature = "codegen_tests"))]
4use cuda_bindings::TensorViewMut;
5
6use crate::chunk::ScopeUniqueMap;
7use crate::chunk_scope::{ChunkScope, Grid};
8use crate::{GlobalGroupChunk, VecFlatten};
9
/// Used to distinguish different memory spaces in GPU programming.
/// GpuGlobal represents global memory space.
/// See shared::GpuShared for shared memory space.
/// When chunking or atomic operations are needed, GpuGlobal is owned by
/// chunk or atomic struct.
/// This ensures that the user cannot access the data without using chunk or
/// atomic operations.
// Diagnostic item gives the compiler/codegen tooling a stable name for this
// type, independent of module path or re-exports.
#[rustc_diagnostic_item = "gpu::global::GpuGlobal"]
pub struct GpuGlobal<'a, T: ?Sized> {
    // pub(crate) on purpose: only chunk/atomic constructors inside this crate
    // may reach the raw data; Deref/DerefMut are explicitly never implemented.
    pub(crate) data: &'a mut T, // Accessed only by chunk or atomic constructor.
}
21
22impl<'a, T: ?Sized> GpuGlobal<'a, T> {
23    // This is a host-side function.
24    #[cfg(not(feature = "codegen_tests"))]
25    pub fn new<'b: 'a>(slice: TensorViewMut<'a, T>) -> Self {
26        unsafe { GpuGlobal { data: &mut *(slice.as_flat_devptr() as *mut T) } }
27    }
28}
29
impl<'a, T> GpuGlobal<'a, [T]> {
    /// Convert GpuGlobal to GlobalGroupChunk in one step.
    /// See `ChunkScope` for more details about chunk scope.
    ///
    /// Consumes `self`, so after chunking the raw global slice is reachable
    /// only through the returned chunk.
    ///
    /// * `_scope` - the target scope; used only at the type level to select `CS`.
    /// * `m` - mapping that assigns each instance of the scope its unique piece.
    ///
    /// The `FromScope = Grid` bound restricts chunking to scopes derived from
    /// the grid, i.e. the scope global memory is shared across.
    // NOTE(review): the `sync_data(0, 1, 2)` / `ret_sync_data(1000)` arguments
    // look like codegen bookkeeping ids — confirm their meaning in gpu_codegen.
    #[gpu_codegen::device]
    #[gpu_codegen::sync_data(0, 1, 2)]
    #[gpu_codegen::ret_sync_data(1000)]
    #[inline(always)]
    pub fn chunk_to_scope<CS, Map: ScopeUniqueMap<CS>>(
        self,
        _scope: CS,
        m: Map,
    ) -> GlobalGroupChunk<'a, T, CS, Map>
    where
        CS: ChunkScope<FromScope = Grid>,
    {
        GlobalGroupChunk::new(self, m)
    }
}
48
49impl<'a, T> GpuGlobal<'a, [T]> {
50    /// Useful to optimize code with vector load/store.
51    /// If length of the slice is not a multiple of N,
52    /// the remaining elements will be ignored.
53    /// For now, we only use flatten for global memory.
54    /// For shared memory, user can use GpuShared<[[T; N]]> directly.
55    #[gpu_codegen::device]
56    #[inline(always)]
57    pub fn flatten<T2>(self) -> GpuGlobal<'a, [T2]>
58    where
59        &'a [T]: VecFlatten<T2>,
60    {
61        // SAFETY: the returned slice will be at same size or shorter, so it is safe.
62        assert!(size_of::<T>() >= size_of::<T2>(), "T2 is larger than T");
63        assert!(align_of::<T>() >= align_of::<T2>(), "T2 has stricter alignment than T");
64        unsafe {
65            GpuGlobal {
66                data: &mut *slice_from_raw_parts_mut(
67                    self.data.as_mut_ptr() as _,
68                    self.len() * size_of::<T>() / size_of::<T2>(),
69                ),
70            }
71        }
72    }
73
74    #[inline(always)]
75    #[gpu_codegen::device]
76    pub fn is_empty(&self) -> bool {
77        self.data.is_empty()
78    }
79
80    #[inline(always)]
81    #[gpu_codegen::device]
82    pub fn len(&self) -> usize {
83        self.data.len()
84    }
85}
86
/// Never implement Deref to prevent direct read access to mutable data.
/// When the global mem is immutable, use &T directly instead of &mut T which will be converted to GpuGlobal.
///
/// Can I read global data before write to unique chunk?
/// Yes, but it is not common and requires us to synchronize the read for all running threads from future write access.
/// Otherwise, the read may get old or new data indeterministically.
/// This is not a common pattern in GPU programming.
/// So we disallow it for simplicity.
///
/// Can I read the cross-thread global data after write to unique chunk?
/// Yes, but it requires us to synchronize the read for all running threads after write access.
/// Otherwise, the read may get old or new data indeterministically.
/// This is not a common pattern in GPU programming.
/// So we disallow it for simplicity.
// NOTE(review): hidden from rustdoc — presumably negative impls render poorly
// or break doc builds; confirm why cfg(not(doc)) is required here.
#[cfg(not(doc))]
impl<'a, T: ?Sized> !core::ops::Deref for GpuGlobal<'a, T> {}
103
104/// Never implement DerefMut to prevent direct mutable access to the data.
105/// This ensures that the user cannot access the data without using chunk or
106/// atomic operations.
107impl<'a, T: Sized> !core::ops::DerefMut for GpuGlobal<'a, T> {}