/// Generates a device function `$fname(ptr: &$ty) -> $ty` that reads the
/// referenced value with a single PTX `ld.global.<$c>.<$t>` instruction.
///
/// Arguments:
/// * `$fname` - name of the generated public function.
/// * `$ty`    - Rust type of the value being loaded (and returned).
/// * `$c`     - string literal spliced into the mnemonic after `ld.global.`
///              (the invocations in this file pass `"cs"` or `"cg"`).
/// * `$t`     - string literal spliced in as the PTX type suffix
///              (e.g. `"u32"`, `"f32"`).
///
/// The template binds operand 0 (the result) with the `reg32` modifier and
/// operand 1 (the address) with `reg64`.
/// NOTE(review): this presumably limits `$ty` to 32-bit types - confirm
/// against the `asm!` macro's register-class rules before adding wider types.
macro_rules! impl_ld {
    ($fname:ident, $ty:ty, $c:literal, $t:literal) => {
        #[gpu_macros::device]
        #[inline(always)]
        pub fn $fname(ptr: &$ty) -> $ty {
            // The reference coerces to a raw address for the asm operand.
            let addr: *const $ty = ptr;
            // Written by the asm block below before it is read.
            let mut loaded: $ty;
            // SAFETY: `addr` comes from a live `&$ty`, so it is non-null,
            // aligned and points at an initialized value.
            // NOTE(review): `ld.global` presumably requires the referent to
            // live in the global address space; the signature does not
            // enforce that - confirm at call sites.
            unsafe {
                $crate::asm!(
                    concat!("ld.global.", $c, ".", $t, " {0:reg32}, [{1:reg64}];"),
                    out(reg) loaded,
                    in(reg) addr,
                );
            }
            loaded
        }
    };
}
19
// Load intrinsics emitted as `ld.global.cs.*`.
// (Per the PTX ISA, `.cs` is the "cache streaming" operator.)
// The `i32` variant passes "u32" as the PTX type suffix, same as the `u32`
// variant - presumably because a 32-bit load moves the same bits regardless
// of signedness (NOTE(review): confirm this is intentional).
impl_ld!(__ldcs_u32, u32, "cs", "u32");
impl_ld!(__ldcs_i32, i32, "cs", "u32");
impl_ld!(__ldcs_f32, f32, "cs", "f32");

// Load intrinsics emitted as `ld.global.cg.*`.
// (Per the PTX ISA, `.cg` is the "cache at global level" operator.)
impl_ld!(__ldcg_u32, u32, "cg", "u32");
impl_ld!(__ldcg_i32, i32, "cg", "u32");
impl_ld!(__ldcg_f32, f32, "cg", "f32");
27
/// Generates a device function `$fname(ptr: &mut $ty, val: $ty)` that writes
/// `val` to the referenced location with a single PTX
/// `st.global.<$c>.<$t>` instruction.
///
/// Arguments:
/// * `$fname` - name of the generated public function.
/// * `$ty`    - Rust type of the value being stored.
/// * `$c`     - string literal spliced into the mnemonic after `st.global.`
///              (the invocations in this file pass `"cs"` or `"cg"`).
/// * `$t`     - string literal spliced in as the PTX type suffix
///              (e.g. `"u32"`, `"f32"`).
///
/// The template binds operand 0 (the address) with the `reg64` modifier and
/// operand 1 (the value) with `reg32`.
/// NOTE(review): this presumably limits `$ty` to 32-bit types - confirm
/// against the `asm!` macro's register-class rules before adding wider types.
macro_rules! impl_st {
    ($fname:ident, $ty:ty, $c:literal, $t:literal) => {
        #[gpu_macros::device]
        #[inline(always)]
        pub fn $fname(ptr: &mut $ty, val: $ty) {
            let ptr = ptr as *mut $ty;
            // SAFETY: `ptr` comes from a live `&mut $ty`, so it is non-null,
            // aligned and exclusively borrowed for the duration of the call.
            // NOTE(review): `st.global` presumably requires the referent to
            // live in the global address space; the signature does not
            // enforce that - confirm at call sites.
            unsafe {
                $crate::asm!(
                    // A space separates the type suffix from the address
                    // operand, matching the load template and the
                    // `st{...}.type [a], b;` form used in the PTX ISA.
                    concat!("st.global.", $c, ".", $t, " [{0:reg64}], {1:reg32};"),
                    in(reg) ptr,
                    in(reg) val,
                );
            }
        }
    };
}
44
// Store intrinsics emitted as `st.global.cs.*`.
// (Per the PTX ISA, `.cs` is the "cache streaming" operator.)
// The `i32` variant passes "u32" as the PTX type suffix, same as the `u32`
// variant - presumably because a 32-bit store moves the same bits regardless
// of signedness (NOTE(review): confirm this is intentional).
impl_st!(__stcs_u32, u32, "cs", "u32");
impl_st!(__stcs_i32, i32, "cs", "u32");
impl_st!(__stcs_f32, f32, "cs", "f32");

// Store intrinsics emitted as `st.global.cg.*`.
// (Per the PTX ISA, `.cg` is the "cache at global level" operator.)
impl_st!(__stcg_u32, u32, "cg", "u32");
impl_st!(__stcg_i32, i32, "cg", "u32");
impl_st!(__stcg_f32, f32, "cg", "f32");
52
/// Method-style access to the `cs` / `cg` load and store intrinsics.
///
/// Each method operates on the memory location that `self` refers to:
/// the `ld*` methods return the value read through `&self`, and the `st*`
/// methods write `val` through `&mut self`. The `i32` and `f32`
/// implementations in this file forward to the `__ld{cs,cg}_*` /
/// `__st{cs,cg}_*` free functions.
pub trait CacheStreamLoadStore: Sized {
    /// Value type produced by the loads and accepted by the stores.
    type Output;

    // ---- loads -----------------------------------------------------------

    /// Reads the value behind `self` using the `cs` load variant.
    fn ldcs(&self) -> Self::Output;

    /// Reads the value behind `self` using the `cg` load variant.
    fn ldcg(&self) -> Self::Output;

    // ---- stores ----------------------------------------------------------

    /// Writes `val` to the location behind `self` using the `cs` store variant.
    fn stcs(&mut self, val: Self::Output);

    /// Writes `val` to the location behind `self` using the `cg` store variant.
    fn stcg(&mut self, val: Self::Output);
}
60
61impl CacheStreamLoadStore for i32 {
62 type Output = i32;
63 #[gpu_macros::device]
64 #[inline(always)]
65 fn ldcs(&self) -> Self::Output {
66 __ldcs_i32(self)
67 }
68
69 #[gpu_macros::device]
70 #[inline(always)]
71 fn stcs(&mut self, val: Self::Output) {
72 __stcs_i32(self, val)
73 }
74
75 #[gpu_macros::device]
76 #[inline(always)]
77 fn ldcg(&self) -> Self::Output {
78 __ldcg_i32(self)
79 }
80
81 #[gpu_macros::device]
82 #[inline(always)]
83 fn stcg(&mut self, val: Self::Output) {
84 __stcg_i32(self, val)
85 }
86}
87
88impl CacheStreamLoadStore for f32 {
89 type Output = f32;
90
91 #[gpu_macros::device]
92 #[inline(always)]
93 fn ldcs(&self) -> Self::Output {
94 __ldcs_f32(self)
95 }
96
97 #[gpu_macros::device]
98 #[inline(always)]
99 fn stcs(&mut self, val: Self::Output) {
100 __stcs_f32(self, val)
101 }
102
103 #[gpu_macros::device]
104 #[inline(always)]
105 fn ldcg(&self) -> Self::Output {
106 __ldcg_f32(self)
107 }
108
109 #[gpu_macros::device]
110 #[inline(always)]
111 fn stcg(&mut self, val: Self::Output) {
112 __stcg_f32(self, val)
113 }
114}