workgroup_synchronization.hpp Source File

workgroup_synchronization.hpp Source File#

Composable Kernel: workgroup_synchronization.hpp Source File
workgroup_synchronization.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
6
7namespace ck {
8
9// Initialization flag of Barrier object, can be any value except for zero
10static constexpr int BarrierInitFlag = 0x7856;
11
12// 1) only the first thread-block in the synchronizaton group is supposed to call this function. It
13// is the responsibility of the user to ensure the two integer values in p_control_bits are zeros
14// before calling gms_init().
15// 2) Aftercalling gms_reset(), the two integer values in p_control_bits will be zeros, so no
16// repetitious initialization of p_control_bits buffer is required
17static __device__ void gms_init(int NumWarps, int* p_control_bits)
18{
19 union
20 {
21 int two32[2];
22 unsigned long one64;
23 } regs;
24
25 regs.two32[0] = BarrierInitFlag;
26 regs.two32[1] = NumWarps;
27
28 if(threadIdx.x == 0)
29 atomicCAS(reinterpret_cast<unsigned long*>(p_control_bits), 0, regs.one64);
30};
31
32// all the workgroups in the synchronization group is supposed to call this function
33static __device__ void gms_barrier(int* p_control_bits)
34{
35 constexpr int mask = WarpSize - 1;
36
37 if((threadIdx.x & mask) == 0)
38 {
39 // ensure the barrier object is initialized
40 do
41 {
42 const int r0 = __atomic_load_n(&p_control_bits[0], __ATOMIC_RELAXED);
43
44 if(r0 == BarrierInitFlag)
45 break;
46
47 } while(true);
48
49 // go ahead toward the barrier line
50 atomicSub(&p_control_bits[1], 1);
51
52 // wait until all warps have arrived
53 do
54 {
55 const int r1 = __atomic_load_n(&p_control_bits[1], __ATOMIC_RELAXED);
56
57 if(r1 == 0)
58 break;
59
60 } while(true);
61 };
62};
63
64// 1) Only the first thread-block in the synchronizaton group is supposed to call this function.
65// 2) Aftercalling gms_reset(), the two integer values in p_control_bits will be zeros, so no
66// repetitious initialization of p_control_bits buffer is required
67static __device__ void gms_reset(int* p_control_bits)
68{
69 // reset the barrier object
70 if(threadIdx.x == 0)
71 (void)atomicCAS(&p_control_bits[0], BarrierInitFlag, 0);
72};
73
74} // namespace ck
Definition ck.hpp:268