
Commit 6015ff4

Added Chapter 10
1 parent 5290a6b commit 6015ff4

File tree: 7 files changed (+234, -118 lines)


energy.txt

Lines changed: 0 additions & 100 deletions
This file was deleted.

src/04-error-check/readme.txt

Lines changed: 1 addition & 0 deletions
PyCUDA surfaces CUDA errors through Python's own exception mechanism, so no special error-checking code is needed.
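As an illustration (not part of the commit), here is a minimal sketch of what that note means in practice: a deliberately broken kernel makes SourceModule raise an ordinary Python exception (pycuda.driver.CompileError) that can be caught with try/except.

import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

# A kernel with a deliberate syntax error: the nvcc failure surfaces as a
# normal Python exception rather than a silent error code.
try:
    SourceModule(r"""
    __global__ void broken(float *x)
    {
        x[threadIdx.x] = ;  // deliberate syntax error
    }
    """)
except drv.CompileError as e:
    print("caught compile error:", e)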

src/09-atomic/a.py

Lines changed: 0 additions & 8 deletions
This file was deleted.

src/10-warp/reduce.py

Lines changed: 170 additions & 0 deletions
import pycuda.autoinit
import pycuda.driver as drv
import numpy, math, sys
from pycuda.compiler import DynamicSourceModule

# Select single or double precision from the command line ("-double").
if len(sys.argv) > 1 and sys.argv[1] == '-double':
    real_py = 'float64'
    real_cpp = 'double'
else:
    real_py = 'float32'
    real_cpp = 'float'

mod = DynamicSourceModule(r"""
#include <cooperative_groups.h>
using namespace cooperative_groups;
const unsigned FULL_MASK = 0xffffffff;

extern "C"{__global__ void reduce_syncwarp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    // Within the last warp, __syncwarp() is sufficient and cheaper than __syncthreads().
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncwarp();
    }

    if (tid == 0)
    {
        atomicAdd(d_y, s_y[0]);
    }
}}

extern "C"{__global__ void reduce_shfl(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    real y = s_y[tid];

    // Finish the last warp with register-to-register shuffles.
    for (int offset = 16; offset > 0; offset >>= 1)
    {
        y += __shfl_down_sync(FULL_MASK, y, offset);
    }

    if (tid == 0)
    {
        atomicAdd(d_y, y);
    }
}}

extern "C"{__global__ void reduce_cp(const real *d_x, real *d_y, const int N)
{
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int n = bid * blockDim.x + tid;
    extern __shared__ real s_y[];
    s_y[tid] = (n < N) ? d_x[n] : 0.0;
    __syncthreads();

    for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
    {
        if (tid < offset)
        {
            s_y[tid] += s_y[tid + offset];
        }
        __syncthreads();
    }

    real y = s_y[tid];

    // Same idea with cooperative groups: a 32-thread tile of the thread block.
    thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
    for (int i = g.size() >> 1; i > 0; i >>= 1)
    {
        y += g.shfl_down(y, i);
    }

    if (tid == 0)
    {
        atomicAdd(d_y, y);
    }
}}
""".replace('real', real_cpp), no_extern_c=True)
reduce_syncwarp = mod.get_function("reduce_syncwarp")
reduce_shfl = mod.get_function("reduce_shfl")
reduce_cp = mod.get_function("reduce_cp")


def timing(method):
    NUM_REPEATS = 10
    N = 100000000
    BLOCK_SIZE = 128
    grid_size = (N - 1) // BLOCK_SIZE + 1
    shared_size = numpy.dtype(real_py).itemsize * BLOCK_SIZE  # dynamic shared memory in bytes
    h_x = numpy.full((N, 1), 1.23, dtype=real_py)
    d_x = drv.mem_alloc(h_x.nbytes)
    drv.memcpy_htod(d_x, h_x)
    t_sum = 0
    t2_sum = 0
    for repeat in range(NUM_REPEATS + 1):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        h_y = numpy.zeros((1, 1), dtype=real_py)
        d_y = drv.mem_alloc(h_y.nbytes)
        drv.memcpy_htod(d_y, h_y)
        if method == 0:
            reduce_syncwarp(d_x, d_y, numpy.int32(N), grid=(grid_size, 1), block=(BLOCK_SIZE, 1, 1), shared=shared_size)
        elif method == 1:
            reduce_shfl(d_x, d_y, numpy.int32(N), grid=(grid_size, 1), block=(BLOCK_SIZE, 1, 1), shared=shared_size)
        elif method == 2:
            reduce_cp(d_x, d_y, numpy.int32(N), grid=(grid_size, 1), block=(BLOCK_SIZE, 1, 1), shared=shared_size)
        else:
            print("Error: wrong method")
            break
        drv.memcpy_dtoh(h_y, d_y)
        v_sum = h_y[0, 0]

        stop.record()
        stop.synchronize()
        elapsed_time = start.time_till(stop)
        print("Time = {:.6f} ms.".format(elapsed_time))
        if repeat > 0:
            t_sum += elapsed_time
            t2_sum += elapsed_time * elapsed_time
    t_ave = t_sum / NUM_REPEATS
    t_err = math.sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave)
    print("Time = {:.6f} +- {:.6f} ms.".format(t_ave, t_err))
    print("sum = ", v_sum)


print("\nusing syncwarp:")
timing(0)
print("\nusing shfl:")
timing(1)
print("\nusing cooperative group:")
timing(2)
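As a quick host-side cross-check (not part of the commit), the nominal result for the data used in timing() is simply 1.23 * N, and a double-precision NumPy reduction of the same float32 input gives a close reference value; the single-precision GPU result may show a small rounding error relative to it.

import numpy

N = 100000000
# Nominal value: N copies of 1.23.
print("nominal sum       =", 1.23 * N)
# Double-precision reduction of the same float32 data as used on the GPU
# (this also accounts for 1.23 being rounded when stored as float32).
h_x = numpy.full((N, 1), 1.23, dtype='float32')
print("numpy float64 sum =", h_x.sum(dtype='float64'))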

src/10-warp/warp_primitives.py

Lines changed: 62 additions & 0 deletions
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import DynamicSourceModule
import numpy, math, sys

mod = DynamicSourceModule(r"""
const unsigned WIDTH = 8;
const unsigned FULL_MASK = 0xffffffff;
__global__ void test_warp_primitives(void)
{
    int tid = threadIdx.x;
    int lane_id = tid % WIDTH;

    if (tid == 0) printf("threadIdx.x: ");
    printf("%2d ", tid);
    if (tid == 0) printf("\n");

    if (tid == 0) printf("lane_id: ");
    printf("%2d ", lane_id);
    if (tid == 0) printf("\n");

    // __ballot_sync builds a bit mask from a per-thread predicate.
    unsigned mask1 = __ballot_sync(FULL_MASK, tid > 0);
    unsigned mask2 = __ballot_sync(FULL_MASK, tid == 0);
    if (tid == 0) printf("FULL_MASK = %x\n", FULL_MASK);
    if (tid == 1) printf("mask1 = %x\n", mask1);
    if (tid == 0) printf("mask2 = %x\n", mask2);

    int result = __all_sync(FULL_MASK, tid);
    if (tid == 0) printf("all_sync (FULL_MASK): %d\n", result);

    result = __all_sync(mask1, tid);
    if (tid == 1) printf("all_sync (mask1): %d\n", result);

    result = __any_sync(FULL_MASK, tid);
    if (tid == 0) printf("any_sync (FULL_MASK): %d\n", result);

    result = __any_sync(mask2, tid);
    if (tid == 0) printf("any_sync (mask2): %d\n", result);

    // Shuffle functions exchange register values within segments of WIDTH lanes.
    int value = __shfl_sync(FULL_MASK, tid, 2, WIDTH);
    if (tid == 0) printf("shfl: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_up_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_up: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_down_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_down: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");

    value = __shfl_xor_sync(FULL_MASK, tid, 1, WIDTH);
    if (tid == 0) printf("shfl_xor: ");
    printf("%2d ", value);
    if (tid == 0) printf("\n");
}""")
test_warp_primitives = mod.get_function('test_warp_primitives')

test_warp_primitives(block=(16, 1, 1))

src/20-md-unified-memory/readme.txt

Lines changed: 1 addition & 0 deletions
PyCUDA's managed memory support is currently experimental and its stability is not guaranteed; the code will be added later.
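Not part of this commit: a minimal sketch of what such unified-memory code might look like, assuming pycuda.driver.managed_empty and mem_attach_flags are available in the installed PyCUDA build (the call pattern follows the PyCUDA documentation example); given the experimental status, it is a sketch rather than a tested implementation.

import numpy
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule(r"""
__global__ void scale(float *x)
{
    x[threadIdx.x] *= 2.0f;
}
""")
scale = mod.get_function("scale")

# Unified (managed) memory: a single allocation visible to both host and device.
x = drv.managed_empty(32, numpy.float32, mem_flags=drv.mem_attach_flags.GLOBAL)
x[:] = 1.0
scale(x, block=(32, 1, 1))
# The host must synchronize before touching managed memory again.
pycuda.autoinit.context.synchronize()
print(x[:4])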

src/md-gpu-array/a.py

Lines changed: 0 additions & 10 deletions
This file was deleted.
