
Commit 1bbca72

Add Chapter 11

Made some modifications to the earlier code. Added Chapter 11. Added one code file for Chapter 21.
1 parent 6015ff4 commit 1bbca72

22 files changed: 484 additions, 29 deletions

bash.exe.stackdump

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+Stack trace:
+Frame        Function     Args
+00600000010  001800617BE  (00180251890, 0018023DFD1, 00000000058, 000FFFFB770)
+00600000010  001800490FA  (00000000000, 00100000000, 00000000000, 00000000001)
+00600000010  00180049132  (00000000000, 00000000000, 00000000058, 0018031E960)
+00600000010  0018006D9C9  (0000000000A, 000FFFFC940, 001800458BF, 00000000000)
+00600000010  0018006DB92  (00000000003, 000FFFFC940, 001800458BF, 000FFFFC940)
+00600000010  0018006EA4C  (000FFFFC940, 001802405E5, 001800EAF57, 0000000000D)
+00600000010  001800596A6  (000FFFF0000, 00000000000, 00000000000, 6C36EE15FFFFFFFF)
+00600000010  0018005A9C5  (00000000002, 0018031E270, 001800BE5F9, 00600040000)
+00600000010  0018005AE89  (001800C7664, 00000000000, 00000000000, 00000000000)
+000FFFFCCE0  0018005B149  (000FFFFCE00, 00000000000, 00000000030, 0000000002F)
+000FFFFCCE0  00180049877  (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  001800482C6  (00000000000, 00000000000, 00000000000, 00000000000)
+000FFFFFFF0  00180048374  (00000000000, 00000000000, 00000000000, 00000000000)
+End of stack trace

src/05-prerequisites-for-speedup/arithmetic_gpu.py

Lines changed: 4 additions & 1 deletion
@@ -47,7 +47,10 @@
     stop = drv.Event()
     start.record()
 
-    arithmetic(d_x, x0, numpy.int32(N), grid=((N-1)//128+1, 1), block=(128,1,1))
+    arithmetic(d_x, x0, numpy.int32(N),
+               grid=((N-1)//128+1, 1),
+               block=(128,1,1)
+               )
 
     stop.record()
     stop.synchronize()
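
A note on the launch configuration these calls share: (N-1)//128+1 is integer ceiling division, rounding the grid size up so that blocks of 128 threads cover all N elements. A minimal standalone sketch (not part of the commit):

    # Ceiling division for the grid size: enough BLOCK_SIZE-thread blocks
    # to cover all N elements, with at most one partially filled block.
    BLOCK_SIZE = 128
    for N in (1, 128, 129, 100000000):
        grid_x = (N - 1) // BLOCK_SIZE + 1
        assert grid_x * BLOCK_SIZE >= N           # every element is covered
        assert (grid_x - 1) * BLOCK_SIZE < N      # no fully idle block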

src/07-global-memory/matrix.py

Lines changed: 16 additions & 4 deletions
@@ -77,13 +77,25 @@ def timing(d_A, d_B, N, task):
     stop = drv.Event()
     start.record()
     if task == 0:
-        copy(d_A, d_B, numpy.int32(N), numpy.int32(TILE_DIM), grid=grid_size, block=block_size)
+        copy(d_A, d_B, numpy.int32(N), numpy.int32(TILE_DIM),
+             grid=grid_size,
+             block=block_size
+             )
     elif task == 1:
-        transpose1(d_A, d_B, numpy.int32(N), grid=grid_size, block=block_size)
+        transpose1(d_A, d_B, numpy.int32(N),
+                   grid=grid_size,
+                   block=block_size
+                   )
     elif task == 2:
-        transpose2(d_A, d_B, numpy.int32(N), grid=grid_size, block=block_size)
+        transpose2(d_A, d_B, numpy.int32(N),
+                   grid=grid_size,
+                   block=block_size
+                   )
     elif task == 3:
-        transpose3(d_A, d_B, numpy.int32(N), grid=grid_size, block=block_size)
+        transpose3(d_A, d_B, numpy.int32(N),
+                   grid=grid_size,
+                   block=block_size
+                   )
     else:
         print("Error: wrong task")
         return

src/08-shared-memory/bank_conflict.py

Lines changed: 8 additions & 2 deletions
@@ -83,9 +83,15 @@ def timing(d_A, d_B, N, task):
     stop = drv.Event()
     start.record()
     if task == 1:
-        transpose1(d_A, d_B, numpy.int32(N), grid=grid_size, block=block_size)
+        transpose1(d_A, d_B, numpy.int32(N),
+                   grid=grid_size,
+                   block=block_size
+                   )
     elif task == 2:
-        transpose2(d_A, d_B, numpy.int32(N), grid=grid_size, block=block_size)
+        transpose2(d_A, d_B, numpy.int32(N),
+                   grid=grid_size,
+                   block=block_size
+                   )
     else:
         print("Error: wrong task")
         return

src/08-shared-memory/reduce_gpu.py

Lines changed: 14 additions & 3 deletions
@@ -96,6 +96,7 @@ def timing(method):
     d_x = drv.mem_alloc(h_x.nbytes)
     h_y = numpy.zeros((grid_size,1), dtype=real_py)
     d_y = drv.mem_alloc(h_y.nbytes)
+    size_real = numpy.dtype(real_py).itemsize
     t_sum = 0
     t2_sum = 0
     for repeat in range(NUM_REPEATS+1):
@@ -105,11 +106,21 @@ def timing(method):
         start.record()
 
         if method==0:
-            reduce_global(d_x, d_y, grid=(grid_size, 1), block=(128,1,1))
+            reduce_global(d_x, d_y,
+                          grid=(grid_size, 1),
+                          block=(128,1,1)
+                          )
         elif method==1:
-            reduce_shared(d_x, d_y, numpy.int32(N), grid=((N-1)//128+1, 1), block=(128,1,1))
+            reduce_shared(d_x, d_y, numpy.int32(N),
+                          grid=((N-1)//128+1, 1),
+                          block=(128,1,1)
+                          )
         elif method==2:
-            reduce_dynamic(d_x, d_y, numpy.int32(N), grid=((N-1)//128+1, 1), block=(128,1,1), shared=numpy.zeros((1,1),dtype=real_py).nbytes*BLOCK_SIZE)
+            reduce_dynamic(d_x, d_y, numpy.int32(N),
+                           grid=((N-1)//128+1, 1),
+                           block=(128,1,1),
+                           shared=size_real*BLOCK_SIZE
+                           )
         else:
             print("Error: wrong method")
             break
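
The size_real change here (and in reduce_atomic.py and reduce.py below) swaps an allocation for a metadata lookup: numpy.dtype(real_py).itemsize reads the element size directly, where the old code built a throwaway 1x1 array just to query .nbytes. A minimal check of the equivalence (not part of the commit):

    import numpy
    for real_py in ('float32', 'float64'):
        old = numpy.zeros((1,1), dtype=real_py).nbytes  # temporary array, then its byte count
        new = numpy.dtype(real_py).itemsize             # dtype metadata, no allocation
        assert old == new    # 4 bytes for float32, 8 bytes for float64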

src/09-atomic/neighbor_gpu.py

Lines changed: 8 additions & 6 deletions
@@ -74,14 +74,16 @@ def timing(d_NN, d_NL, d_x, d_y, N, MN, atomic):
     start.record()
     if atomic:
         find_neighbor_atomic(d_NN, d_NL, d_x, d_y, numpy.int32(N),
-                             numpy.int32(MN), numpy.__dict__[real_py](cutoff_square),
-                             grid=((N-1)//128+1, 1),
-                             block=(128,1,1))
+            numpy.int32(MN), numpy.__dict__[real_py](cutoff_square),
+            grid=((N-1)//128+1, 1),
+            block=(128,1,1)
+            )
     else:
         find_neighbor_no_atomic(d_NN, d_NL, d_x, d_y, numpy.int32(N),
-                                numpy.__dict__[real_py](cutoff_square),
-                                grid=((N-1)//128+1, 1),
-                                block=(128,1,1))
+            numpy.__dict__[real_py](cutoff_square),
+            grid=((N-1)//128+1, 1),
+            block=(128,1,1)
+            )
     stop.record()
     stop.synchronize()
     elapsed_time = start.time_till(stop)
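
One detail in these calls: numpy.__dict__[real_py](cutoff_square) looks up the scalar constructor (numpy.float32 or numpy.float64) by name, so cutoff_square is passed to the kernel at the width the kernel source was compiled for. A small sketch of the same idea (my illustration, with a made-up cutoff value; getattr is the more conventional spelling):

    import numpy
    real_py = 'float32'                 # or 'float64' under the -double flag
    cutoff_square = 1.9 * 1.9           # hypothetical value for illustration
    a = numpy.__dict__[real_py](cutoff_square)    # the spelling used in the script
    b = getattr(numpy, real_py)(cutoff_square)    # equivalent attribute lookup
    assert a == b and a.dtype == numpy.dtype(real_py)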

src/09-atomic/reduce_atomic.py

Lines changed: 2 additions & 4 deletions
@@ -46,6 +46,7 @@ def timing():
     h_x = numpy.full((N,1), 1.23, dtype=real_py)
     d_x = drv.mem_alloc(h_x.nbytes)
     drv.memcpy_htod(d_x, h_x)
+    size_real = numpy.dtype(real_py).itemsize
     t_sum = 0
     t2_sum = 0
     for repeat in range(NUM_REPEATS+1):
@@ -61,10 +62,7 @@ def timing():
             numpy.int32(N),
             grid=(grid_size, 1),
             block=(128,1,1),
-            shared=numpy.zeros(
-                (1,1),
-                dtype=real_py
-            ).nbytes*BLOCK_SIZE
+            shared=size_real*BLOCK_SIZE
         )
 
         drv.memcpy_dtoh(h_y, d_y)

src/10-warp/reduce.py

Lines changed: 20 additions & 8 deletions
@@ -1,6 +1,7 @@
 import pycuda.autoinit
 import pycuda.driver as drv
-import numpy, math, sys
+import numpy as np
+import math, sys
 from pycuda.compiler import DynamicSourceModule
 
 if len(sys.argv)>2 and sys.argv[1]=='-double':
@@ -117,32 +118,43 @@
 reduce_shfl = mod.get_function("reduce_shfl")
 reduce_cp = mod.get_function("reduce_cp")
 
-
-
 def timing(method):
     NUM_REPEATS = 10
     N = 100000000
     BLOCK_SIZE = 128
     grid_size = (N-1)//128+1
-    h_x = numpy.full((N,1), 1.23, dtype=real_py)
+    h_x = np.full((N,1), 1.23, dtype=real_py)
     d_x = drv.mem_alloc(h_x.nbytes)
     drv.memcpy_htod(d_x, h_x)
+    size_real = np.dtype(real_py).itemsize
     t_sum = 0
     t2_sum = 0
     for repeat in range(NUM_REPEATS+1):
         start = drv.Event()
         stop = drv.Event()
         start.record()
 
-        h_y = numpy.zeros((1,1), dtype=real_py)
+        h_y = np.zeros((1,1), dtype=real_py)
         d_y = drv.mem_alloc(h_y.nbytes)
         drv.memcpy_htod(d_y, h_y)
         if method==0:
-            reduce_syncwarp(d_x, d_y, numpy.int32(N), grid=(grid_size, 1), block=(128,1,1), shared=numpy.zeros((1,1),dtype=real_py).nbytes*BLOCK_SIZE)
+            reduce_syncwarp(d_x, d_y, np.int32(N),
+                            grid=(grid_size, 1),
+                            block=(128,1,1),
+                            shared=size_real*BLOCK_SIZE
+                            )
         elif method==1:
-            reduce_shfl(d_x, d_y, numpy.int32(N), grid=((N-1)//128+1, 1), block=(128,1,1), shared=numpy.zeros((1,1),dtype=real_py).nbytes*BLOCK_SIZE)
+            reduce_shfl(d_x, d_y, np.int32(N),
+                        grid=((N-1)//128+1, 1),
+                        block=(128,1,1),
+                        shared=size_real*BLOCK_SIZE
+                        )
         elif method==2:
-            reduce_cp(d_x, d_y, numpy.int32(N), grid=((N-1)//128+1, 1), block=(128,1,1), shared=numpy.zeros((1,1),dtype=real_py).nbytes*BLOCK_SIZE)
+            reduce_cp(d_x, d_y, np.int32(N),
+                      grid=((N-1)//128+1, 1),
+                      block=(128,1,1),
+                      shared=size_real*BLOCK_SIZE
+                      )
         else:
             print("Error: wrong method")
             break

src/11-stream/a.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import numpy as np
+
+a = np.zeros((1,10))
+print(a)
+b = np.ones((1,3))
+b = a[0,3:5]
+print(type(b))
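
This scratch file appears to probe numpy slicing semantics: the assignment b = a[0,3:5] rebinds b to a slice of a, and a basic slice is a view sharing a's memory rather than a copy, so print(type(b)) reports numpy.ndarray either way. A minimal sketch of the view-versus-copy distinction (my gloss, not part of the commit):

    import numpy as np
    a = np.zeros((1,10))
    b = a[0,3:5]            # basic slice: a view sharing a's memory
    b[:] = 7                # writing through the view modifies a
    assert a[0,3] == 7
    c = a[0,3:5].copy()     # explicit copy: independent storage
    c[:] = 9
    assert a[0,3] == 7      # a is unchanged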

src/11-stream/host_kernel.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+import numpy, math, sys, time
+import pycuda.autoinit
+import pycuda.driver as drv
+from pycuda.compiler import DynamicSourceModule
+
+if len(sys.argv)>2 and sys.argv[1]=='-double':
+    real_py = 'float64'
+    real_cpp = 'double'
+else:
+    real_py = 'float32'
+    real_cpp = 'float'
+
+mod = DynamicSourceModule(r"""
+void __global__ gpu_sum(const real *x, const real *y, real *z, const int N)
+{
+    const int n = blockDim.x * blockIdx.x + threadIdx.x;
+    if (n < N)
+    {
+        z[n] = x[n] + y[n];
+    }
+}""".replace('real', real_cpp))
+gpu_sum = mod.get_function("gpu_sum")
+
+def cpu_sum(x, y, N_host):
+    z = numpy.empty_like(x, dtype=real_py)
+    for n in range(N_host):
+        z[n] = x[n] + y[n]
+    return z
+
+def timing(h_x, h_y, h_z, d_x, d_y, d_z, ratio, overlap):
+    NUM_REPEATS = 10
+    N = h_x.size
+    t_sum = 0
+    t2_sum = 0
+    for repeat in range(NUM_REPEATS+1):
+        start = time.time()
+
+        if not overlap:
+            cpu_sum(h_x, h_y, N//ratio)
+
+        gpu_sum(d_x, d_y, d_z, numpy.int32(N),
+                grid=((N-1)//128+1, 1),
+                block=(128,1,1)
+                )
+
+        if overlap:
+            cpu_sum(h_x, h_y, N//ratio)
+
+        elapsed_time = (time.time()-start)*1000
+        print("Time = {:.6f} ms.".format(elapsed_time))
+        if repeat > 0:
+            t_sum += elapsed_time
+            t2_sum += elapsed_time * elapsed_time
+    t_ave = t_sum / NUM_REPEATS
+    t_err = math.sqrt(t2_sum / NUM_REPEATS - t_ave * t_ave)
+    print("Time = {:.6f} +- {:.6f} ms.".format(t_ave, t_err))
+
+
+
+N = 100000000
+h_x = numpy.full((N,1), 1.23, dtype=real_py)
+h_y = numpy.full((N,1), 2.34, dtype=real_py)
+h_z = numpy.zeros_like(h_x, dtype=real_py)
+d_x = drv.mem_alloc(h_x.nbytes)
+d_y = drv.mem_alloc(h_y.nbytes)
+d_z = drv.mem_alloc(h_z.nbytes)
+drv.memcpy_htod(d_x, h_x)
+drv.memcpy_htod(d_y, h_y)
+
+print("Without CPU-GPU overlap (ratio = 1000000)")
+timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000000, False)
+print("With CPU-GPU overlap (ratio = 1000000)")
+timing(h_x, h_y, h_z, d_x, d_y, d_z, 1000000, True)
+
+print("Without CPU-GPU overlap (ratio = 100000)")
+timing(h_x, h_y, h_z, d_x, d_y, d_z, 100000, False)
+print("With CPU-GPU overlap (ratio = 100000)")
+timing(h_x, h_y, h_z, d_x, d_y, d_z, 100000, True)
+
+print("Without CPU-GPU overlap (ratio = 10000000)")
+timing(h_x, h_y, h_z, d_x, d_y, d_z, 10000000, False)
+print("With CPU-GPU overlap (ratio = 10000000)")
+timing(h_x, h_y, h_z, d_x, d_y, d_z, 10000000, True)
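
What host_kernel.py demonstrates, as I read it: a PyCUDA kernel launch is asynchronous and returns control to the host immediately, so placing cpu_sum after the launch lets the host loop run while the GPU computes (overlap=True), while placing it before the launch serializes the two; the ratio argument scales how much host work competes with the kernel. Note that the script never reads d_z back. A minimal follow-up sketch to synchronize and verify the GPU result (my addition, assuming the script's arrays):

    drv.Context.synchronize()       # wait for the asynchronous kernel to finish
    drv.memcpy_dtoh(h_z, d_z)       # copy z = x + y back to the host
    assert numpy.allclose(h_z, 1.23 + 2.34)   # elementwise sum of the fill values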
