1
+ import pycuda .autoinit
2
+ import pycuda .driver as drv
3
+ import numpy , math , sys
4
+ from pycuda .compiler import DynamicSourceModule
5
+
# Select the floating-point precision from the command line: passing
# '-double' as the first argument runs everything in float64/double,
# otherwise float32/float is used.
#
# BUG FIX: the original tested `len(sys.argv) > 2`, which requires TWO
# extra arguments, so `python script.py -double` (len(sys.argv) == 2)
# never enabled double precision.  The correct guard is `> 1`.
if len(sys.argv) > 1 and sys.argv[1] == '-double':
    real_py = 'float64'   # numpy dtype name used on the host side
    real_cpp = 'double'   # C type name substituted into the kernel source
else:
    real_py = 'float32'
    real_cpp = 'float'
+ mod = DynamicSourceModule (r"""
14
+ #include <cooperative_groups.h>
15
+ using namespace cooperative_groups;
16
+ const unsigned FULL_MASK = 0xffffffff;
17
+
18
+ extern "C"{void __global__ reduce_syncwarp(const real *d_x, real *d_y, const int N)
19
+ {
20
+ const int tid = threadIdx.x;
21
+ const int bid = blockIdx.x;
22
+ const int n = bid * blockDim.x + tid;
23
+ extern __shared__ real s_y[];
24
+ s_y[tid] = (n < N) ? d_x[n] : 0.0;
25
+ __syncthreads();
26
+
27
+ for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
28
+ {
29
+ if (tid < offset)
30
+ {
31
+ s_y[tid] += s_y[tid + offset];
32
+ }
33
+ __syncthreads();
34
+ }
35
+
36
+ for (int offset = 16; offset > 0; offset >>= 1)
37
+ {
38
+ if (tid < offset)
39
+ {
40
+ s_y[tid] += s_y[tid + offset];
41
+ }
42
+ __syncwarp();
43
+ }
44
+
45
+ if (tid == 0)
46
+ {
47
+ atomicAdd(d_y, s_y[0]);
48
+ }
49
+ }}
50
+
51
+ extern "C"{void __global__ reduce_shfl(const real *d_x, real *d_y, const int N)
52
+ {
53
+ const int tid = threadIdx.x;
54
+ const int bid = blockIdx.x;
55
+ const int n = bid * blockDim.x + tid;
56
+ extern __shared__ real s_y[];
57
+ s_y[tid] = (n < N) ? d_x[n] : 0.0;
58
+ __syncthreads();
59
+
60
+ for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
61
+ {
62
+ if (tid < offset)
63
+ {
64
+ s_y[tid] += s_y[tid + offset];
65
+ }
66
+ __syncthreads();
67
+ }
68
+
69
+ real y = s_y[tid];
70
+
71
+ for (int offset = 16; offset > 0; offset >>= 1)
72
+ {
73
+ y += __shfl_down_sync(FULL_MASK, y, offset);
74
+ }
75
+
76
+ if (tid == 0)
77
+ {
78
+ atomicAdd(d_y, y);
79
+ }
80
+ }
81
+ }
82
+
83
+ extern "C"{void __global__ reduce_cp(const real *d_x, real *d_y, const int N)
84
+ {
85
+ const int tid = threadIdx.x;
86
+ const int bid = blockIdx.x;
87
+ const int n = bid * blockDim.x + tid;
88
+ extern __shared__ real s_y[];
89
+ s_y[tid] = (n < N) ? d_x[n] : 0.0;
90
+ __syncthreads();
91
+
92
+ for (int offset = blockDim.x >> 1; offset >= 32; offset >>= 1)
93
+ {
94
+ if (tid < offset)
95
+ {
96
+ s_y[tid] += s_y[tid + offset];
97
+ }
98
+ __syncthreads();
99
+ }
100
+
101
+ real y = s_y[tid];
102
+
103
+ thread_block_tile<32> g = tiled_partition<32>(this_thread_block());
104
+ for (int i = g.size() >> 1; i > 0; i >>= 1)
105
+ {
106
+ y += g.shfl_down(y, i);
107
+ }
108
+
109
+ if (tid == 0)
110
+ {
111
+ atomicAdd(d_y, y);
112
+ }
113
+ }
114
+ }
115
+ """ .replace ('real' , real_cpp ), no_extern_c = True )
116
+ reduce_syncwarp = mod .get_function ("reduce_syncwarp" )
117
+ reduce_shfl = mod .get_function ("reduce_shfl" )
118
+ reduce_cp = mod .get_function ("reduce_cp" )
119
+
120
+
121
+
122
+ def timing (method ):
123
+ NUM_REPEATS = 10
124
+ N = 100000000
125
+ BLOCK_SIZE = 128
126
+ grid_size = (N - 1 )// 128 + 1
127
+ h_x = numpy .full ((N ,1 ), 1.23 , dtype = real_py )
128
+ d_x = drv .mem_alloc (h_x .nbytes )
129
+ drv .memcpy_htod (d_x , h_x )
130
+ t_sum = 0
131
+ t2_sum = 0
132
+ for repeat in range (NUM_REPEATS + 1 ):
133
+ start = drv .Event ()
134
+ stop = drv .Event ()
135
+ start .record ()
136
+
137
+ h_y = numpy .zeros ((1 ,1 ), dtype = real_py )
138
+ d_y = drv .mem_alloc (h_y .nbytes )
139
+ drv .memcpy_htod (d_y , h_y )
140
+ if method == 0 :
141
+ reduce_syncwarp (d_x , d_y , numpy .int32 (N ), grid = (grid_size , 1 ), block = (128 ,1 ,1 ), shared = numpy .zeros ((1 ,1 ),dtype = real_py ).nbytes * BLOCK_SIZE )
142
+ elif method == 1 :
143
+ reduce_shfl (d_x , d_y , numpy .int32 (N ), grid = ((N - 1 )// 128 + 1 , 1 ), block = (128 ,1 ,1 ), shared = numpy .zeros ((1 ,1 ),dtype = real_py ).nbytes * BLOCK_SIZE )
144
+ elif method == 2 :
145
+ reduce_cp (d_x , d_y , numpy .int32 (N ), grid = ((N - 1 )// 128 + 1 , 1 ), block = (128 ,1 ,1 ), shared = numpy .zeros ((1 ,1 ),dtype = real_py ).nbytes * BLOCK_SIZE )
146
+ else :
147
+ print ("Error: wrong method" )
148
+ break
149
+ drv .memcpy_dtoh (h_y , d_y )
150
+ v_sum = h_y [0 ,0 ]
151
+
152
+ stop .record ()
153
+ stop .synchronize ()
154
+ elapsed_time = start .time_till (stop )
155
+ print ("Time = {:.6f} ms." .format (elapsed_time ))
156
+ if repeat > 0 :
157
+ t_sum += elapsed_time
158
+ t2_sum += elapsed_time * elapsed_time
159
+ t_ave = t_sum / NUM_REPEATS
160
+ t_err = math .sqrt (t2_sum / NUM_REPEATS - t_ave * t_ave )
161
+ print ("Time = {:.6f} +- {:.6f} ms." .format (t_ave , t_err ))
162
+ print ("sum = " , v_sum )
163
+
164
+
165
+ print ("\n using syncwarp:" )
166
+ timing (0 )
167
+ print ("\n using shfl:" )
168
+ timing (1 )
169
+ print ("\n using cooperative group:" )
170
+ timing (2 )
0 commit comments