//VC++ 2010 + CUDA 4.1
#ifndef _CLOCK_KERNEL_H_
#define _CLOCK_KERNEL_H_
// This kernel computes a standard parallel reduction and evaluates the
// time it takes to do that for each block. The timing results are stored
// in device memory.
// Parallel minimum reduction with per-block timing.
//
// Each block reduces 2 * blockDim.x input floats down to their minimum and
// records a clock() sample at its start and end:
//   timer[blockIdx.x]              = block start tick
//   timer[blockIdx.x + gridDim.x]  = block end tick
//
// Launch requirement: dynamic shared memory of 2 * blockDim.x * sizeof(float).
// Note: every block reads the same input range, which is fine for a timing demo.
__global__ static void timedReduction(const float * input, float * output, clock_t * timer)
{
    // Dynamically sized shared scratch; byte count supplied at launch.
    extern __shared__ float shared[];

    const int tid = threadIdx.x;
    const int bid = blockIdx.x;

    // One thread per block records the start timestamp.
    if (tid == 0) timer[bid] = clock();

    // Stage the input into shared memory, two elements per thread.
    shared[tid]              = input[tid];
    shared[blockDim.x + tid] = input[blockDim.x + tid];

    // Tree reduction: halve the active range each pass, keeping the minimum.
    for (int stride = blockDim.x; stride > 0; stride >>= 1)
    {
        // Barrier is outside the divergent branch: all threads reach it, and
        // it orders the previous pass's writes before this pass's reads.
        __syncthreads();

        if (tid < stride)
        {
            const float mine  = shared[tid];
            const float other = shared[tid + stride];
            if (other < mine)
            {
                shared[tid] = other;
            }
        }
    }

    // Thread 0 publishes the block's minimum.
    if (tid == 0) output[bid] = shared[0];

    // Wait for the whole block to finish before sampling the end timestamp.
    __syncthreads();
    if (tid == 0) timer[bid + gridDim.x] = clock();
}
#endif // _CLOCK_KERNEL_H_
#include <stdio.h>
#include <stdlib.h>
#include <shrQATest.h>
#include <cutil_inline.h>
#include "clock_kernel.cu"
// This example shows how to use the clock function to measure the performance of
// a kernel accurately.
//
// Blocks are executed in parallel and out of order. Since there's no synchronization
// mechanism between blocks, we measure the clock once for each block. The clock
// samples are written to device memory.
#define NUM_BLOCKS 64
#define NUM_THREADS 256
// It's interesting to change the number of blocks and the number of threads to
// understand how to keep the hardware busy.
//
// Here are some numbers I get on my G80:
// blocks - clocks
// 1 - 3096
// 8 - 3232
// 16 - 3364
// 32 - 4615
// 64 - 9981
//
// With fewer than 16 blocks some of the multiprocessors of the device are idle. With
// more than 16 blocks you are using all the multiprocessors, but there's only one block per
// multiprocessor, which doesn't allow you to hide the latency of the memory. With
// more than 32 blocks the speed scales linearly.
// Host driver: launches timedReduction and reports the span between the
// earliest block-start tick and the latest block-end tick, as measured by
// the on-device clock() samples each block wrote.
int main(int argc, char** argv)
{
    shrQAStart(argc, argv);

    // Use the command-line specified CUDA device if given, otherwise pick
    // the device with the highest Gflops/s.
    if (cutCheckCmdLineFlag(argc, (const char **)argv, "device")) {
        int devID = cutilDeviceInit(argc, argv);
        if (devID < 0) {
            printf("No CUDA Capable devices found, exiting...\n");
            shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
        }
    } else {
        cudaSetDevice(cutGetMaxGflopsDeviceId());
    }

    float   *dinput  = NULL;
    float   *doutput = NULL;
    clock_t *dtimer  = NULL;

    clock_t timer[NUM_BLOCKS * 2];        // [0..NUM_BLOCKS) starts, [NUM_BLOCKS..) ends
    float   input[NUM_THREADS * 2];       // 2 elements per thread (see kernel)

    for (int i = 0; i < NUM_THREADS * 2; i++)
    {
        input[i] = (float)i;
    }

    cutilSafeCall(cudaMalloc((void**)&dinput,  sizeof(float)   * NUM_THREADS * 2));
    cutilSafeCall(cudaMalloc((void**)&doutput, sizeof(float)   * NUM_BLOCKS));
    cutilSafeCall(cudaMalloc((void**)&dtimer,  sizeof(clock_t) * NUM_BLOCKS * 2));

    cutilSafeCall(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));

    // Dynamic shared memory: 2 floats per thread, as the kernel requires.
    timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
    // Kernel launches don't return an error directly; pick up any
    // launch-configuration error here (consistent with the cutilSafeCall
    // convention used on every other CUDA call in this function).
    cutilSafeCall(cudaGetLastError());

    //cutilSafeCall(cudaMemcpy(output, doutput, sizeof(float) * NUM_BLOCKS, cudaMemcpyDeviceToHost));
    // This blocking copy also synchronizes with the kernel before we read timer[].
    cutilSafeCall(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));

    cutilSafeCall(cudaFree(dinput));
    cutilSafeCall(cudaFree(doutput));
    cutilSafeCall(cudaFree(dtimer));

    // Compute the difference between the last block end and the first block start.
    clock_t minStart = timer[0];
    clock_t maxEnd   = timer[NUM_BLOCKS];

    for (int i = 1; i < NUM_BLOCKS; i++)
    {
        minStart = timer[i] < minStart ? timer[i] : minStart;
        maxEnd   = timer[NUM_BLOCKS + i] > maxEnd ? timer[NUM_BLOCKS + i] : maxEnd;
    }

    // clock_t is wider than int on LP64 platforms, so passing it to "%d" is
    // undefined behavior; cast explicitly and use a matching conversion.
    printf("time = %ld\n", (long)(maxEnd - minStart));

    cutilDeviceReset();

    // This test always passes.
    shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
// NOTE(review): removed unrelated web-page boilerplate ("share" links and a
// "related downloads" recommendation list in Chinese) that was appended when
// this sample was scraped from a blog post. It was never part of the source
// and, being plain text after main(), prevented the file from compiling.