APPENDIX B. Example CUDA+MPI program: computing the interference patterns of qubit transition probabilities by the quantum Monte Carlo method
Module qubit_CUDA.cpp

// qubit_CUDA.cpp: defines the entry point for the console application.
// #include "stdafx.h"
#include "cudacomplex.h"
#include "math.h"
#include "fstream"
#include
#include
#include
#include
#include
#include
#include
#include
#include using namespace std;
int _tmain(int argc, _TCHAR* argv[])
{
    int T, N;
    N = 1024;
    T = 1000;
    cudaSetDevice(2);

    float *av_a, *av_b, *av_a_dev, *av_b_dev, *rand_dev, *rand;
    singlecomplex *a_host, *b_host, *a_dev, *b_dev;

    av_a = new float[T];
    av_b = new float[T];
    rand = new float[T * N];
    a_host = new singlecomplex[T * N];
    b_host = new singlecomplex[T * N];

    for (int i = 0; i <= T - 1; i++)
    {
        av_a[i] = 0;
        av_b[i] = 0;
    }
    // initial state of every trajectory: a = 1, b = 0
    for (int i = 0; i <= T * N - 1; i++)
    {
        rand[i] = 0.f;
        a_host[i] = make_singlecomplex(1, 0);
        b_host[i] = make_singlecomplex(0, 0);
    }
CUDA_SAFE_CALL( cudaMalloc((void**) &av_a_dev, T * sizeof *av_a_dev));
CUDA_SAFE_CALL( cudaMalloc((void**) &av_b_dev, T * sizeof *av_b_dev));
CUDA_SAFE_CALL( cudaMalloc((void**) &rand_dev, T * N * sizeof *rand_dev));
CUDA_SAFE_CALL( cudaMalloc((void**) &a_dev, T * N * sizeof *a_dev));
CUDA_SAFE_CALL( cudaMalloc((void**) &b_dev, T * N * sizeof *b_dev)); CUDA_SAFE_CALL( cudaMemcpy(a_dev, a_host, T * N * sizeof *a_dev, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL( cudaMemcpy(b_dev, b_host, T * N * sizeof *b_dev, cudaMemcpyHostToDevice));
    float Am, eps;   // model parameters passed to the trajectory routine; they are not assigned in this listing
    volatile DWORD dwStart;
    dwStart = GetTickCount();

    montecarlo(av_a_dev, av_b_dev, a_dev, b_dev, rand_dev, N, T, Am, eps);

    printf_s("For %d steps, %d milliseconds\n", N, GetTickCount() - dwStart);

    CUDA_SAFE_CALL( cudaMemcpy( av_a, av_a_dev, T * sizeof *av_a_dev, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL( cudaMemcpy( av_b, av_b_dev, T * sizeof *av_b_dev, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL( cudaMemcpy(rand, rand_dev, T * N * sizeof *rand_dev, cudaMemcpyDeviceToHost));
CUDA_SAFE_CALL( cudaMemcpy( a_host, a_dev, T * N * sizeof *a_dev, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL( cudaMemcpy( b_host, b_dev, T * N * sizeof *b_dev, cudaMemcpyDeviceToHost));

    // accumulate |a|^2 and |b|^2 over the N trajectories for every time step
    for (int i = 0; i < T; i++)
        for (int m = 0; m < N; m++)
        {
            av_a[i] = a_host[m*T + i].abs() * a_host[m*T + i].abs() + av_a[i];
            av_b[i] = b_host[m*T + i].abs() * b_host[m*T + i].abs() + av_b[i];
        }
    // write the averaged coefficients abs(a)^2, abs(b)^2 to files
    ofstream outA("AverageA.txt", ios::out);
    ofstream outB("AverageB.txt", ios::out);
    for (int i = 0; i <= T - 1; i++)
    {
        outA << i << " " << av_a[i]/N << "\n";
        outB << i << " " << av_b[i]/N << "\n";
    }
    return 0;
}
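The montecarlo routine called above is implemented in a separate CUDA module that is not reproduced in this appendix. For orientation only, a minimal sketch of such a trajectory kernel is given below. It assumes a two-level system driven with amplitude Am at detuning eps, an explicit Euler step of size dt, and pre-generated uniform random numbers that decide the quantum jumps; the kernel name, the float2 representation of the complex amplitudes, and the exact update rule are illustrative assumptions, not the program's actual implementation.

#include <cuda_runtime.h>

// Hypothetical sketch only: each thread evolves one trajectory m of the amplitudes (a, b)
// for T time steps and stores the whole history in the layout a[m*T + i] expected by the host.
__global__ void montecarlo_kernel(float2 *a, float2 *b, const float *rnd,
                                  int N, int T, float Am, float eps, float dt)
{
    int m = blockIdx.x * blockDim.x + threadIdx.x;   // trajectory index
    if (m >= N) return;

    float2 am = make_float2(1.f, 0.f);               // initial state: a = 1
    float2 bm = make_float2(0.f, 0.f);               //                b = 0

    for (int i = 0; i < T; i++) {
        // explicit Euler step of i*da/dt = eps*a + Am*b, i*db/dt = Am*a - eps*b
        float2 an, bn;
        an.x = am.x + dt * ( eps * am.y + Am * bm.y);
        an.y = am.y + dt * (-eps * am.x - Am * bm.x);
        bn.x = bm.x + dt * ( Am * am.y - eps * bm.y);
        bn.y = bm.y + dt * (-Am * am.x + eps * bm.x);

        // stochastic part: with probability ~ |b|^2 * dt collapse back to the ground state
        float pb = bn.x * bn.x + bn.y * bn.y;
        if (rnd[m * T + i] < pb * dt) {
            an = make_float2(1.f, 0.f);
            bn = make_float2(0.f, 0.f);
        }

        am = an;
        bm = bn;
        a[m * T + i] = am;   // store the trajectory history
        b[m * T + i] = bm;
    }
}

A host-side wrapper would pick the launch configuration from maxThreadsPerBlock and multiProcessorCount (as queried in Arr.cpp), launch roughly N/blockDim.x blocks of such a kernel, and afterwards the averaging loops shown above would be run on the host.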
Module calc.cpp

#include "stdafx.h"
#include "cudacomplex.h"
#include "math.h"
#include "fstream"
#include
#include
#include
#include
#include
#include
#include
#include
#include using namespace std; #include "arr.h" void calc (float *data, float *res, int maxThreadsPerBlock, int NumMultiprocessors)
{
    float Am = data[0];    // first task parameter received from the master
    float eps = data[1];   // second task parameter received from the master
    float sum = 0.f;

    int T, N;
    N = 5000;
    T = 1000;

    float *av_a, *av_b, *rand_dev;
    singlecomplex *a_host, *b_host, *a_dev, *b_dev;

    av_a = new float[T];
    av_b = new float[T];
    a_host = new singlecomplex[T * N];
    b_host = new singlecomplex[T * N];

    for (int i = 0; i < T; i++)
    {
        av_a[i] = 0;
        av_b[i] = 0;
    }
    for (int i = 0; i < T * N; i++)
    {
        a_host[i] = make_singlecomplex(1, 0);
        b_host[i] = make_singlecomplex(0, 0);
    }
//CUDA_SAFE_CALL( cudaMalloc((void**) &rand_dev, T * N * sizeof *rand_dev));
//CUDA_SAFE_CALL( cudaMalloc((void**) &a_dev, T * N * sizeof *a_dev));
//CUDA_SAFE_CALL( cudaMalloc((void**) &b_dev, T * N * sizeof *b_dev)); //CUDA_SAFE_CALL( cudaMemcpy(a_dev, a_host, T * N * sizeof *a_dev, cudaMemcpyHostToDevice));
//CUDA_SAFE_CALL( cudaMemcpy(b_dev, b_host, T * N * sizeof *b_dev, cudaMemcpyHostToDevice)); cudaMalloc((void**) &rand_dev, T * N * sizeof *rand_dev);
cudaMalloc((void**) &a_dev, T * N * sizeof *a_dev);
cudaMalloc((void**) &b_dev, T * N * sizeof *b_dev); cudaMemcpy(a_dev, a_host, T * N * sizeof *a_dev, cudaMemcpyHostToDevice);
cudaMemcpy(b_dev, b_host, T * N * sizeof *b_dev, cudaMemcpyHostToDevice);
    // routine in which the trajectories are computed on the GPU
    montecarlo( a_dev, b_dev, rand_dev, N, T, Am , eps, maxThreadsPerBlock, NumMultiprocessors);

    cudaMemcpy( a_host, a_dev, T * N * sizeof *a_dev, cudaMemcpyDeviceToHost);
    cudaMemcpy( b_host, b_dev, T * N * sizeof *b_dev, cudaMemcpyDeviceToHost);
    // obtain the averaged trajectory
    for (int i = 0; i < T; i++)
        for (int m = 0; m < N; m++)
        {
            av_a[i] = a_host[m*T + i].abs() * a_host[m*T + i].abs() + av_a[i];
            //av_b[i] = b_host[m*T + i].abs() * b_host[m*T + i].abs() + av_b[i];
        }
    // pack the result: the task parameters and the averaged abs(a)^2 at the final time step
    res[0] = (float) Am;
    res[1] = (float) eps;
    res[2] = (float) av_a[T-1] / N;
cudaFree(rand_dev);
cudaFree(a_dev);
    cudaFree(b_dev);

    delete [] av_a;
    delete [] av_b;
    delete [] a_host;
    delete [] b_host;
}

Module Net.cpp

#include <mpi.h>
#include <iostream>
#include <fstream>
#include "arr.h"
using namespace std;

int n; // number of busy worker processes

void
init_net()
{
n = 0;
}

int
is_free ()
{
    int proc_num;
    MPI_Comm_size (MPI_COMM_WORLD, &proc_num);
    return (n < (proc_num - 1));
}

int
are_all_free ()   // 0 means false, 1 means true
{
    return (n == 0);
}
// on the master side
void
send_data (float* data)
{
    MPI_Status status;
    //cout << "send " << (float) data[0] << " " << (float) data[1] << "\n";

    // wait for a work request from any idle worker
    wmsg_t msg = WL_NO;
    MPI_Recv ((void *) &msg, 1, MPI_INT, MPI_ANY_SOURCE, TAG_WMSG, MPI_COMM_WORLD, &status);
    int rank = status.MPI_SOURCE;

    // acknowledge the request and send the task parameters
    msg = WL_OK;
    MPI_Send ((void *) &msg, 1, MPI_INT, rank, TAG_WMSG, MPI_COMM_WORLD);
    MPI_Send ((void *) data, 2, MPI_FLOAT, rank, TAG_DATA, MPI_COMM_WORLD);
    n++;
}

// on the slave side
int
receive_data (float* data)
{
    MPI_Status status;
    wmsg_t msg = WL_REQUEST;

    // ask the master for work and wait for its reply
    MPI_Send ((void *) &msg, 1, MPI_INT, 0, TAG_WMSG, MPI_COMM_WORLD);
    MPI_Recv ((void *) &msg, 1, MPI_INT, 0, TAG_WMSG, MPI_COMM_WORLD, &status);
    if (msg == WL_OK)
    {
        MPI_Recv ((void *) data, 2, MPI_FLOAT, 0, TAG_DATA, MPI_COMM_WORLD, &status);
        //cout << "receive " << data[0] << " " << data[1] << "\n";
        return OK;
    }
    return NO;
}

// on the slave side
void
send_result (float *res)
{
    MPI_Send ((void *) res, 3, MPI_FLOAT, 0, TAG_RESULT, MPI_COMM_WORLD);
}

// on the master side
int
receive_result (float *res)
{
MPI_Status status;
MPI_Recv ((void *) res, 3, MPI_FLOAT, MPI_ANY_SOURCE, TAG_RESULT, MPI_COMM_WORLD, &status);
    n--;
    return OK;
}
void
send_end ()
{
    MPI_Status status;
    wmsg_t msg;
    int proc_num, rank, i;

    MPI_Comm_size (MPI_COMM_WORLD, &proc_num);

    // answer every worker's final request with WL_NO so that it terminates
    for (i = 1; i < proc_num; i++)
    {
        MPI_Recv ((void *) &msg, 1, MPI_INT, MPI_ANY_SOURCE, TAG_WMSG, MPI_COMM_WORLD, &status);
        rank = status.MPI_SOURCE;
        msg = WL_NO;
        MPI_Send ((void *) &msg, 1, MPI_INT, rank, TAG_WMSG, MPI_COMM_WORLD);
    }
}

Module io.cpp

#include <iostream>
#include <fstream>
#include "arr.h"
using namespace std;

int nx, ny;                          // current position in the parameter grid
float Xmin = 0.f, Xmax = 10.f;       // range of the first parameter (Am)
float Ymin = 0.f, Ymax = 10.f;       // range of the second parameter (eps)
const int N_x = 100, N_y = 100;      // grid resolution
ofstream outfile;
int num_flush;

void
init_io()
{
    nx = 0;
    ny = 0;
    outfile.open ("test_mpi.txt", ios::out);
    //outfile.flush();
    num_flush = 0;
}
void
close_io()
{
    outfile.flush();
    outfile.close();
}

int
load_data (float *data)
{
    // enumerate the N_x x N_y grid of parameter points row by row
    if (ny != N_y)
    {
        data[0] = (float) (Xmin + nx * (Xmax - Xmin) / (N_x - 1.0f));
        data[1] = (float) (Ymin + ny * (Ymax - Ymin) / (N_y - 1.0f));
        nx = nx + 1;
        if (nx == N_x)
        {
            nx = 0;
            ny = ny + 1;
        }
        return OK;
    }
return NO;
}
void
save_result (float *res)
{
outfile << res[0] << " " << res[1] << " " << res[2] <<"\n" ;
} Модуль Arr.cpp #include
#include
#include "arr.h"
#include #include using namespace std; void master ();
void slave (); int
main (int argc, char **argv)
{
    int myid;
    MPI_Init (&argc, &argv);
    MPI_Comm_rank (MPI_COMM_WORLD, &myid);

    double starttime, endtime;
    starttime = MPI_Wtime();

    // process 0 distributes the tasks, all other processes compute them
    if (myid == 0)
        master ();
    else
        slave ();

    printf ("process %d: the work is done\n", myid);
    endtime = MPI_Wtime();
    printf ("That took %f seconds\n", endtime - starttime);

    MPI_Finalize ();
    return 0;
}

void
master ()
{
    float data[2];
    float res[3];

    init_io();
    init_net();

    // hand out the parameter points one by one
    while (load_data ((float *) data))
    {
        if (! is_free ())          // no idle worker: collect one finished result first
        {
            receive_result (res);
            save_result (res);
        }
        send_data ((float *) data);
    }

    // collect the remaining results
    while (! are_all_free ())
    {
        receive_result (res);
        save_result (res);
    }

    close_io();
    send_end ();
}
void
slave ()
{
    float data[2];
    float res[3];
    int j;
    j = 0;

    int myid;
    MPI_Comm_rank (MPI_COMM_WORLD, &myid);

    // map each MPI process to one of the two GPUs on its node
    int device = myid % 2;
    cudaSetDevice(device);
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, device);

    // process tasks until the master answers with WL_NO
    while (receive_data ((float *) data))
    {
        calc (data, res, deviceProp.maxThreadsDim[0], deviceProp.multiProcessorCount);
        send_result (res);
        j = j + 1;
    }
    printf ("------> j = %d\n", j);
}
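Assuming all five modules are linked, together with the CUDA trajectory module and the header arr.h, into a single executable (the name qubit_mpi below is only a placeholder), the program could be started with one master and, for example, four worker processes as

    mpiexec -n 5 ./qubit_mpi

Each worker then selects one of two GPUs via myid % 2, matching the dual-GPU nodes assumed by the code, and the master writes the resulting grid of values (Am, eps, abs(a)^2) to test_mpi.txt.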