Minor refactor to beable to easily switch number of dimensions

This commit is contained in:
Daniel Weber 2025-04-06 21:03:52 -04:00
parent 40a1be1c78
commit 95af7079b6
6 changed files with 86 additions and 87 deletions

View File

@ -14,8 +14,7 @@ You're on your own with figuring out how to install cuda...but in theory once th
```
cmake -S ./ -B build/
cmake --build build/
cd build/
./src/particle_sim_exe
./build/src/particle_sim_exe
```
Current application runs 10,000 particles at 50fps on an RTX3060. The gif below only appears to be moving slow because the time steps between renders is much lower then in the python example. The frame is actually higher here then it was in python.

View File

@ -1,2 +1,2 @@
void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec);
void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec, int num_dim);

View File

@ -14,8 +14,10 @@ class universe
int num_of_particles;
int array_size;
cublasHandle_t cublas_handle;
float * device_ray_position;
float * device_ray_direction;
float * device_boundary_position;
float * device_boundary_direction;
float * host_boundary_position;
float * host_boundary_direction;
float * device_position;
float * device_velocity;
float * device_acceleration;
@ -26,7 +28,7 @@ class universe
float * host_acceleration;
curandGenerator_t gen;
void process_collisions_with_other_particles(void);
void process_collisions_with_rays(void);
void process_collisions_with_boundary(void);
void process_particles(void);
/* data3d */
public:

View File

@ -7,7 +7,7 @@
SDL_Surface *bitmapSurface[4]= {};
SDL_Texture * bitmapTexture[4] = {};
static std::string color_paths[] = {"blue", "green", "red", "yellow"};
static std::string sprite_path = "../sdl_sprites/";
static std::string sprite_path = "sdl_sprites/";
void init_textures(SDL_Renderer *in_renderer)
{

View File

@ -1,3 +1,5 @@
#include <cstddef>
#include <cstdlib>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
@ -13,26 +15,11 @@
#include "util.hpp"
#define DELTA_TIME 0.005
#define NUM_DIMENSIONS 2
void universe::process_collisions_with_rays(void)
void universe::process_collisions_with_boundary(void)
{
float temp_directions[][3] =
{
{1.0,0.0,0.0},
{0.0,1.0,0.0},
{1.0,0.0,0.0},
{0.0,1.0,0.0}
};
float temp_positions[][3] =
{
{0.0,0.0,0.0},
{0.0,0.0,0.0},
{800.0,800.0,800.0},
{800.0,800.0,800.0}
};
float alpha = 1.0;
float beta = 0.0;
@ -43,57 +30,53 @@ void universe::process_collisions_with_rays(void)
long(num_of_particles*sizeof(float))));
CHECK_CUDA_ERROR(cudaMallocManaged(&(dev_direction_vec),
long(num_of_particles*3*sizeof(float))));
long(num_of_particles*NUM_DIMENSIONS*sizeof(float))));
for(int i = 0; i < 4; i++)
{
//Fill scratchpad A with vec1 = line_position - particle_position
for (int j = 0; j < 3; j++)
for (int j = 0; j < NUM_DIMENSIONS; j++)
{
CHECK_CUDA_ERROR(nppsSubCRev_32f(&device_position[array_size * j],
temp_positions[i][j], &device_scratch_padA[num_of_particles * j],
host_boundary_position[i*NUM_DIMENSIONS+j], &device_scratch_padA[num_of_particles * j],
num_of_particles));
}
alpha = 1.0;
beta = 0.0;
CHECK_CUDA_ERROR(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
num_of_particles, 1, 3, &alpha, device_scratch_padA,
num_of_particles, &device_ray_direction[i * 3], 3, &beta,
num_of_particles, 1, NUM_DIMENSIONS, &alpha, device_scratch_padA,
num_of_particles, &device_boundary_direction[i * NUM_DIMENSIONS], NUM_DIMENSIONS, &beta,
dev_dot_product_result, num_of_particles));
//Use dot product to find projections onto the line directions
CHECK_CUDA_ERROR(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T,
num_of_particles, 3, 1, &alpha, dev_dot_product_result,
num_of_particles, &device_ray_direction[i * 3], 3, &beta,
num_of_particles, NUM_DIMENSIONS, 1, &alpha, dev_dot_product_result,
num_of_particles, &device_boundary_direction[i * NUM_DIMENSIONS], NUM_DIMENSIONS, &beta,
device_scratch_padA, num_of_particles));
//calc norm
CHECK_CUDA_ERROR(nppsSqr_32f(device_scratch_padA, dev_direction_vec,
num_of_particles * 3));
num_of_particles * NUM_DIMENSIONS));
for(int index = 1; index < NUM_DIMENSIONS; index++)
{
CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, num_of_particles, &alpha,
&dev_direction_vec[num_of_particles * 1], 1,
dev_direction_vec, 1));
CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, num_of_particles, &alpha,
&dev_direction_vec[num_of_particles * 2], 1,
&dev_direction_vec[num_of_particles * index], 1,
dev_direction_vec, 1));
}
CHECK_CUDA_ERROR(nppsSqrt_32f_I(dev_direction_vec, num_of_particles));
//Normalize vector
CHECK_CUDA_ERROR(nppsDiv_32f_I(dev_direction_vec, device_scratch_padA,
num_of_particles));
for(int index = 0; index < NUM_DIMENSIONS; index++)
{
CHECK_CUDA_ERROR(nppsDiv_32f_I(dev_direction_vec,
&device_scratch_padA[1 * num_of_particles], num_of_particles));
CHECK_CUDA_ERROR(nppsDiv_32f_I(dev_direction_vec,
&device_scratch_padA[2 * num_of_particles], num_of_particles));
&device_scratch_padA[index * num_of_particles], num_of_particles));
}
custom_kernel_process_ray_collision(num_of_particles, dev_direction_vec,
device_scratch_padA, device_velocity, device_position,
array_size, &device_ray_direction[i*3]);
array_size, &device_boundary_direction[i*NUM_DIMENSIONS], NUM_DIMENSIONS);
}
@ -114,15 +97,15 @@ void universe::process_particles(void)
// // Update the positions of the particles:
// // self.positions[counti] = self.velocities[counti] * DELTA_TIME + self.positions[counti]
CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*3, &delta,
CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*NUM_DIMENSIONS, &delta,
device_velocity, 1, device_position, 1));
// Update the velocities of the particles:
// self.velocities[counti] = self.accelerations[counti] * DELTA_TIME + self.velocities[counti]
CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*3, &delta,
CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*NUM_DIMENSIONS, &delta,
device_acceleration, 1, device_velocity, 1));
process_collisions_with_rays();
process_collisions_with_boundary();
process_collisions_with_other_particles();
}
@ -130,7 +113,7 @@ void universe::process_particles(void)
void universe::get_positions(float ** in_out_buffer, int * size, int * stride)
{
CHECK_CUDA_ERROR(cudaMemcpy(host_position, device_position,
long(array_size * 3 * sizeof(float)), cudaMemcpyDeviceToHost));
long(array_size * NUM_DIMENSIONS * sizeof(float)), cudaMemcpyDeviceToHost));
*in_out_buffer = host_position;
*size = num_of_particles;
@ -139,7 +122,7 @@ void universe::get_positions(float ** in_out_buffer, int * size, int * stride)
universe::universe(int size)
{
curandCreateGenerator(&gen,
curandCreateGenerator(&gen,
CURAND_RNG_PSEUDO_DEFAULT);
CHECK_CUDA_ERROR(cublasCreate(&cublas_handle));
@ -148,49 +131,64 @@ curandCreateGenerator(&gen,
array_size = size;
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_position),
long(array_size*3*sizeof(float))));
long(array_size*NUM_DIMENSIONS*sizeof(float))));
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_velocity),
long(array_size*3*sizeof(float))));
long(array_size*NUM_DIMENSIONS*sizeof(float))));
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_acceleration),
long(array_size*3*sizeof(float))));
long(array_size*NUM_DIMENSIONS*sizeof(float))));
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_scratch_padA),
long(array_size*3*sizeof(float))));
long(array_size*NUM_DIMENSIONS*sizeof(float))));
host_position = (float *)malloc(long(array_size*3*sizeof(float)));
host_velocity = (float *)malloc(long(array_size*3*sizeof(float)));
host_acceleration = (float *)malloc(long(array_size*3*sizeof(float)));
host_position = (float *)malloc(long(array_size*NUM_DIMENSIONS*sizeof(float)));
host_velocity = (float *)malloc(long(array_size*NUM_DIMENSIONS*sizeof(float)));
host_acceleration = (float *)malloc(long(array_size*NUM_DIMENSIONS*sizeof(float)));
CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_position, array_size*2));
CHECK_CUDA_ERROR(nppsAddC_32f_I(20.0, device_position, array_size*3));
CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_velocity, array_size*2));
// CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_acceleration, num_of_particles*3));
CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_position, array_size*NUM_DIMENSIONS));
CHECK_CUDA_ERROR(nppsAddC_32f_I(20.0, device_position, array_size*NUM_DIMENSIONS));
CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_velocity, array_size*NUM_DIMENSIONS));
// CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_acceleration, num_of_particles*NUM_DIMENSIONS));
//Add rays that can collied with particles to the universe
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_ray_position),
long(4*3*sizeof(float))));
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_boundary_position),
long(4*NUM_DIMENSIONS*sizeof(float))));
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_ray_direction),
long(4*3*sizeof(float))));
CHECK_CUDA_ERROR(cudaMallocManaged(&(device_boundary_direction),
long(4*NUM_DIMENSIONS*sizeof(float))));
float temp_positions[] = {0.0,0.0,0.0,0.0,0.0,0.0,800.0,800.0,800.0,800.0,800.0,800.0};
float temp_directions[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0};
float boundary_direction[4*NUM_DIMENSIONS] = {0};
float boundary_point[4*NUM_DIMENSIONS] = {0};
for(size_t i = 0; i < 4; i++)
{
boundary_direction[i*NUM_DIMENSIONS+(i%2)] = 1;
if(i >= 2)
{
for (size_t j = 0; j < NUM_DIMENSIONS; j++)
{
boundary_point[i*NUM_DIMENSIONS+j] = 800;
}
}
}
CHECK_CUDA_ERROR(cudaMemcpy(device_ray_position, temp_positions,
long(sizeof(temp_positions)), cudaMemcpyHostToDevice));
host_boundary_position = (float *)malloc(sizeof(boundary_point));
host_boundary_direction = (float *)malloc(sizeof(boundary_direction));
memcpy(host_boundary_position, boundary_point, sizeof(boundary_point));
memcpy(host_boundary_direction, boundary_direction, sizeof(boundary_direction));
CHECK_CUDA_ERROR(cudaMemcpy(device_ray_direction, temp_directions,
long(sizeof(temp_directions)), cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpy(device_boundary_position, boundary_point,
long(sizeof(boundary_point)), cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpy(device_boundary_direction, boundary_direction,
long(sizeof(boundary_direction)), cudaMemcpyHostToDevice));
}
universe::~universe()
{
CHECK_CUDA_ERROR(cudaFree(device_position));
CHECK_CUDA_ERROR(cudaFree(device_velocity));
CHECK_CUDA_ERROR(cudaFree(device_acceleration));
@ -199,6 +197,8 @@ universe::~universe()
free(host_position);
free(host_velocity);
free(host_acceleration);
free(host_boundary_direction);
free(host_boundary_position);
CHECK_CUDA_ERROR(cublasDestroy(cublas_handle));
@ -207,7 +207,6 @@ universe::~universe()
void universe::process(void)
{
//First we need to process gravity

View File

@ -2,7 +2,7 @@
#include "stdio.h"
#define BLOCK_SIZE 32
__global__ static void process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec)
__global__ static void process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec, int num_dim)
{
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
@ -12,17 +12,17 @@ __global__ static void process_ray_collision(int size, float * norms, float * di
{
float speed_norm = 0.0;
float dot_product = 0.0;
for(int k = 0; k<3; k++)
for(int k = 0; k<num_dim; k++)
{
dot_product += vel[k*array_stride+i]*line_vec[k];
}
for(int k = 0; k<3; k++)
for(int k = 0; k<num_dim; k++)
{
float temp = (dot_product*(line_vec[k]));
speed_norm += pow(temp, 2);
}
speed_norm = sqrt(speed_norm)*2.0;
for(int k = 0; k<3; k++)
for(int k = 0; k<num_dim; k++)
{
vel[k*array_stride+i] = vel[k*array_stride+i] - speed_norm*direction[k*size+i];
pos[k*array_stride+i] = pos[k*array_stride+i] - direction[k*size+i]*(5.0-norms[i]);
@ -31,9 +31,8 @@ __global__ static void process_ray_collision(int size, float * norms, float * di
}
}
void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec)
void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec, int num_dim)
{
int numBlocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
process_ray_collision<<<numBlocks,BLOCK_SIZE>>>(size, norms, direction, vel, pos, array_stride, line_vec);
process_ray_collision<<<numBlocks,BLOCK_SIZE>>>(size, norms, direction, vel, pos, array_stride, line_vec, num_dim);
}