From 95af7079b6d0aa4d488c0ef763284db912c167f5 Mon Sep 17 00:00:00 2001
From: Daniel Weber <djweber12@hotmail.com>
Date: Sun, 6 Apr 2025 21:03:52 -0400
Subject: [PATCH] Minor refactor to beable to easily switch number of
 dimensions

---
 Cpp_Stuff/README.md                     |   3 +-
 Cpp_Stuff/include/custom_kernels.cuh    |   2 +-
 Cpp_Stuff/include/universe.hpp          |   8 +-
 Cpp_Stuff/src/renderer/textures.cpp     |   4 +-
 Cpp_Stuff/src/simulator/universe.cpp    | 139 ++++++++++++------------
 Cpp_Stuff/src/utility/custom_kernels.cu |  17 ++-
 6 files changed, 86 insertions(+), 87 deletions(-)
diff --git a/Cpp_Stuff/README.md b/Cpp_Stuff/README.md
index 5699073..998662d 100644
--- a/Cpp_Stuff/README.md
+++ b/Cpp_Stuff/README.md
@@ -14,8 +14,7 @@ You're on your own with figuring out how to install cuda...but in theory once th
 ```
 cmake -S ./ -B build/
 cmake --build build/
-cd build/
-./src/particle_sim_exe
+./build/src/particle_sim_exe
 ```
 
 Current application runs 10,000 particles at 50fps on an RTX3060. The gif below only appears to be moving slow because the time steps between renders is much lower then in the python example. The frame is actually higher here then it was in python.
diff --git a/Cpp_Stuff/include/custom_kernels.cuh b/Cpp_Stuff/include/custom_kernels.cuh
index 1937bb4..0d282e6 100644
--- a/Cpp_Stuff/include/custom_kernels.cuh
+++ b/Cpp_Stuff/include/custom_kernels.cuh
@@ -1,2 +1,2 @@
 
-void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec);
+void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec, int num_dim);
diff --git a/Cpp_Stuff/include/universe.hpp b/Cpp_Stuff/include/universe.hpp
index 0abe543..fee8d92 100644
--- a/Cpp_Stuff/include/universe.hpp
+++ b/Cpp_Stuff/include/universe.hpp
@@ -14,8 +14,10 @@ class universe
         int num_of_particles;
         int array_size;
         cublasHandle_t cublas_handle;
-        float * device_ray_position;
-        float * device_ray_direction;
+        float * device_boundary_position;
+        float * device_boundary_direction;
+        float * host_boundary_position;
+        float * host_boundary_direction;
         float * device_position;
         float * device_velocity;
         float * device_acceleration;
@@ -26,7 +28,7 @@ class universe
         float * host_acceleration;
         curandGenerator_t gen;
         void process_collisions_with_other_particles(void);
-        void process_collisions_with_rays(void);
+        void process_collisions_with_boundary(void);
         void process_particles(void);
      /* data3d */
     public:
diff --git a/Cpp_Stuff/src/renderer/textures.cpp b/Cpp_Stuff/src/renderer/textures.cpp
index 4981abc..a5a730a 100644
--- a/Cpp_Stuff/src/renderer/textures.cpp
+++ b/Cpp_Stuff/src/renderer/textures.cpp
@@ -7,7 +7,7 @@
 SDL_Surface *bitmapSurface[4]= {};
 SDL_Texture * bitmapTexture[4] = {};
 static std::string color_paths[] = {"blue", "green", "red", "yellow"};
-static std::string sprite_path = "../sdl_sprites/";
+static std::string sprite_path = "sdl_sprites/";
 
 void init_textures(SDL_Renderer *in_renderer)
 {
@@ -16,7 +16,7 @@ void init_textures(SDL_Renderer *in_renderer)
         bitmapSurface[color] = SDL_LoadBMP((sprite_path+color_paths[color]+".bmp").c_str());
         bitmapTexture[color] = SDL_CreateTextureFromSurface(in_renderer, bitmapSurface[color]);
     }
-    
+
 }
 
 SDL_Texture * get_texture(int color)
diff --git a/Cpp_Stuff/src/simulator/universe.cpp b/Cpp_Stuff/src/simulator/universe.cpp
index 2dcd6a7..3d9e841 100644
--- a/Cpp_Stuff/src/simulator/universe.cpp
+++ b/Cpp_Stuff/src/simulator/universe.cpp
@@ -1,3 +1,5 @@
+#include <cstddef>
+#include <cstdlib>
 #include <stdio.h>
 #include <stdlib.h>
 #include <iostream>
@@ -13,26 +15,11 @@
 #include "util.hpp"
 
 #define DELTA_TIME 0.005
+#define NUM_DIMENSIONS 2
 
 
-void universe::process_collisions_with_rays(void)
+void universe::process_collisions_with_boundary(void)
 {
-    float temp_directions[][3] =
-    {
-        {1.0,0.0,0.0},
-        {0.0,1.0,0.0},
-        {1.0,0.0,0.0},
-        {0.0,1.0,0.0}
-    };
-
-    float temp_positions[][3] =
-    {
-        {0.0,0.0,0.0},
-        {0.0,0.0,0.0},
-        {800.0,800.0,800.0},
-        {800.0,800.0,800.0}
-    };
-
     float alpha = 1.0;
     float beta = 0.0;
 
@@ -43,57 +30,53 @@ void universe::process_collisions_with_rays(void)
                 long(num_of_particles*sizeof(float))));
 
     CHECK_CUDA_ERROR(cudaMallocManaged(&(dev_direction_vec),
-                long(num_of_particles*3*sizeof(float))));
+                long(num_of_particles*NUM_DIMENSIONS*sizeof(float))));
 
     for(int i = 0; i < 4; i++)
     {
         //Fill scratchpad A with vec1 = line_position - particle_position
-        for (int j = 0; j < 3; j++)
+        for (int j = 0; j < NUM_DIMENSIONS; j++)
         {
             CHECK_CUDA_ERROR(nppsSubCRev_32f(&device_position[array_size * j],
-                        temp_positions[i][j], &device_scratch_padA[num_of_particles * j],
+                        host_boundary_position[i*NUM_DIMENSIONS+j], &device_scratch_padA[num_of_particles * j],
                         num_of_particles));
         }
         alpha = 1.0;
         beta = 0.0;
         CHECK_CUDA_ERROR(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N,
-                    num_of_particles, 1, 3, &alpha, device_scratch_padA,
-                    num_of_particles, &device_ray_direction[i * 3], 3, &beta,
+                    num_of_particles, 1, NUM_DIMENSIONS, &alpha, device_scratch_padA,
+                    num_of_particles, &device_boundary_direction[i * NUM_DIMENSIONS], NUM_DIMENSIONS, &beta,
                     dev_dot_product_result, num_of_particles));
 
         //Use dot product to find projections onto the line directions
         CHECK_CUDA_ERROR(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T,
-                    num_of_particles, 3, 1, &alpha, dev_dot_product_result,
-                    num_of_particles, &device_ray_direction[i * 3], 3, &beta,
+                    num_of_particles, NUM_DIMENSIONS, 1, &alpha, dev_dot_product_result,
+                    num_of_particles, &device_boundary_direction[i * NUM_DIMENSIONS], NUM_DIMENSIONS, &beta,
                     device_scratch_padA, num_of_particles));
 
         //calc norm
         CHECK_CUDA_ERROR(nppsSqr_32f(device_scratch_padA, dev_direction_vec,
-                    num_of_particles * 3));
+                    num_of_particles * NUM_DIMENSIONS));
 
-        CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, num_of_particles, &alpha,
-                    &dev_direction_vec[num_of_particles * 1], 1,
-                    dev_direction_vec, 1));
-
-        CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, num_of_particles, &alpha,
-                    &dev_direction_vec[num_of_particles * 2], 1,
-                    dev_direction_vec, 1));
+        for(int index = 1; index < NUM_DIMENSIONS; index++)
+        {
+            CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, num_of_particles, &alpha,
+                        &dev_direction_vec[num_of_particles * index], 1,
+                        dev_direction_vec, 1));
+        }
 
         CHECK_CUDA_ERROR(nppsSqrt_32f_I(dev_direction_vec, num_of_particles));
 
         //Normalize vector
-        CHECK_CUDA_ERROR(nppsDiv_32f_I(dev_direction_vec, device_scratch_padA,
-                    num_of_particles));
-
-        CHECK_CUDA_ERROR(nppsDiv_32f_I(dev_direction_vec,
-                    &device_scratch_padA[1 * num_of_particles], num_of_particles));
-
-        CHECK_CUDA_ERROR(nppsDiv_32f_I(dev_direction_vec,
-                    &device_scratch_padA[2 * num_of_particles], num_of_particles));
+        for(int index = 0; index < NUM_DIMENSIONS; index++)
+        {
+            CHECK_CUDA_ERROR(nppsDiv_32f_I(dev_direction_vec,
+                        &device_scratch_padA[index * num_of_particles], num_of_particles));
+        }
 
         custom_kernel_process_ray_collision(num_of_particles, dev_direction_vec,
                 device_scratch_padA, device_velocity, device_position,
-                array_size, &device_ray_direction[i*3]);
+                array_size, &device_boundary_direction[i*NUM_DIMENSIONS], NUM_DIMENSIONS);
 
     }
 
@@ -114,15 +97,15 @@ void universe::process_particles(void)
 
     // // Update the positions of the particles:
     // // self.positions[counti] = self.velocities[counti] * DELTA_TIME + self.positions[counti]
-    CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*3, &delta,
+    CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*NUM_DIMENSIONS, &delta,
                 device_velocity, 1, device_position, 1));
 
     // Update the velocities of the particles:
     // self.velocities[counti] = self.accelerations[counti] * DELTA_TIME + self.velocities[counti]
-    CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*3, &delta,
+    CHECK_CUDA_ERROR(cublasSaxpy(cublas_handle, array_size*NUM_DIMENSIONS, &delta,
                 device_acceleration, 1, device_velocity, 1));
 
-    process_collisions_with_rays();
+    process_collisions_with_boundary();
 
     process_collisions_with_other_particles();
 }
@@ -130,7 +113,7 @@ void universe::process_particles(void)
 void universe::get_positions(float ** in_out_buffer, int * size, int * stride)
 {
     CHECK_CUDA_ERROR(cudaMemcpy(host_position, device_position,
-                long(array_size * 3 * sizeof(float)), cudaMemcpyDeviceToHost));
+                long(array_size * NUM_DIMENSIONS * sizeof(float)), cudaMemcpyDeviceToHost));
 
     *in_out_buffer = host_position;
     *size = num_of_particles;
@@ -139,7 +122,7 @@ void universe::get_positions(float ** in_out_buffer, int * size, int * stride)
 
 universe::universe(int size)
 {
-curandCreateGenerator(&gen,
+    curandCreateGenerator(&gen,
             CURAND_RNG_PSEUDO_DEFAULT);
 
     CHECK_CUDA_ERROR(cublasCreate(&cublas_handle));
@@ -148,49 +131,64 @@ curandCreateGenerator(&gen,
     array_size = size;
 
     CHECK_CUDA_ERROR(cudaMallocManaged(&(device_position),
-                long(array_size*3*sizeof(float))));
+                long(array_size*NUM_DIMENSIONS*sizeof(float))));
 
     CHECK_CUDA_ERROR(cudaMallocManaged(&(device_velocity),
-                long(array_size*3*sizeof(float))));
+                long(array_size*NUM_DIMENSIONS*sizeof(float))));
 
     CHECK_CUDA_ERROR(cudaMallocManaged(&(device_acceleration),
-                long(array_size*3*sizeof(float))));
+                long(array_size*NUM_DIMENSIONS*sizeof(float))));
 
     CHECK_CUDA_ERROR(cudaMallocManaged(&(device_scratch_padA),
-                long(array_size*3*sizeof(float))));
+                long(array_size*NUM_DIMENSIONS*sizeof(float))));
 
-    host_position = (float *)malloc(long(array_size*3*sizeof(float)));
-    host_velocity = (float *)malloc(long(array_size*3*sizeof(float)));
-    host_acceleration = (float *)malloc(long(array_size*3*sizeof(float)));
+    host_position = (float *)malloc(long(array_size*NUM_DIMENSIONS*sizeof(float)));
+    host_velocity = (float *)malloc(long(array_size*NUM_DIMENSIONS*sizeof(float)));
+    host_acceleration = (float *)malloc(long(array_size*NUM_DIMENSIONS*sizeof(float)));
 
-    CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_position, array_size*2));
-    CHECK_CUDA_ERROR(nppsAddC_32f_I(20.0, device_position, array_size*3));
-    CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_velocity, array_size*2));
-    // CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_acceleration, num_of_particles*3));
+    CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_position, array_size*NUM_DIMENSIONS));
+    CHECK_CUDA_ERROR(nppsAddC_32f_I(20.0, device_position, array_size*NUM_DIMENSIONS));
+    CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_velocity, array_size*NUM_DIMENSIONS));
+    // CHECK_CUDA_ERROR(curandGenerateUniform(gen, device_acceleration, num_of_particles*NUM_DIMENSIONS));
 
     //Add rays that can collied with particles to the universe
-    CHECK_CUDA_ERROR(cudaMallocManaged(&(device_ray_position),
-                long(4*3*sizeof(float))));
+    CHECK_CUDA_ERROR(cudaMallocManaged(&(device_boundary_position),
+                long(4*NUM_DIMENSIONS*sizeof(float))));
 
-    CHECK_CUDA_ERROR(cudaMallocManaged(&(device_ray_direction),
-                long(4*3*sizeof(float))));
+    CHECK_CUDA_ERROR(cudaMallocManaged(&(device_boundary_direction),
+                long(4*NUM_DIMENSIONS*sizeof(float))));
 
-    float temp_positions[] = {0.0,0.0,0.0,0.0,0.0,0.0,800.0,800.0,800.0,800.0,800.0,800.0};
-    float temp_directions[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0};
+    float boundary_direction[4*NUM_DIMENSIONS] = {0};
+    float boundary_point[4*NUM_DIMENSIONS] = {0};
+    for(size_t i = 0; i < 4; i++)
+    {
+        boundary_direction[i*NUM_DIMENSIONS+(i%2)] = 1;
+        if(i >= 2)
+        {
+            for (size_t j = 0; j < NUM_DIMENSIONS; j++)
+            {
+                boundary_point[i*NUM_DIMENSIONS+j] = 800;
+            }
+        }
+    }
 
-    CHECK_CUDA_ERROR(cudaMemcpy(device_ray_position, temp_positions,
-                long(sizeof(temp_positions)), cudaMemcpyHostToDevice));
+    host_boundary_position = (float *)malloc(sizeof(boundary_point));
+    host_boundary_direction = (float *)malloc(sizeof(boundary_direction));
+    memcpy(host_boundary_position, boundary_point, sizeof(boundary_point));
+    memcpy(host_boundary_direction, boundary_direction, sizeof(boundary_direction));
 
-    CHECK_CUDA_ERROR(cudaMemcpy(device_ray_direction, temp_directions,
-                long(sizeof(temp_directions)), cudaMemcpyHostToDevice));
+
+    CHECK_CUDA_ERROR(cudaMemcpy(device_boundary_position, boundary_point,
+                long(sizeof(boundary_point)), cudaMemcpyHostToDevice));
+
+    CHECK_CUDA_ERROR(cudaMemcpy(device_boundary_direction, boundary_direction,
+                long(sizeof(boundary_direction)), cudaMemcpyHostToDevice));
 
 
 }
 
 universe::~universe()
 {
-
-
     CHECK_CUDA_ERROR(cudaFree(device_position));
     CHECK_CUDA_ERROR(cudaFree(device_velocity));
     CHECK_CUDA_ERROR(cudaFree(device_acceleration));
@@ -199,6 +197,8 @@ universe::~universe()
     free(host_position);
     free(host_velocity);
     free(host_acceleration);
+    free(host_boundary_direction);
+    free(host_boundary_position);
     CHECK_CUDA_ERROR(cublasDestroy(cublas_handle));
 
 
@@ -207,7 +207,6 @@ universe::~universe()
 
 void universe::process(void)
 {
-
     //First we need to process gravity
 
 
diff --git a/Cpp_Stuff/src/utility/custom_kernels.cu b/Cpp_Stuff/src/utility/custom_kernels.cu
index 7890c69..e574c5a 100644
--- a/Cpp_Stuff/src/utility/custom_kernels.cu
+++ b/Cpp_Stuff/src/utility/custom_kernels.cu
@@ -2,7 +2,7 @@
 #include "stdio.h"
 #define BLOCK_SIZE 32
 
-__global__ static void process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec)
+__global__ static void process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec, int num_dim)
 {
     int index = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
@@ -12,17 +12,17 @@ __global__ static void process_ray_collision(int size, float * norms, float * di
         {
             float speed_norm = 0.0;
             float dot_product = 0.0;
-            for(int k = 0; k<3; k++)
+            for(int k = 0; k<num_dim; k++)
             {
                 dot_product += vel[k*array_stride+i]*line_vec[k];
             }
-            for(int k = 0; k<3; k++)
+            for(int k = 0; k<num_dim; k++)
             {
                 float temp = (dot_product*(line_vec[k]));
-                speed_norm += pow(temp, 2); 
-            } 
+                speed_norm += pow(temp, 2);
+            }
             speed_norm = sqrt(speed_norm)*2.0;
-            for(int k = 0; k<3; k++)
+            for(int k = 0; k<num_dim; k++)
             {
                 vel[k*array_stride+i] = vel[k*array_stride+i] - speed_norm*direction[k*size+i];
                 pos[k*array_stride+i] = pos[k*array_stride+i] - direction[k*size+i]*(5.0-norms[i]);
@@ -31,9 +31,8 @@ __global__ static void process_ray_collision(int size, float * norms, float * di
     }
 }
 
-
-void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec)
+void custom_kernel_process_ray_collision(int size, float * norms, float * direction, float * vel, float * pos, int array_stride, float * line_vec, int num_dim)
 {
     int numBlocks = (size + BLOCK_SIZE - 1) / BLOCK_SIZE;
-    process_ray_collision<<<numBlocks,BLOCK_SIZE>>>(size, norms, direction, vel, pos, array_stride, line_vec);
+    process_ray_collision<<<numBlocks,BLOCK_SIZE>>>(size, norms, direction, vel, pos, array_stride, line_vec, num_dim);
 }