// Host wrapper for rayPrimitiveBVHKernel.
//
// Uploads the flattened BVH, per-primitive data and ray batch to the device,
// launches the intersection kernel (one thread per ray), then copies the hit
// results back to the caller's host arrays.
//
// h_bvh_nodes          node_count GPUBVHNode records (opaque void* at this API boundary)
// h_ray_origins /
// h_ray_directions     num_rays * 3 floats, xyz-interleaved; repacked to float3 on upload
// h_ray_max_distances  num_rays floats
// h_hit_distances /
// h_hit_primitive_ids /
// h_hit_counts         output arrays of num_rays entries; written only on success
// find_closest_hit     forwarded to the kernel (closest-hit vs. any-hit mode)
//
// On any CUDA error the function prints a diagnostic to stderr, releases every
// device buffer allocated so far, and returns without touching the outputs.
void launchRayPrimitiveIntersection(void *h_bvh_nodes, int node_count,
                                    unsigned int *h_primitive_indices, int primitive_count,
                                    int *h_primitive_types, float3 *h_primitive_vertices,
                                    unsigned int *h_vertex_offsets, int total_vertex_count,
                                    float *h_ray_origins, float *h_ray_directions,
                                    float *h_ray_max_distances, int num_rays,
                                    float *h_hit_distances, unsigned int *h_hit_primitive_ids,
                                    unsigned int *h_hit_counts, bool find_closest_hit) {
    size_t total_vertices = static_cast<size_t>(total_vertex_count);

    GPUBVHNode *d_bvh_nodes = nullptr;
    unsigned int *d_primitive_indices = nullptr;
    int *d_primitive_types = nullptr;
    float3 *d_primitive_vertices = nullptr;
    unsigned int *d_vertex_offsets = nullptr;
    float3 *d_ray_origins = nullptr, *d_ray_directions = nullptr;
    float *d_ray_max_distances = nullptr;
    float *d_hit_distances = nullptr;
    unsigned int *d_hit_primitive_ids = nullptr, *d_hit_counts = nullptr;

    size_t bvh_nodes_size = node_count * sizeof(GPUBVHNode);
    size_t primitive_indices_size = primitive_count * sizeof(unsigned int);
    size_t primitive_types_size = primitive_count * sizeof(int);
    size_t primitive_vertices_size = total_vertices * sizeof(float3);
    size_t vertex_offsets_size = primitive_count * sizeof(unsigned int);
    size_t ray_data_size = num_rays * sizeof(float3);
    size_t ray_distances_size = num_rays * sizeof(float);
    size_t hit_results_size = num_rays * sizeof(unsigned int);

    // Single cleanup path for every exit: cudaFree(nullptr) is a no-op, so
    // this is safe no matter how far allocation got.
    auto freeAll = [&]() {
        cudaFree(d_bvh_nodes);
        cudaFree(d_primitive_indices);
        cudaFree(d_primitive_types);
        cudaFree(d_primitive_vertices);
        cudaFree(d_vertex_offsets);
        cudaFree(d_ray_origins);
        cudaFree(d_ray_directions);
        cudaFree(d_ray_max_distances);
        cudaFree(d_hit_distances);
        cudaFree(d_hit_primitive_ids);
        cudaFree(d_hit_counts);
    };

    // Allocate all device buffers, bailing out on the first failure.
    struct AllocRequest {
        void **ptr;
        size_t bytes;
        const char *what; // label used in the error message
    };
    const AllocRequest allocs[] = {
        {(void **)&d_bvh_nodes, bvh_nodes_size, "BVH nodes"},
        {(void **)&d_primitive_indices, primitive_indices_size, "primitive indices"},
        {(void **)&d_primitive_types, primitive_types_size, "primitive types"},
        {(void **)&d_primitive_vertices, primitive_vertices_size, "primitive vertices"},
        {(void **)&d_vertex_offsets, vertex_offsets_size, "vertex offsets"},
        {(void **)&d_ray_origins, ray_data_size, "ray origins"},
        {(void **)&d_ray_directions, ray_data_size, "ray directions"},
        {(void **)&d_ray_max_distances, ray_distances_size, "ray distances"},
        {(void **)&d_hit_distances, ray_distances_size, "hit distances"},
        {(void **)&d_hit_primitive_ids, hit_results_size, "hit primitive IDs"},
        {(void **)&d_hit_counts, hit_results_size, "hit counts"},
    };
    for (const AllocRequest &a : allocs) {
        cudaError_t err = cudaMalloc(a.ptr, a.bytes);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA malloc error for %s: %s\n", a.what, cudaGetErrorString(err));
            freeAll();
            return;
        }
    }

    // Repack the xyz-interleaved host ray arrays into float3 staging buffers.
    std::vector<float3> ray_origins_vec(num_rays);
    std::vector<float3> ray_directions_vec(num_rays);
    for (int i = 0; i < num_rays; i++) {
        ray_origins_vec[i] = make_float3(h_ray_origins[i * 3], h_ray_origins[i * 3 + 1], h_ray_origins[i * 3 + 2]);
        ray_directions_vec[i] = make_float3(h_ray_directions[i * 3], h_ray_directions[i * 3 + 1], h_ray_directions[i * 3 + 2]);
    }

    // Upload scene and ray data.
    cudaMemcpy(d_bvh_nodes, h_bvh_nodes, bvh_nodes_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_indices, h_primitive_indices, primitive_indices_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_types, h_primitive_types, primitive_types_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_vertices, h_primitive_vertices, primitive_vertices_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vertex_offsets, h_vertex_offsets, vertex_offsets_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_origins, ray_origins_vec.data(), ray_data_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_directions, ray_directions_vec.data(), ray_data_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_max_distances, h_ray_max_distances, ray_distances_size, cudaMemcpyHostToDevice);

    // One thread per ray.
    int threads_per_block = 256;
    int num_blocks = (num_rays + threads_per_block - 1) / threads_per_block;

    rayPrimitiveBVHKernel<<<num_blocks, threads_per_block>>>(
        d_bvh_nodes, d_primitive_indices, d_primitive_types, d_primitive_vertices,
        d_vertex_offsets, d_ray_origins, d_ray_directions, d_ray_max_distances,
        num_rays, primitive_count, total_vertex_count,
        d_hit_distances, d_hit_primitive_ids, d_hit_counts, find_closest_hit);

    // Synchronize so both launch-config and in-kernel errors surface here.
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Ray-primitive intersection kernel error: %s\n", cudaGetErrorString(err));
        freeAll();
        return;
    }

    // Copy results back to the caller.
    cudaMemcpy(h_hit_distances, d_hit_distances, ray_distances_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hit_primitive_ids, d_hit_primitive_ids, hit_results_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hit_counts, d_hit_counts, hit_results_size, cudaMemcpyDeviceToHost);

    freeAll();
}
// BVH broad-phase traversal: one thread per query AABB.
//
// Each thread walks the BVH with a small explicit stack carved out of shared
// memory and records the ids of primitives whose AABB overlaps its query AABB
// into d_results[query_idx * max_results_per_query ...]; the number written is
// stored in d_result_counts[query_idx].
//
// Launch contract: 1-D grid; blockDim.x must be <= 256 — the shared stack
// provides 256 threads * 32 entries = 8192 slots. No __syncthreads() is needed
// because each thread only touches its own stack slice.
//
// NOTE(review): GPUBVHNode field names (aabb_min/aabb_max/first_primitive)
// are inferred from usage — confirm against the struct definition.
__global__ void bvhTraversalKernel(GPUBVHNode *d_nodes,
                                   unsigned int *d_primitive_indices,
                                   float3 *d_primitive_aabb_min,
                                   float3 *d_primitive_aabb_max,
                                   float3 *d_query_aabb_min,
                                   float3 *d_query_aabb_max,
                                   unsigned int *d_results,
                                   unsigned int *d_result_counts,
                                   int num_queries,
                                   int max_results_per_query) {
    int query_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (query_idx >= num_queries)
        return;

    float3 query_min = d_query_aabb_min[query_idx];
    float3 query_max = d_query_aabb_max[query_idx];

    unsigned int result_count = 0;
    unsigned int *query_results = &d_results[query_idx * max_results_per_query];

    // Per-thread traversal stack: 32 entries per thread in shared memory.
    __shared__ unsigned int node_stack[8192];

    int thread_stack_start = threadIdx.x * 32;
    unsigned int *thread_stack = &node_stack[thread_stack_start];

    int stack_size = 0;
    thread_stack[stack_size++] = 0; // start at the root node

    while (stack_size > 0 && result_count < (unsigned int)max_results_per_query) {
        // Pop the next node to visit.
        stack_size--;
        unsigned int node_idx = thread_stack[stack_size];

        if (node_idx == 0xFFFFFFFF)
            continue; // sentinel: no node

        GPUBVHNode node = d_nodes[node_idx];

        // Prune subtrees whose bounds do not overlap the query box.
        if (node.aabb_max.x < query_min.x || node.aabb_min.x > query_max.x ||
            node.aabb_max.y < query_min.y || node.aabb_min.y > query_max.y ||
            node.aabb_max.z < query_min.z || node.aabb_min.z > query_max.z)
            continue;

        if (node.primitive_count > 0) {
            // Leaf: test each referenced primitive's AABB against the query.
            for (unsigned int i = 0;
                 i < node.primitive_count && result_count < (unsigned int)max_results_per_query;
                 i++) {
                unsigned int primitive_index = node.first_primitive + i;
                unsigned int primitive_id = d_primitive_indices[primitive_index];

                float3 prim_min = d_primitive_aabb_min[primitive_index];
                float3 prim_max = d_primitive_aabb_max[primitive_index];

                if (prim_max.x >= query_min.x && prim_min.x <= query_max.x &&
                    prim_max.y >= query_min.y && prim_min.y <= query_max.y &&
                    prim_max.z >= query_min.z && prim_min.z <= query_max.z) {
                    query_results[result_count] = primitive_id;
                    result_count++;
                }
            }
        } else {
            // Interior node: push existing children, bounded by the 32-entry
            // per-thread stack (overflow silently drops subtrees).
            if (node.left_child != 0xFFFFFFFF && stack_size < 32)
                thread_stack[stack_size++] = node.left_child;

            if (node.right_child != 0xFFFFFFFF && stack_size < 32)
                thread_stack[stack_size++] = node.right_child;
        }
    }

    d_result_counts[query_idx] = result_count;
}
// Host wrapper for bvhTraversalKernel.
//
// BUG FIX: the previous version cast h_nodes / h_primitive_indices (host
// pointers) and passed them straight to the kernel — dereferencing them on the
// device is an illegal access. They are now copied into device buffers first.
// Also fixes a leak of d_results, which was never freed on either exit path.
//
// h_nodes                node_count GPUBVHNode records (opaque void*)
// h_primitive_aabb_* /
// h_query_aabb_*         count * 3 floats, xyz-interleaved; repacked to float3
// h_results              num_queries * max_results_per_query output slots
// h_result_counts        per-query number of results written
void launchBVHTraversal(void *h_nodes, int node_count,
                        unsigned int *h_primitive_indices, int primitive_count,
                        float *h_primitive_aabb_min, float *h_primitive_aabb_max,
                        float *h_query_aabb_min, float *h_query_aabb_max,
                        int num_queries, unsigned int *h_results,
                        unsigned int *h_result_counts, int max_results_per_query) {
    if (num_queries == 0)
        return;

    GPUBVHNode *d_nodes = nullptr;
    unsigned int *d_primitive_indices = nullptr;
    float3 *d_query_min = nullptr;
    float3 *d_query_max = nullptr;
    float3 *d_primitive_min = nullptr;
    float3 *d_primitive_max = nullptr;
    unsigned int *d_results = nullptr;
    unsigned int *d_result_counts = nullptr;

    size_t nodes_size = node_count * sizeof(GPUBVHNode);
    size_t indices_size = primitive_count * sizeof(unsigned int);
    size_t query_size = num_queries * sizeof(float3);
    size_t primitive_aabb_size = primitive_count * sizeof(float3);
    // Widen before multiplying: num_queries * max_results_per_query can
    // overflow int for large batches.
    size_t results_size = (size_t)num_queries * max_results_per_query * sizeof(unsigned int);
    size_t counts_size = num_queries * sizeof(unsigned int);

    // One cleanup path for all exits; cudaFree(nullptr) is a no-op.
    auto freeAll = [&]() {
        cudaFree(d_nodes);
        cudaFree(d_primitive_indices);
        cudaFree(d_query_min);
        cudaFree(d_query_max);
        cudaFree(d_primitive_min);
        cudaFree(d_primitive_max);
        cudaFree(d_results); // previously leaked
        cudaFree(d_result_counts);
    };

    bool ok = true;
    ok = ok && cudaMalloc((void **)&d_nodes, nodes_size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_primitive_indices, indices_size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_query_min, query_size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_query_max, query_size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_primitive_min, primitive_aabb_size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_primitive_max, primitive_aabb_size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_results, results_size) == cudaSuccess;
    ok = ok && cudaMalloc((void **)&d_result_counts, counts_size) == cudaSuccess;
    if (!ok) {
        fprintf(stderr, "CUDA malloc error in launchBVHTraversal: %s\n",
                cudaGetErrorString(cudaGetLastError()));
        freeAll();
        return;
    }

    // Repack xyz-interleaved query AABBs into float3 staging buffers.
    std::vector<float3> query_min_vec(num_queries);
    std::vector<float3> query_max_vec(num_queries);
    for (int i = 0; i < num_queries; i++) {
        query_min_vec[i] = make_float3(h_query_aabb_min[i * 3], h_query_aabb_min[i * 3 + 1], h_query_aabb_min[i * 3 + 2]);
        query_max_vec[i] = make_float3(h_query_aabb_max[i * 3], h_query_aabb_max[i * 3 + 1], h_query_aabb_max[i * 3 + 2]);
    }

    // Repack xyz-interleaved primitive AABBs likewise.
    std::vector<float3> primitive_min_vec(primitive_count);
    std::vector<float3> primitive_max_vec(primitive_count);
    for (int i = 0; i < primitive_count; i++) {
        primitive_min_vec[i] = make_float3(h_primitive_aabb_min[i * 3], h_primitive_aabb_min[i * 3 + 1], h_primitive_aabb_min[i * 3 + 2]);
        primitive_max_vec[i] = make_float3(h_primitive_aabb_max[i * 3], h_primitive_aabb_max[i * 3 + 1], h_primitive_aabb_max[i * 3 + 2]);
    }

    cudaMemcpy(d_nodes, h_nodes, nodes_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_indices, h_primitive_indices, indices_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_query_min, query_min_vec.data(), query_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_query_max, query_max_vec.data(), query_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_min, primitive_min_vec.data(), primitive_aabb_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_max, primitive_max_vec.data(), primitive_aabb_size, cudaMemcpyHostToDevice);

    // One thread per query; block size must stay <= 256 to match the kernel's
    // shared-memory stack layout.
    int block_size = 256;
    int num_blocks = (num_queries + block_size - 1) / block_size;

    bvhTraversalKernel<<<num_blocks, block_size>>>(
        d_nodes, d_primitive_indices, d_primitive_min, d_primitive_max,
        d_query_min, d_query_max, d_results, d_result_counts,
        num_queries, max_results_per_query);

    cudaDeviceSynchronize();

    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel launch error: %s\n", cudaGetErrorString(err));
        freeAll();
        return;
    }

    cudaMemcpy(h_results, d_results, results_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_result_counts, d_result_counts, counts_size, cudaMemcpyDeviceToHost);

    freeAll();
}
// Accumulates per-voxel ray statistics over a regular grid: one thread per ray.
//
// For each voxel the ray passes through, atomically accumulates the ray count
// and the in-voxel path length, and classifies the ray into transmitted /
// hit-before / hit-after / hit-inside counters.
//
// NOTE(review): when primitive_count > 0 the hit classification below is a
// deterministic heuristic (ray_idx % 4 and origin distance), not a real
// geometry test — it appears to be a placeholder; confirm before relying on
// the hit_* outputs.
__global__ void intersectRegularGridKernel(const size_t num_rays,
                                           float3 *d_ray_origins,
                                           float3 *d_ray_directions,
                                           float3 grid_center, float3 grid_size,
                                           int3 grid_divisions, int primitive_count,
                                           int *d_voxel_ray_counts,
                                           float *d_voxel_path_lengths,
                                           int *d_voxel_transmitted,
                                           int *d_voxel_hit_before,
                                           int *d_voxel_hit_after,
                                           int *d_voxel_hit_inside) {
    // Widen blockIdx before multiplying so huge ray batches don't overflow int.
    size_t ray_idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    if (ray_idx >= num_rays) {
        return;
    }

    float3 ray_origin = d_ray_origins[ray_idx];
    float3 ray_direction = d_ray_directions[ray_idx];

    // Voxel extent and overall grid bounds derived from center + size.
    float3 voxel_size = make_float3(grid_size.x / static_cast<float>(grid_divisions.x),
                                    grid_size.y / static_cast<float>(grid_divisions.y),
                                    grid_size.z / static_cast<float>(grid_divisions.z));

    float3 grid_min = make_float3(grid_center.x - 0.5f * grid_size.x, grid_center.y - 0.5f * grid_size.y, grid_center.z - 0.5f * grid_size.z);
    float3 grid_max = make_float3(grid_center.x + 0.5f * grid_size.x, grid_center.y + 0.5f * grid_size.y, grid_center.z + 0.5f * grid_size.z);

    // Early-out slab test against the whole grid AABB.
    float t_grid_min = -1e30f, t_grid_max = 1e30f;

    for (int axis = 0; axis < 3; ++axis) {
        float origin_comp = (axis == 0) ? ray_origin.x : (axis == 1) ? ray_origin.y : ray_origin.z;
        float dir_comp = (axis == 0) ? ray_direction.x : (axis == 1) ? ray_direction.y : ray_direction.z;
        float min_comp = (axis == 0) ? grid_min.x : (axis == 1) ? grid_min.y : grid_min.z;
        float max_comp = (axis == 0) ? grid_max.x : (axis == 1) ? grid_max.y : grid_max.z;

        if (fabsf(dir_comp) < 1e-9f) {
            // Ray parallel to this slab: it must start inside it to touch the grid.
            if (origin_comp < min_comp || origin_comp > max_comp) {
                return;
            }
        } else {
            float t1 = (min_comp - origin_comp) / dir_comp;
            float t2 = (max_comp - origin_comp) / dir_comp;
            if (t1 > t2) { // order so t1 is the near plane
                float tmp = t1;
                t1 = t2;
                t2 = tmp;
            }
            t_grid_min = fmaxf(t_grid_min, t1);
            t_grid_max = fminf(t_grid_max, t2);
            if (t_grid_min > t_grid_max) {
                return; // misses the grid entirely
            }
        }
    }
    if (t_grid_max <= 1e-6f) {
        return; // grid lies entirely behind the ray origin
    }

    // Brute-force every voxel (no DDA): test the ray against each voxel AABB.
    for (int i = 0; i < grid_divisions.x; i++) {
        for (int j = 0; j < grid_divisions.y; j++) {
            for (int k = 0; k < grid_divisions.z; k++) {
                float3 voxel_min = make_float3(grid_min.x + i * voxel_size.x, grid_min.y + j * voxel_size.y, grid_min.z + k * voxel_size.z);
                float3 voxel_max = make_float3(voxel_min.x + voxel_size.x, voxel_min.y + voxel_size.y, voxel_min.z + voxel_size.z);

                float t_min_x, t_max_x, t_min_y, t_max_y, t_min_z, t_max_z;

                // X slab.
                if (fabsf(ray_direction.x) < 1e-9f) {
                    if (ray_origin.x < voxel_min.x || ray_origin.x > voxel_max.x) {
                        continue; // parallel and outside: skip this voxel
                    }
                    t_min_x = -1e30f;
                    t_max_x = 1e30f;
                } else {
                    float inv_dir_x = 1.0f / ray_direction.x;
                    if (inv_dir_x >= 0) {
                        t_min_x = (voxel_min.x - ray_origin.x) * inv_dir_x;
                        t_max_x = (voxel_max.x - ray_origin.x) * inv_dir_x;
                    } else {
                        t_min_x = (voxel_max.x - ray_origin.x) * inv_dir_x;
                        t_max_x = (voxel_min.x - ray_origin.x) * inv_dir_x;
                    }
                }

                // Y slab.
                if (fabsf(ray_direction.y) < 1e-9f) {
                    if (ray_origin.y < voxel_min.y || ray_origin.y > voxel_max.y) {
                        continue;
                    }
                    t_min_y = -1e30f;
                    t_max_y = 1e30f;
                } else {
                    float inv_dir_y = 1.0f / ray_direction.y;
                    if (inv_dir_y >= 0) {
                        t_min_y = (voxel_min.y - ray_origin.y) * inv_dir_y;
                        t_max_y = (voxel_max.y - ray_origin.y) * inv_dir_y;
                    } else {
                        t_min_y = (voxel_max.y - ray_origin.y) * inv_dir_y;
                        t_max_y = (voxel_min.y - ray_origin.y) * inv_dir_y;
                    }
                }

                // Z slab.
                if (fabsf(ray_direction.z) < 1e-9f) {
                    if (ray_origin.z < voxel_min.z || ray_origin.z > voxel_max.z) {
                        continue;
                    }
                    t_min_z = -1e30f;
                    t_max_z = 1e30f;
                } else {
                    float inv_dir_z = 1.0f / ray_direction.z;
                    if (inv_dir_z >= 0) {
                        t_min_z = (voxel_min.z - ray_origin.z) * inv_dir_z;
                        t_max_z = (voxel_max.z - ray_origin.z) * inv_dir_z;
                    } else {
                        t_min_z = (voxel_max.z - ray_origin.z) * inv_dir_z;
                        t_max_z = (voxel_min.z - ray_origin.z) * inv_dir_z;
                    }
                }

                float t_enter = fmaxf(fmaxf(t_min_x, t_min_y), t_min_z);
                float t_exit = fminf(fminf(t_max_x, t_max_y), t_max_z);

                // Require a real interval, in front of the ray, with measurable length.
                if (t_enter < t_exit && t_exit > 1e-5f && (t_exit - t_enter) > 1e-4f) {
                    float path_length = t_exit - t_enter;

                    // Ray starts inside this voxel: only the segment from the
                    // origin to the exit plane counts.
                    if (t_enter < 0.0f) {
                        path_length = t_exit;
                    }

                    if (path_length < 1e-4f) {
                        continue; // numerically negligible
                    }

                    // NOTE(review): discard corner clips shorter than 10% of the
                    // voxel diagonal — reconstructed from the surrounding guard;
                    // confirm the threshold semantics.
                    float voxel_diag = sqrtf(voxel_size.x * voxel_size.x + voxel_size.y * voxel_size.y + voxel_size.z * voxel_size.z);
                    if (path_length < voxel_diag * 0.1f) {
                        continue;
                    }

                    int voxel_idx = i * grid_divisions.y * grid_divisions.z + j * grid_divisions.z + k;

                    atomicAdd(&d_voxel_ray_counts[voxel_idx], 1);
                    atomicAdd(&d_voxel_path_lengths[voxel_idx], path_length);

                    if (primitive_count == 0) {
                        // No geometry at all: every ray is transmitted.
                        atomicAdd(&d_voxel_transmitted[voxel_idx], 1);
                    } else {
                        // Placeholder hit heuristic (see header note).
                        float ray_distance = sqrtf(ray_origin.x * ray_origin.x + ray_origin.y * ray_origin.y + ray_origin.z * ray_origin.z);
                        bool hit_geometry = (ray_idx % 4 == 0) && (ray_distance < 10.0f);

                        if (hit_geometry) {
                            if (t_enter < 0.5f) {
                                atomicAdd(&d_voxel_hit_inside[voxel_idx], 1);
                                atomicAdd(&d_voxel_hit_after[voxel_idx], 1);
                            } else if (t_enter < 2.0f) {
                                atomicAdd(&d_voxel_hit_after[voxel_idx], 1);
                            } else {
                                atomicAdd(&d_voxel_hit_before[voxel_idx], 1);
                            }
                        } else {
                            atomicAdd(&d_voxel_transmitted[voxel_idx], 1);
                        }
                    }
                }
            }
        }
    }
}
// Host wrapper for intersectRegularGridKernel.
//
// Uploads the ray batch, zeroes the per-voxel accumulators, runs the grid
// intersection kernel, and copies the six voxel statistic arrays back.
//
// h_ray_origins / h_ray_directions  num_rays * 3 floats, xyz-interleaved; the
//                                   layout matches float3 so they are uploaded
//                                   without repacking.
// h_voxel_*                         outputs, each with
//                                   grid_divisions_x*y*z entries.
//
// Returns true on success; false if no CUDA device is available or any CUDA
// call fails (all device memory is released either way).
bool launchVoxelRayPathLengths(int num_rays, float *h_ray_origins, float *h_ray_directions,
                               float grid_center_x, float grid_center_y, float grid_center_z,
                               float grid_size_x, float grid_size_y, float grid_size_z,
                               int grid_divisions_x, int grid_divisions_y, int grid_divisions_z,
                               int primitive_count, int *h_voxel_ray_counts,
                               float *h_voxel_path_lengths, int *h_voxel_transmitted,
                               int *h_voxel_hit_before, int *h_voxel_hit_after,
                               int *h_voxel_hit_inside) {
    // Bail out cleanly when no usable CUDA device exists.
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if (err != cudaSuccess || deviceCount == 0) {
        return false;
    }

    float3 *d_ray_origins = nullptr, *d_ray_directions = nullptr;
    int *d_voxel_ray_counts = nullptr, *d_voxel_transmitted = nullptr;
    int *d_voxel_hit_before = nullptr, *d_voxel_hit_after = nullptr, *d_voxel_hit_inside = nullptr;
    float *d_voxel_path_lengths = nullptr;

    size_t ray_data_size = (size_t)num_rays * 3 * sizeof(float);
    // Widen before multiplying: the voxel count can overflow int for fine grids.
    size_t voxel_count = (size_t)grid_divisions_x * grid_divisions_y * grid_divisions_z;
    size_t voxel_int_size = voxel_count * sizeof(int);
    size_t voxel_float_size = voxel_count * sizeof(float);

    // One cleanup path for all exits; cudaFree(nullptr) is a no-op.
    auto freeAll = [&]() {
        cudaFree(d_ray_origins);
        cudaFree(d_ray_directions);
        cudaFree(d_voxel_ray_counts);
        cudaFree(d_voxel_transmitted);
        cudaFree(d_voxel_hit_before);
        cudaFree(d_voxel_hit_after);
        cudaFree(d_voxel_hit_inside);
        cudaFree(d_voxel_path_lengths);
    };

    // Allocate every device buffer, bailing out on the first failure.
    struct AllocRequest {
        void **ptr;
        size_t bytes;
    };
    const AllocRequest allocs[] = {
        {(void **)&d_ray_origins, ray_data_size},
        {(void **)&d_ray_directions, ray_data_size},
        {(void **)&d_voxel_ray_counts, voxel_int_size},
        {(void **)&d_voxel_transmitted, voxel_int_size},
        {(void **)&d_voxel_hit_before, voxel_int_size},
        {(void **)&d_voxel_hit_after, voxel_int_size},
        {(void **)&d_voxel_hit_inside, voxel_int_size},
        {(void **)&d_voxel_path_lengths, voxel_float_size},
    };
    for (const AllocRequest &a : allocs) {
        err = cudaMalloc(a.ptr, a.bytes);
        if (err != cudaSuccess) {
            freeAll();
            return false;
        }
    }

    // Upload rays (flat float layout is float3-compatible) and zero all
    // accumulators before the atomics run.
    cudaMemcpy(d_ray_origins, h_ray_origins, ray_data_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_directions, h_ray_directions, ray_data_size, cudaMemcpyHostToDevice);
    cudaMemset(d_voxel_ray_counts, 0, voxel_int_size);
    cudaMemset(d_voxel_transmitted, 0, voxel_int_size);
    cudaMemset(d_voxel_hit_before, 0, voxel_int_size);
    cudaMemset(d_voxel_hit_after, 0, voxel_int_size);
    cudaMemset(d_voxel_hit_inside, 0, voxel_int_size);
    cudaMemset(d_voxel_path_lengths, 0, voxel_float_size);

    // One thread per ray.
    dim3 block_size(256);
    dim3 grid_size((num_rays + block_size.x - 1) / block_size.x);

    float3 grid_center = make_float3(grid_center_x, grid_center_y, grid_center_z);
    float3 grid_size_vec = make_float3(grid_size_x, grid_size_y, grid_size_z);
    int3 grid_divisions_vec = make_int3(grid_divisions_x, grid_divisions_y, grid_divisions_z);

    intersectRegularGridKernel<<<grid_size, block_size>>>(
        num_rays, d_ray_origins, d_ray_directions, grid_center, grid_size_vec,
        grid_divisions_vec, primitive_count, d_voxel_ray_counts, d_voxel_path_lengths,
        d_voxel_transmitted, d_voxel_hit_before, d_voxel_hit_after, d_voxel_hit_inside);

    // Synchronize so both launch-config and in-kernel errors surface here.
    cudaDeviceSynchronize();

    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Voxel ray path length kernel error: %s\n", cudaGetErrorString(err));
        freeAll();
        return false;
    }

    // Copy the accumulated voxel statistics back to the caller.
    cudaMemcpy(h_voxel_ray_counts, d_voxel_ray_counts, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_path_lengths, d_voxel_path_lengths, voxel_float_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_transmitted, d_voxel_transmitted, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_hit_before, d_voxel_hit_before, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_hit_after, d_voxel_hit_after, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_hit_inside, d_voxel_hit_inside, voxel_int_size, cudaMemcpyDeviceToHost);

    freeAll();
    return true;
}