// Host-side driver for ray/primitive intersection queries against a GPU BVH.
//
// Uploads the BVH nodes, primitive tables, and rays to the device, launches
// rayPrimitiveBVHKernel with one thread per ray, then copies the per-ray hit
// results back into the caller's arrays.
//
// h_bvh_nodes                      : packed GPUBVHNode array (node_count entries).
// h_ray_origins / h_ray_directions : flat xyz triples, 3 * num_rays floats each.
// h_hit_distances / h_hit_primitive_ids / h_hit_counts : outputs, num_rays each.
// find_closest_hit                 : forwarded to the kernel (closest-hit vs. any-hit).
//
// On any CUDA allocation or kernel error, a message is written to stderr, all
// device buffers are released, and the function returns without touching the
// output arrays.
void launchRayPrimitiveIntersection(void *h_bvh_nodes, int node_count, unsigned int *h_primitive_indices, int primitive_count, int *h_primitive_types, float3 *h_primitive_vertices, unsigned int *h_vertex_offsets, int total_vertex_count,
                                    float *h_ray_origins, float *h_ray_directions, float *h_ray_max_distances, int num_rays, float *h_hit_distances, unsigned int *h_hit_primitive_ids, unsigned int *h_hit_counts, bool find_closest_hit) {
    // Empty batch: nothing to do, and a zero-block launch would be invalid.
    if (num_rays <= 0)
        return;

    size_t total_vertices = static_cast<size_t>(total_vertex_count);

    // Device buffers. All start null so the shared cleanup below may free
    // unconditionally — cudaFree(nullptr) is a documented no-op.
    GPUBVHNode *d_bvh_nodes = nullptr;
    unsigned int *d_primitive_indices = nullptr;
    int *d_primitive_types = nullptr;
    float3 *d_primitive_vertices = nullptr;
    unsigned int *d_vertex_offsets = nullptr;
    float3 *d_ray_origins = nullptr, *d_ray_directions = nullptr;
    float *d_ray_max_distances = nullptr;
    float *d_hit_distances = nullptr;
    unsigned int *d_hit_primitive_ids = nullptr, *d_hit_counts = nullptr;

    // Transfer/allocation sizes in bytes.
    size_t bvh_nodes_size = node_count * sizeof(GPUBVHNode);
    size_t primitive_indices_size = primitive_count * sizeof(unsigned int);
    size_t primitive_types_size = primitive_count * sizeof(int);
    size_t primitive_vertices_size = total_vertices * sizeof(float3);
    size_t vertex_offsets_size = primitive_count * sizeof(unsigned int);
    size_t ray_data_size = num_rays * sizeof(float3);
    size_t ray_distances_size = num_rays * sizeof(float);
    size_t hit_results_size = num_rays * sizeof(unsigned int);

    // Single cleanup path. Replaces the per-branch incremental free chains,
    // which had to be kept in sync by hand at every error site.
    auto freeAll = [&]() {
        cudaFree(d_bvh_nodes);
        cudaFree(d_primitive_indices);
        cudaFree(d_primitive_types);
        cudaFree(d_primitive_vertices);
        cudaFree(d_vertex_offsets);
        cudaFree(d_ray_origins);
        cudaFree(d_ray_directions);
        cudaFree(d_ray_max_distances);
        cudaFree(d_hit_distances);
        cudaFree(d_hit_primitive_ids);
        cudaFree(d_hit_counts);
    };

    // Allocate one device buffer; on failure report it, release everything
    // allocated so far, and tell the caller path to abort.
    auto tryAlloc = [&](void **ptr, size_t bytes, const char *what) -> bool {
        cudaError_t err = cudaMalloc(ptr, bytes);
        if (err != cudaSuccess) {
            fprintf(stderr, "CUDA malloc error for %s: %s\n", what, cudaGetErrorString(err));
            freeAll();
            return false;
        }
        return true;
    };

    if (!tryAlloc(reinterpret_cast<void **>(&d_bvh_nodes), bvh_nodes_size, "BVH nodes"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_primitive_indices), primitive_indices_size, "primitive indices"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_primitive_types), primitive_types_size, "primitive types"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_primitive_vertices), primitive_vertices_size, "primitive vertices"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_vertex_offsets), vertex_offsets_size, "vertex offsets"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_ray_origins), ray_data_size, "ray origins"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_ray_directions), ray_data_size, "ray directions"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_ray_max_distances), ray_distances_size, "ray distances"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_hit_distances), ray_distances_size, "hit distances"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_hit_primitive_ids), hit_results_size, "hit primitive IDs"))
        return;
    if (!tryAlloc(reinterpret_cast<void **>(&d_hit_counts), hit_results_size, "hit counts"))
        return;

    // Repack the caller's flat xyz arrays into float3 for the device.
    std::vector<float3> ray_origins_vec(num_rays);
    std::vector<float3> ray_directions_vec(num_rays);
    for (int i = 0; i < num_rays; i++) {
        ray_origins_vec[i] = make_float3(h_ray_origins[i * 3], h_ray_origins[i * 3 + 1], h_ray_origins[i * 3 + 2]);
        ray_directions_vec[i] = make_float3(h_ray_directions[i * 3], h_ray_directions[i * 3 + 1], h_ray_directions[i * 3 + 2]);
    }

    // Upload all kernel inputs.
    cudaMemcpy(d_bvh_nodes, h_bvh_nodes, bvh_nodes_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_indices, h_primitive_indices, primitive_indices_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_types, h_primitive_types, primitive_types_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_vertices, h_primitive_vertices, primitive_vertices_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_vertex_offsets, h_vertex_offsets, vertex_offsets_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_origins, ray_origins_vec.data(), ray_data_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_directions, ray_directions_vec.data(), ray_data_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_max_distances, h_ray_max_distances, ray_distances_size, cudaMemcpyHostToDevice);

    // One thread per ray.
    int threads_per_block = 256;
    int num_blocks = (num_rays + threads_per_block - 1) / threads_per_block;
    rayPrimitiveBVHKernel<<<num_blocks, threads_per_block>>>(d_bvh_nodes, d_primitive_indices, d_primitive_types, d_primitive_vertices, d_vertex_offsets, d_ray_origins, d_ray_directions, d_ray_max_distances, num_rays, primitive_count,
                                                             total_vertex_count, d_hit_distances, d_hit_primitive_ids, d_hit_counts, find_closest_hit);

    // Synchronize first so cudaGetLastError() surfaces both launch-config
    // and asynchronous execution errors.
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "Ray-primitive intersection kernel error: %s\n", cudaGetErrorString(err));
        freeAll();
        return;
    }

    // Copy per-ray results back to the caller.
    cudaMemcpy(h_hit_distances, d_hit_distances, ray_distances_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hit_primitive_ids, d_hit_primitive_ids, hit_results_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hit_counts, d_hit_counts, hit_results_size, cudaMemcpyDeviceToHost);

    freeAll();
}
// GPU kernel: one thread per query AABB. Traverses the BVH in d_nodes with an
// explicit per-thread stack and writes candidate primitive ids into this
// query's private slice of d_results (max_results_per_query entries starting
// at query_idx * max_results_per_query); the number of ids written is stored
// in d_result_counts[query_idx].
//
// NOTE(review): the embedded original line numbers jump (e.g. 786 -> 798) and
// `stack_size`, `node`, and `primitive_index` are used without visible
// declarations or updates — lines appear to have been lost from this chunk.
// Recover the full body from version control before modifying the logic.
753__global__
void bvhTraversalKernel(
GPUBVHNode *d_nodes,
unsigned int *d_primitive_indices, float3 *d_primitive_aabb_min, float3 *d_primitive_aabb_max, float3 *d_query_aabb_min, float3 *d_query_aabb_max,
unsigned int *d_results,
754 unsigned int *d_result_counts,
int num_queries,
int max_results_per_query) {
// Flat 1D global thread id == query index; excess tail threads must exit
// (guard body elided in this chunk).
756 int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
758 if (query_idx >= num_queries)
// Load this thread's query box and locate its private output slice.
761 float3 query_min = d_query_aabb_min[query_idx];
762 float3 query_max = d_query_aabb_max[query_idx];
764 unsigned int result_count = 0;
765 unsigned int *query_results = &d_results[query_idx * max_results_per_query];
// Shared-memory traversal stack: 8192 entries carved into 32-entry
// per-thread slices (threadIdx.x * 32), i.e. sized for up to 256 threads per
// block. Each thread only touches its own slice, which is why no
// __syncthreads() appears around these accesses.
768 __shared__
unsigned int node_stack[8192];
772 int thread_stack_start = threadIdx.x * 32;
773 unsigned int *thread_stack = &node_stack[thread_stack_start];
// Iterative traversal; stops early once the output slice is full.
// (stack_size initialization / root push are in the elided lines.)
779 while (stack_size > 0 && result_count < max_results_per_query) {
783 unsigned int node_idx = thread_stack[stack_size];
// 0xFFFFFFFF is the "no node" sentinel (also used for absent children below).
786 if (node_idx == 0xFFFFFFFF)
// Leaf handling: walk the node's stored primitives. The primitive AABB is
// loaded for a finer overlap test against the query box — the comparison
// itself is in the elided lines; only the accepted write is visible.
798 for (
unsigned int i = 0; i < node.
primitive_count && result_count < max_results_per_query; i++) {
800 unsigned int primitive_id = d_primitive_indices[primitive_index];
803 float3 prim_min = d_primitive_aabb_min[primitive_index];
804 float3 prim_max = d_primitive_aabb_max[primitive_index];
808 query_results[result_count] = primitive_id;
// Push existing children, respecting the 32-entry per-thread stack cap;
// subtrees beyond the cap are silently skipped when the stack is full.
814 if (node.
left_child != 0xFFFFFFFF && stack_size < 32) {
818 if (node.
right_child != 0xFFFFFFFF && stack_size < 32) {
// Publish how many ids were written for this query.
825 d_result_counts[query_idx] = result_count;
// Host-side driver for batched AABB-vs-BVH overlap queries.
//
// Uploads the BVH nodes, the primitive index table, the primitive AABBs, and
// the query AABBs to the device, launches bvhTraversalKernel with one thread
// per query, and copies back up to max_results_per_query primitive ids per
// query plus a per-query result count.
//
// h_nodes                        : packed GPUBVHNode array (node_count entries).
// h_*_aabb_min / h_*_aabb_max    : flat xyz triples (3 floats per box).
// h_results                      : num_queries * max_results_per_query output slots.
// h_result_counts                : num_queries output counts.
void launchBVHTraversal(void *h_nodes, int node_count, unsigned int *h_primitive_indices, int primitive_count, float *h_primitive_aabb_min, float *h_primitive_aabb_max, float *h_query_aabb_min, float *h_query_aabb_max, int num_queries,
                        unsigned int *h_results, unsigned int *h_result_counts, int max_results_per_query) {
    if (num_queries == 0)
        return;

    // Device buffers; all start null so cleanup can free unconditionally
    // (cudaFree(nullptr) is a no-op).
    GPUBVHNode *d_nodes = nullptr;
    unsigned int *d_primitive_indices = nullptr;
    float3 *d_query_min = nullptr;
    float3 *d_query_max = nullptr;
    float3 *d_primitive_min = nullptr;
    float3 *d_primitive_max = nullptr;
    unsigned int *d_results = nullptr;
    unsigned int *d_result_counts = nullptr;

    size_t nodes_size = node_count * sizeof(GPUBVHNode);
    size_t primitive_indices_size = primitive_count * sizeof(unsigned int);
    size_t query_size = num_queries * sizeof(float3);
    size_t primitive_aabb_size = primitive_count * sizeof(float3);
    // Widen before multiplying so large batches cannot overflow int.
    size_t results_size = static_cast<size_t>(num_queries) * max_results_per_query * sizeof(unsigned int);
    size_t counts_size = num_queries * sizeof(unsigned int);

    // Single cleanup path used by both the error branch and the normal exit.
    auto freeAll = [&]() {
        cudaFree(d_nodes);
        cudaFree(d_primitive_indices);
        cudaFree(d_query_min);
        cudaFree(d_query_max);
        cudaFree(d_primitive_min);
        cudaFree(d_primitive_max);
        cudaFree(d_results);
        cudaFree(d_result_counts);
    };

    cudaMalloc((void **) &d_nodes, nodes_size);
    cudaMalloc((void **) &d_primitive_indices, primitive_indices_size);
    cudaMalloc((void **) &d_query_min, query_size);
    cudaMalloc((void **) &d_query_max, query_size);
    cudaMalloc((void **) &d_primitive_min, primitive_aabb_size);
    cudaMalloc((void **) &d_primitive_max, primitive_aabb_size);
    cudaMalloc((void **) &d_results, results_size);
    cudaMalloc((void **) &d_result_counts, counts_size);

    // Repack the caller's flat xyz triples into float3.
    std::vector<float3> query_min_vec(num_queries);
    std::vector<float3> query_max_vec(num_queries);
    for (int i = 0; i < num_queries; i++) {
        query_min_vec[i] = make_float3(h_query_aabb_min[i * 3], h_query_aabb_min[i * 3 + 1], h_query_aabb_min[i * 3 + 2]);
        query_max_vec[i] = make_float3(h_query_aabb_max[i * 3], h_query_aabb_max[i * 3 + 1], h_query_aabb_max[i * 3 + 2]);
    }

    std::vector<float3> primitive_min_vec(primitive_count);
    std::vector<float3> primitive_max_vec(primitive_count);
    for (int i = 0; i < primitive_count; i++) {
        primitive_min_vec[i] = make_float3(h_primitive_aabb_min[i * 3], h_primitive_aabb_min[i * 3 + 1], h_primitive_aabb_min[i * 3 + 2]);
        primitive_max_vec[i] = make_float3(h_primitive_aabb_max[i * 3], h_primitive_aabb_max[i * 3 + 1], h_primitive_aabb_max[i * 3 + 2]);
    }

    // Upload everything the kernel reads. BUGFIX: the BVH nodes and primitive
    // index table must be device copies — the kernel was previously launched
    // with the host pointers h_nodes / h_primitive_indices, which makes the
    // GPU dereference host memory (illegal-address fault / garbage results).
    cudaMemcpy(d_nodes, h_nodes, nodes_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_indices, h_primitive_indices, primitive_indices_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_query_min, query_min_vec.data(), query_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_query_max, query_max_vec.data(), query_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_min, primitive_min_vec.data(), primitive_aabb_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_primitive_max, primitive_max_vec.data(), primitive_aabb_size, cudaMemcpyHostToDevice);

    // One thread per query.
    int block_size = 256;
    int num_blocks = (num_queries + block_size - 1) / block_size;
    bvhTraversalKernel<<<num_blocks, block_size>>>(d_nodes, d_primitive_indices, d_primitive_min, d_primitive_max, d_query_min, d_query_max, d_results, d_result_counts, num_queries, max_results_per_query);
    cudaDeviceSynchronize();

    // Surface launch and execution errors; skip the copy-back on failure.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA kernel launch error: %s\n", cudaGetErrorString(err));
        freeAll();
        return;
    }

    cudaMemcpy(h_results, d_results, results_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_result_counts, d_result_counts, counts_size, cudaMemcpyDeviceToHost);

    freeAll();
}
// GPU kernel: one thread per ray. Clips the ray against an axis-aligned
// regular voxel grid (grid_center/grid_size divided into grid_divisions
// cells), then brute-force slab-tests the ray against every voxel and
// atomically accumulates per-voxel statistics: ray count, summed in-voxel
// path length, and transmitted / hit-before / hit-after / hit-inside tallies.
//
// NOTE(review): the embedded original line numbers jump (e.g. 976 -> 984,
// 1013 -> 1019) and several if-bodies, else-branches, and closing braces are
// not visible — lines appear to have been lost from this chunk. Recover the
// full body from version control before modifying the logic.
941__global__
void intersectRegularGridKernel(
const size_t num_rays, float3 *d_ray_origins, float3 *d_ray_directions, float3 grid_center, float3 grid_size, int3 grid_divisions,
int primitive_count,
int *d_voxel_ray_counts,
float *d_voxel_path_lengths,
942 int *d_voxel_transmitted,
int *d_voxel_hit_before,
int *d_voxel_hit_after,
int *d_voxel_hit_inside) {
// Flat 1D global thread id == ray index; tail threads exit (the index math
// is 32-bit before widening to size_t — fine below ~2^31 threads).
944 size_t ray_idx = blockIdx.x * blockDim.x + threadIdx.x;
946 if (ray_idx >= num_rays) {
950 float3 ray_origin = d_ray_origins[ray_idx];
951 float3 ray_direction = d_ray_directions[ray_idx];
// Per-voxel extent and the grid's world-space bounds.
954 float3 voxel_size = make_float3(grid_size.x /
static_cast<float>(grid_divisions.x), grid_size.y /
static_cast<float>(grid_divisions.y), grid_size.z /
static_cast<float>(grid_divisions.z));
957 float3 grid_min = make_float3(grid_center.x - 0.5f * grid_size.x, grid_center.y - 0.5f * grid_size.y, grid_center.z - 0.5f * grid_size.z);
958 float3 grid_max = make_float3(grid_center.x + 0.5f * grid_size.x, grid_center.y + 0.5f * grid_size.y, grid_center.z + 0.5f * grid_size.z);
// Slab test of the ray against the whole grid; rays that miss the grid or
// whose overlap lies behind the origin are culled (several cull branches
// are in the elided lines).
961 float t_grid_min = -1e30f, t_grid_max = 1e30f;
964 for (
int axis = 0; axis < 3; ++axis) {
965 float origin_comp = (axis == 0) ? ray_origin.x : (axis == 1) ? ray_origin.y : ray_origin.z;
966 float dir_comp = (axis == 0) ? ray_direction.x : (axis == 1) ? ray_direction.y : ray_direction.z;
967 float min_comp = (axis == 0) ? grid_min.x : (axis == 1) ? grid_min.y : grid_min.z;
968 float max_comp = (axis == 0) ? grid_max.x : (axis == 1) ? grid_max.y : grid_max.z;
// Near-parallel axis: the ray can only overlap the slab if the origin is
// already inside it.
970 if (fabsf(dir_comp) < 1e-9f) {
971 if (origin_comp < min_comp || origin_comp > max_comp) {
975 float t1 = (min_comp - origin_comp) / dir_comp;
976 float t2 = (max_comp - origin_comp) / dir_comp;
984 t_grid_min = fmaxf(t_grid_min, t1);
985 t_grid_max = fminf(t_grid_max, t2);
987 if (t_grid_min > t_grid_max) {
993 if (t_grid_max <= 1e-6f) {
// Brute-force scan over all voxels: O(x*y*z) slab tests per ray.
999 for (
int i = 0; i < grid_divisions.x; i++) {
1000 for (
int j = 0; j < grid_divisions.y; j++) {
1001 for (
int k = 0; k < grid_divisions.z; k++) {
1004 float3 voxel_min = make_float3(grid_min.x + i * voxel_size.x, grid_min.y + j * voxel_size.y, grid_min.z + k * voxel_size.z);
1006 float3 voxel_max = make_float3(voxel_min.x + voxel_size.x, voxel_min.y + voxel_size.y, voxel_min.z + voxel_size.z);
// Per-axis slab intersection against this voxel; near-zero direction
// components fall back to a containment check (skip bodies elided).
1009 float t_min_x, t_max_x, t_min_y, t_max_y, t_min_z, t_max_z;
1012 if (fabsf(ray_direction.x) < 1e-9f) {
1013 if (ray_origin.x < voxel_min.x || ray_origin.x > voxel_max.x) {
1019 float inv_dir_x = 1.0f / ray_direction.x;
1020 if (inv_dir_x >= 0) {
1021 t_min_x = (voxel_min.x - ray_origin.x) * inv_dir_x;
1022 t_max_x = (voxel_max.x - ray_origin.x) * inv_dir_x;
1024 t_min_x = (voxel_max.x - ray_origin.x) * inv_dir_x;
1025 t_max_x = (voxel_min.x - ray_origin.x) * inv_dir_x;
1030 if (fabsf(ray_direction.y) < 1e-9f) {
1031 if (ray_origin.y < voxel_min.y || ray_origin.y > voxel_max.y) {
1037 float inv_dir_y = 1.0f / ray_direction.y;
1038 if (inv_dir_y >= 0) {
1039 t_min_y = (voxel_min.y - ray_origin.y) * inv_dir_y;
1040 t_max_y = (voxel_max.y - ray_origin.y) * inv_dir_y;
1042 t_min_y = (voxel_max.y - ray_origin.y) * inv_dir_y;
1043 t_max_y = (voxel_min.y - ray_origin.y) * inv_dir_y;
1048 if (fabsf(ray_direction.z) < 1e-9f) {
1049 if (ray_origin.z < voxel_min.z || ray_origin.z > voxel_max.z) {
1055 float inv_dir_z = 1.0f / ray_direction.z;
1056 if (inv_dir_z >= 0) {
1057 t_min_z = (voxel_min.z - ray_origin.z) * inv_dir_z;
1058 t_max_z = (voxel_max.z - ray_origin.z) * inv_dir_z;
1060 t_min_z = (voxel_max.z - ray_origin.z) * inv_dir_z;
1061 t_max_z = (voxel_min.z - ray_origin.z) * inv_dir_z;
// Parametric entry/exit of the ray through this voxel.
1066 float t_enter = fmaxf(fmaxf(t_min_x, t_min_y), t_min_z);
1067 float t_exit = fminf(fminf(t_max_x, t_max_y), t_max_z);
// Accept only forward, numerically meaningful crossings.
1071 if (t_enter < t_exit && t_exit > 1e-5f && (t_exit - t_enter) > 1e-4f) {
1074 float path_length = t_exit - t_enter;
// Fallback length — presumably for rays starting inside the voxel
// (guarding condition elided); confirm against the full source.
1078 path_length = t_exit;
// Epsilon and voxel-diagonal-fraction filters drop grazing crossings.
1083 if (path_length < 1e-4f) {
1089 float voxel_diag = sqrtf(voxel_size.x * voxel_size.x + voxel_size.y * voxel_size.y + voxel_size.z * voxel_size.z);
1090 if (path_length < voxel_diag * 0.1f) {
// Row-major voxel index: i outermost, k innermost.
1095 int voxel_idx = i * grid_divisions.y * grid_divisions.z + j * grid_divisions.z + k;
// Atomics: many rays may accumulate into the same voxel concurrently.
1098 atomicAdd(&d_voxel_ray_counts[voxel_idx], 1);
1099 atomicAdd(&d_voxel_path_lengths[voxel_idx], path_length);
// No scene geometry: every crossing counts as transmitted.
1102 if (primitive_count == 0) {
1104 atomicAdd(&d_voxel_transmitted[voxel_idx], 1);
1111 float3 voxel_center = make_float3((voxel_min.x + voxel_max.x) * 0.5f, (voxel_min.y + voxel_max.y) * 0.5f, (voxel_min.z + voxel_max.z) * 0.5f);
1114 float ray_distance = sqrtf(ray_origin.x * ray_origin.x + ray_origin.y * ray_origin.y + ray_origin.z * ray_origin.z);
// NOTE(review): `hit_geometry` is derived from ray_idx % 4 and the origin's
// distance from the world origin — not from actual geometry. This looks
// like a placeholder heuristic; confirm intent before relying on these
// hit classifications.
1117 bool hit_geometry = (ray_idx % 4 == 0) && (ray_distance < 10.0f);
// Classify the crossing by its entry distance (thresholds 0.5 / 2.0).
1121 if (t_enter < 0.5f) {
1122 atomicAdd(&d_voxel_hit_inside[voxel_idx], 1);
1123 atomicAdd(&d_voxel_hit_after[voxel_idx], 1);
1124 }
else if (t_enter < 2.0f) {
1125 atomicAdd(&d_voxel_hit_after[voxel_idx], 1);
1127 atomicAdd(&d_voxel_hit_before[voxel_idx], 1);
1130 atomicAdd(&d_voxel_transmitted[voxel_idx], 1);
// Host-side driver for the regular-grid voxel/ray statistics kernel.
//
// Uploads the ray set, zero-initializes the per-voxel accumulators (they are
// only ever atomicAdd-ed on the device), launches intersectRegularGridKernel
// with one thread per ray, and copies the per-voxel counters and summed path
// lengths back into the caller's arrays.
//
// h_ray_origins / h_ray_directions : flat xyz triples (3 * num_rays floats).
// grid_center_* / grid_size_*      : world-space grid placement and extent.
// grid_divisions_*                 : voxel counts along each axis.
// h_voxel_*                        : outputs, one entry per voxel
//                                    (grid_divisions_x * y * z, row-major).
void launchVoxelRayPathLengths(int num_rays, float *h_ray_origins, float *h_ray_directions, float grid_center_x, float grid_center_y, float grid_center_z, float grid_size_x, float grid_size_y, float grid_size_z, int grid_divisions_x,
                               int grid_divisions_y, int grid_divisions_z, int primitive_count, int *h_voxel_ray_counts, float *h_voxel_path_lengths, int *h_voxel_transmitted, int *h_voxel_hit_before, int *h_voxel_hit_after,
                               int *h_voxel_hit_inside) {
    // Empty batch: nothing to accumulate, and a zero-block launch is invalid.
    if (num_rays <= 0)
        return;

    // Device buffers; null so the shared cleanup can free unconditionally.
    float3 *d_ray_origins = nullptr, *d_ray_directions = nullptr;
    int *d_voxel_ray_counts = nullptr, *d_voxel_transmitted = nullptr;
    int *d_voxel_hit_before = nullptr, *d_voxel_hit_after = nullptr, *d_voxel_hit_inside = nullptr;
    float *d_voxel_path_lengths = nullptr;

    // Widen to size_t BEFORE multiplying: the previous int arithmetic could
    // overflow for large ray batches or fine grids.
    size_t ray_data_size = static_cast<size_t>(num_rays) * 3 * sizeof(float);
    size_t voxel_count = static_cast<size_t>(grid_divisions_x) * grid_divisions_y * grid_divisions_z;
    size_t voxel_int_size = voxel_count * sizeof(int);
    size_t voxel_float_size = voxel_count * sizeof(float);

    // Single cleanup path shared by the error branch and the normal exit.
    auto freeAll = [&]() {
        cudaFree(d_ray_origins);
        cudaFree(d_ray_directions);
        cudaFree(d_voxel_ray_counts);
        cudaFree(d_voxel_transmitted);
        cudaFree(d_voxel_hit_before);
        cudaFree(d_voxel_hit_after);
        cudaFree(d_voxel_hit_inside);
        cudaFree(d_voxel_path_lengths);
    };

    cudaMalloc(&d_ray_origins, ray_data_size);
    cudaMalloc(&d_ray_directions, ray_data_size);
    cudaMalloc(&d_voxel_ray_counts, voxel_int_size);
    cudaMalloc(&d_voxel_transmitted, voxel_int_size);
    cudaMalloc(&d_voxel_hit_before, voxel_int_size);
    cudaMalloc(&d_voxel_hit_after, voxel_int_size);
    cudaMalloc(&d_voxel_hit_inside, voxel_int_size);
    cudaMalloc(&d_voxel_path_lengths, voxel_float_size);

    // Upload rays; zero the accumulators (kernel only ever atomicAdds them).
    cudaMemcpy(d_ray_origins, h_ray_origins, ray_data_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_ray_directions, h_ray_directions, ray_data_size, cudaMemcpyHostToDevice);
    cudaMemset(d_voxel_ray_counts, 0, voxel_int_size);
    cudaMemset(d_voxel_transmitted, 0, voxel_int_size);
    cudaMemset(d_voxel_hit_before, 0, voxel_int_size);
    cudaMemset(d_voxel_hit_after, 0, voxel_int_size);
    cudaMemset(d_voxel_hit_inside, 0, voxel_int_size);
    cudaMemset(d_voxel_path_lengths, 0, voxel_float_size);

    // One thread per ray.
    dim3 block_size(256);
    dim3 grid_size((num_rays + block_size.x - 1) / block_size.x);

    float3 grid_center = make_float3(grid_center_x, grid_center_y, grid_center_z);
    float3 grid_size_vec = make_float3(grid_size_x, grid_size_y, grid_size_z);
    int3 grid_divisions_vec = make_int3(grid_divisions_x, grid_divisions_y, grid_divisions_z);

    intersectRegularGridKernel<<<grid_size, block_size>>>(num_rays, d_ray_origins, d_ray_directions, grid_center, grid_size_vec, grid_divisions_vec, primitive_count, d_voxel_ray_counts, d_voxel_path_lengths, d_voxel_transmitted, d_voxel_hit_before,
                                                          d_voxel_hit_after, d_voxel_hit_inside);

    // Synchronize so cudaGetLastError() also surfaces execution errors.
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA voxel kernel launch error: %s\n", cudaGetErrorString(err));
        freeAll();
        // BUGFIX: return here — the code previously fell through and copied
        // results out of already-freed device buffers.
        return;
    }

    // Copy per-voxel results back to the caller.
    cudaMemcpy(h_voxel_ray_counts, d_voxel_ray_counts, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_path_lengths, d_voxel_path_lengths, voxel_float_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_transmitted, d_voxel_transmitted, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_hit_before, d_voxel_hit_before, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_hit_after, d_voxel_hit_after, voxel_int_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_voxel_hit_inside, d_voxel_hit_inside, voxel_int_size, cudaMemcpyDeviceToHost);

    freeAll();
}